WordCount.php 2.78 KB
Newer Older
Pham Huy committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
<?php
/**
 * Experimental PO/POT file word counter.
 * Word counts are approximate, including numbers and sprintf tokens.
 * Currently only used for source words in latin script, presumed to be in English.
 */
class Loco_gettext_WordCount implements Countable {

    /**
     * Pre-parsed PO/POT data, as passed to the constructor.
     * Iterated with foreach; each item is read by field name (e.g. "source").
     * @var Loco_gettext_Data
     */
    private $po;

    /**
     * Source Words: Cached count of "msgid" fields, presumed en_US.
     * Stays null until the first call to count().
     * @var int|null
     */
    private $sw;



    /**
     * Create counter for a pre-parsed PO/POT file.
     * @param Loco_gettext_Data $po messages whose words will be counted
     */
    public function __construct( Loco_gettext_Data $po ){
        $this->po = $po;
    }



    /**
     * Sum the word counts of one named field across every message in the file.
     * @internal
     * @param string $f field name to read from each message, e.g. "source"
     * @return int
     */
    private function countField( $f ){
        $n = 0;
        foreach( $this->po as $r ){
            $n += self::simpleCount( $r[$f] );
        }
        return $n;
    }



    /**
     * Default count function returns source words (msgid) in current file.
     * The result is computed once and cached for subsequent calls.
     * @return int
     */
    #[ReturnTypeWillChange]
    public function count(){
        $n = $this->sw;
        if( is_null($n) ){
            $n = $this->countField('source');
            $this->sw = $n;
        }
        return $n;
    }



    /**
     * Very simple word count, only suitable for latin characters, and biased toward English.
     * Empty and null values count as zero words.
     * @param string $str text to count words in; may contain HTML tags and entities
     * @return int
     */
    public static function simpleCount( $str ){
        $n = 0;
        // Cheap non-empty check. Square brackets are required here:
        // the "{0}" offset syntax is a parse error from PHP 8.0.
        if( isset($str[0]) ){

            // TODO should we strip PHP string formatting?
            // e.g. "Hello %s" currently counts as 2 words.
            // $str = preg_replace('/%(?:\\d+\\$)?(?:\'.|[-+0 ])*\\d*(?:\\.\\d+)?[suxXbcdeEfFgGo%]/', '', $str );

            // Strip HTML (but only if open and close tags detected, else "< foo" would be stripped to nothing)
            if( false !== strpos($str,'<') && false !== strpos($str,'>') ){
                $str = strip_tags($str);
            }

            // always html-decode, else escaped punctuation will be counted as words
            $str = html_entity_decode( $str, ENT_QUOTES, 'UTF-8');

            // Collapsing apostrophe'd words into single units:
            // Simplest way to handle ambiguity of "It's Tim's" (technically three words in English)
            $str = preg_replace('/(\\w+)\'(\\w)(\\W|$)/u', '\\1\\2\\3', $str );

            // Combining floating numbers into single units
            // e.g. "£1.50" and "€1,50" should be one word each
            $str = preg_replace('/\\d[\\d,\\.]+/', '0', $str );

            // count words by standard Unicode word boundaries
            $words = preg_split( '/\\W+/u', $str, -1, PREG_SPLIT_NO_EMPTY );
            $n += count($words);

            // TODO should we exclude some words (like numbers)?
        }
        return $n;
    }

}