pluf/src/Pluf/Text/Lang.php - srchub Git Source Tree - The official repository and issue tracking for srchub.

Root/pluf/src/Pluf/Text/Lang.php

<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Detect the language of a text.
 *
 * <code>
 * list($lang, $confid) = Pluf_Text_Lang::detect($string);
 * </code>
 */
class Pluf_Text_Lang
{
    /**
     * Given a string, returns the language.
     *
     * Algorithm by Cavnar et al. 94.
     *
     * NOTE: the detection itself is not implemented yet. The method
     * currently only passes the input through Pluf_Text::cleanString()
     * (unless told the string is already clean) and then falls off
     * the end, so callers receive null instead of the documented
     * array.
     *
     * @param string $string Text to analyse.
     * @param bool $is_clean Is the string clean (false)
     * @return array Language, Confidence (intended result; nothing is
     *               returned at the moment)
     */
    public static function detect($string, $is_clean=false)
    {
        if (!$is_clean) {
            $string = Pluf_Text::cleanString($string);
        }
        // TODO: build the profile of $string with docNgrams() and rank
        // it against reference language profiles with ngramDistance().
    }

    /**
     * Returns the sorted n-grams of a document.
     *
     * N-grams of size 2 up to $n are generated for every token, their
     * weights are summed, and the n-grams are returned ordered by
     * decreasing weight. N-grams with the same weight are ordered as
     * plain strings so the result is deterministic.
     *
     * FIXME: We should detect the proportion of thai/chinese/japanese
     * characters and switch to unigram instead of n-grams if the
     * proportion is greater than 50%.
     *
     * @param string $string The clean document.
     * @param int $n Maximum size of the n grams (3)
     * @return array N-Grams (list of strings, most frequent first)
     */
    public static function docNgrams($string, $n=3)
    {
        // do not remove the accents
        // The result is iterated as word => weight; presumably the
        // weight is the number of occurrences of the word.
        $words = Pluf_Text::tokenize($string, false);

        // Sum the weight of every n-gram over all the words and sizes.
        $counts = array();
        for ($size=2; $size<=$n; $size++) {
            foreach ($words as $word=>$occ) {
                foreach (self::makeNgrams($word, $size) as $ngram) {
                    if (isset($counts[$ngram])) {
                        $counts[$ngram] += $occ;
                    } else {
                        $counts[$ngram] = $occ;
                    }
                }
            }
        }

        // Split the n-grams by occurrence.
        $by_occ = array();
        foreach ($counts as $ngram=>$occ) {
            // PHP converts numeric-looking keys such as "12" to
            // integers; cast back so only strings are returned.
            $by_occ[$occ][] = (string) $ngram;
        }
        krsort($by_occ);

        $res = array();
        foreach ($by_occ as $list) {
            // Plain string ordering: never compare "10" and "9" as numbers.
            sort($list, SORT_STRING);
            foreach ($list as $ngram) {
                $res[] = $ngram;
            }
        }
        return $res;
    }

    /**
     * Returns the n-grams of rank n of the word.
     *
     * The word is padded with one '_' on each side before the
     * n-grams are extracted with a sliding window of $n characters.
     *
     * @param string $word Word.
     * @param int $n Size of the n-grams (3)
     * @return array N-grams
     */
    public static function makeNgrams($word, $n=3)
    {
        if ($n < 1) {
            return array();
        }
        // array_merge() and not "+": the union operator keeps the
        // left-hand value when a key already exists, so with a
        // zero-indexed character list the leading '_' would replace
        // the first character of the word instead of preceding it.
        $chars = array_merge(
            array('_'),
            Pluf_Text::stringToChars($word),
            array('_')
        );
        $last_start = count($chars) - $n;
        $out = array();
        for ($i=0; $i<=$last_start; $i++) {
            $t = implode('', array_slice($chars, $i, $n));
            // '__' is made of padding only (empty word), skip it.
            if ($t !== '__') {
                $out[] = $t;
            }
        }
        return $out;
    }

    /**
     * Return the distance between two document ngrams.
     *
     * Each n-gram of the shorter list is looked up in the longer
     * one. A match costs the difference between the two positions,
     * capped at 3. A missing n-gram costs 3, and so does every extra
     * entry of the longer list. The result does not depend on the
     * order of the arguments.
     *
     * @param array $n1 n-gram list (strings, indexed from 0)
     * @param array $n2 n-gram list (strings, indexed from 0)
     * @return integer distance
     */
    public static function ngramDistance($n1, $n2)
    {
        // Always walk the shorter list and search in the longer one.
        if (count($n1) > count($n2)) {
            $shorter = $n2;
            $longer = $n1;
        } else {
            $shorter = $n1;
            $longer = $n2;
        }
        $n_short = count($shorter);
        $n_long = count($longer);

        // First position of each n-gram of the longer list. A keyed
        // lookup matches exactly, where a loose array_search() would
        // consider "1e1" and "10" equal, and avoids rescanning the
        // list for every n-gram.
        $positions = array();
        foreach ($longer as $index=>$ngram) {
            if (!isset($positions[$ngram])) {
                $positions[$ngram] = $index;
            }
        }

        $res = 0;
        for ($i=0; $i<$n_short; $i++) {
            $ngram = $shorter[$i];
            if (isset($positions[$ngram])) {
                $offset = abs($positions[$ngram] - $i);
                $res += ($offset > 3) ? 3 : $offset;
            } else {
                $res += 3;
            }
        }
        $res += ($n_long - $n_short) * 3;
        return $res;
    }
}
Download this file
srchub

srchub Git Source Tree

Root/pluf/src/Pluf/Text/Lang.php

Branches