srchub-old

srchub-old Mercurial Source Tree


Root/pluf/src/Pluf/Text/Lang.php

<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Detect the language of a text.
 *
 * A document is reduced to the list of its n-grams ranked by
 * decreasing frequency (docNgrams()) and two such ranked lists are
 * compared with ngramDistance().
 *
 * Intended usage:
 *
 * <code>
 * list($lang, $confid) = Pluf_Text_Lang::detect($string);
 * </code>
 *
 * NOTE: detect() is still a stub, see its documentation.
 */
class Pluf_Text_Lang
{
    /**
     * Given a string, returns the language.
     *
     * Algorithm by Cavnar et al. 94.
     *
     * FIXME: Only the cleaning step is implemented. No n-gram profile
     * is built or compared, so the method currently returns nothing
     * (null) instead of the documented array.
     *
     * @param string Text to analyse.
     * @param bool Is the string clean (false)
     * @return array Language, Confidence (not implemented, see FIXME)
     */
    public static function detect($string, $is_clean=false)
    {
        if (!$is_clean) {
            $string = Pluf_Text::cleanString($string);
        }
        // FIXME: the detection itself is missing; nothing is returned.
    }

    /**
     * Returns the sorted n-grams of a document.
     *
     * All the n-grams of size 2 up to $n are generated for each word
     * (unigrams are not generated) and their occurrences are summed
     * over the document. The result is ordered by decreasing number
     * of occurrences; n-grams with the same number of occurrences
     * are ordered by byte-wise string comparison.
     *
     * FIXME: We should detect the proportion of thai/chinese/japanese
     * characters and switch to unigram instead of n-grams if the
     * proportion is greater than 50%.
     *
     * @param string The clean document.
     * @param int Maximum size of the n grams (3)
     * @return array N-Grams (list of strings, most frequent first)
     */
    public static function docNgrams($string, $n=3)
    {
        // do not remove the accents
        // NOTE(review): the loops below read the result as a
        // word => occurrences array; confirm against
        // Pluf_Text::tokenize().
        $words = Pluf_Text::tokenize($string, false);
        // Total number of occurrences of each n-gram in the document.
        $counts = array();
        for ($i=2;$i<=$n;$i++) {
            foreach ($words as $word=>$occ) {
                // PHP converts numeric array keys such as "2007" to
                // integers, cast back to get the word as a string.
                foreach (self::makeNgrams((string) $word, $i) as $ngram) {
                    if (isset($counts[$ngram])) {
                        $counts[$ngram] += $occ;
                    } else {
                        $counts[$ngram] = $occ;
                    }
                }
            }
        }
        // split the ngrams by occurrence.
        $by_occ = array();
        foreach ($counts as $ngram=>$occ) {
            // Same key conversion as above: an n-gram like "12" comes
            // back as an integer, the result must only contain strings.
            $by_occ[$occ][] = (string) $ngram;
        }
        // Most frequent first.
        krsort($by_occ);
        $res = array();
        foreach ($by_occ as $list) {
            // SORT_STRING to have a well defined order: the default
            // flag compares numeric strings as numbers and the others
            // as strings, which is not a consistent ordering.
            sort($list, SORT_STRING);
            foreach ($list as $ngram) {
                $res[] = $ngram;
            }
        }
        return $res;
    }

    /**
     * Returns the n-grams of rank n of the word.
     *
     * The word is padded with one '_' on each side before the
     * extraction, for example the 3-grams of 'abc' are '_ab', 'abc'
     * and 'bc_'. The n-gram '__', generated by an empty word with
     * n=2, is dropped.
     *
     * @param string Word.
     * @param int Size of the n-grams (3)
     * @return array N-grams, in the order they appear in the word.
     *               Empty if the padded word is shorter than n or if
     *               n is smaller than 1.
     */
    public static function makeNgrams($word, $n=3)
    {
        if ($n < 1) {
            return array();
        }
        // array_merge() and not the "+" operator: the union keeps the
        // left hand value when a key is in both arrays, so with a
        // 0-indexed list of characters the first character of the
        // word was replaced by the leading '_'.
        $chars = array_merge(array('_'),
                             Pluf_Text::stringToChars($word),
                             array('_'));
        // Index of the last position where a full n-gram can start.
        $last = count($chars) - $n;
        $out = array();
        for ($i=0;$i<=$last;$i++) {
            $t = implode('', array_slice($chars, $i, $n));
            if ($t !== '__') {
                $out[] = $t;
            }
        }
        return $out;
    }

    /**
     * Return the distance between two document ngrams.
     *
     * Each n-gram of the shortest list adds the absolute difference
     * between its rank in the two lists, capped at 3. An n-gram not
     * found in the longest list adds 3. Each extra element of the
     * longest list adds 3 too. The distance is symmetric as the
     * lists are swapped when the first is the longest.
     *
     * The n-grams must be strings (or integers) as they are used as
     * array keys for the lookup.
     *
     * @param array n-gram (ranked list as returned by docNgrams())
     * @param array n-gram (ranked list as returned by docNgrams())
     * @return integer distance
     */
    public static function ngramDistance($n1, $n2)
    {
        // Cost of a missing n-gram and cap of the rank difference.
        $max = 3;
        // The rank is the position in the list, ensure 0..n-1 keys.
        $n1 = array_values($n1);
        $n2 = array_values($n2);
        $n_n1 = count($n1);
        $n_n2 = count($n2);
        if ($n_n1 > $n_n2) {
            // Always walk the shortest list.
            list($n_n1, $n_n2) = array($n_n2, $n_n1);
            list($n1, $n2) = array($n2, $n1);
        }
        // Rank of the first occurrence of each n-gram of the longest
        // list. The lookup by key is exact where a non strict
        // array_search() matches '1e1' with '10', and it avoids
        // scanning the full list for each n-gram.
        $ranks = array();
        foreach ($n2 as $index=>$ngram) {
            if (!isset($ranks[$ngram])) {
                $ranks[$ngram] = $index;
            }
        }
        $res = 0;
        for ($i=0;$i<$n_n1;$i++) {
            if (isset($ranks[$n1[$i]])) {
                $res += min($max, abs($ranks[$n1[$i]] - $i));
            } else {
                $res += $max;
            }
        }
        $res += ($n_n2 - $n_n1) * $max;
        return $res;
    }
}
Source at commit 7d92c7ed8bd9 created 10 years 10 months ago.
By "Nathan Adams ", Adding some extension mapping to FileUtil

Archive Download this file

Branches

Tags

Page rendered in 1.41774s using 11 queries.