Root/
<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Detect the language of a text.
 *
 * <code>
 * list($lang, $confid) = Pluf_Text_Lang::detect($string);
 * </code>
 */
class Pluf_Text_Lang
{
    /**
     * Penalty added for an n-gram missing from the other profile; also
     * the cap applied to the rank offset of an n-gram present in both.
     */
    const MAX_PENALTY = 3;

    /**
     * Given a string, returns the language.
     *
     * Algorithm by Cavnar et al. 94.
     *
     * FIXME: Only the cleaning step is implemented. The method does
     * not compare the text against any language profile and currently
     * returns nothing (null), not the documented array.
     *
     * @param string
     * @param bool Is the string clean (false)
     * @return array Language, Confidence
     */
    public static function detect($string, $is_clean=false)
    {
        if (!$is_clean) {
            $string = Pluf_Text::cleanString($string);
        }
    }

    /**
     * Returns the sorted n-grams of a document.
     *
     * The n-grams of rank 2 up to $n of every token are weighted by
     * the value associated with the token, then returned from the
     * highest to the lowest total weight. N-grams sharing the same
     * weight are ordered by plain string comparison so that the
     * result is deterministic.
     *
     * FIXME: We should detect the proportion of thai/chinese/japanese
     * characters and switch to unigram instead of n-grams if the
     * proportion is greater than 50%.
     *
     * @param string The clean document.
     * @param int Maximum size of the n grams (3)
     * @return array N-Grams
     */
    public static function docNgrams($string, $n=3)
    {
        // do not remove the accents
        // NOTE(review): tokenize() is presumed to return a
        // word => occurrences map; confirm against Pluf_Text.
        $words = Pluf_Text::tokenize($string, false);
        $counts = array();
        for ($i=2; $i<=$n; $i++) {
            foreach ($words as $word => $occ) {
                // A numeric word comes back from the array key as an
                // integer, cast it back before splitting it.
                foreach (self::makeNgrams((string) $word, $i) as $ngram) {
                    if (isset($counts[$ngram])) {
                        $counts[$ngram] += $occ;
                    } else {
                        $counts[$ngram] = $occ;
                    }
                }
            }
        }
        // split the ngrams by occurrence.
        $by_occ = array();
        foreach ($counts as $ngram => $occ) {
            // Same integer key conversion as above: "12" is stored as
            // 12, the cast keeps the result a list of strings.
            $by_occ[$occ][] = (string) $ngram;
        }
        krsort($by_occ);
        $res = array();
        foreach ($by_occ as $list) {
            sort($list, SORT_STRING);
            foreach ($list as $ngram) {
                $res[] = $ngram;
            }
        }
        return $res;
    }

    /**
     * Returns the n-grams of rank n of the word.
     *
     * The word is padded with one '_' on each side before being
     * split, the n-gram made only of the padding ('__') is dropped.
     *
     * @param string Word.
     * @param int Size of the n-grams (3)
     * @return array N-grams
     */
    public static function makeNgrams($word, $n=3)
    {
        if ($n < 1) {
            return array();
        }
        // array_merge and not the + operator: the union would keep
        // the '_' at index 0 and silently drop the first character of
        // the word.
        // NOTE(review): stringToChars() is presumed to return the
        // list of the characters of the word; confirm against
        // Pluf_Text.
        $chars = array_merge(array('_'),
                             Pluf_Text::stringToChars($word),
                             array('_'));
        $last = count($chars) - $n;
        $out = array();
        for ($i=0; $i<=$last; $i++) {
            $ngram = implode('', array_slice($chars, $i, $n));
            if ($ngram !== '__') {
                $out[] = $ngram;
            }
        }
        return $out;
    }

    /**
     * Return the distance between two document ngrams.
     *
     * Each n-gram of the shortest list adds the difference between
     * its position in both lists, capped to MAX_PENALTY, or
     * MAX_PENALTY if it is not in the other list. Each extra n-gram
     * of the longest list adds MAX_PENALTY.
     *
     * @param array n-gram
     * @param array n-gram
     * @return integer distance
     */
    public static function ngramDistance($n1, $n2)
    {
        // Work on lists of strings to be able to do a strict search,
        // a loose one matches '00' with '0' or '1e1' with '10'.
        $n1 = array_map('strval', array_values($n1));
        $n2 = array_map('strval', array_values($n2));
        if (count($n1) > count($n2)) {
            // $n1 must be the shortest of the two.
            list($n1, $n2) = array($n2, $n1);
        }
        $n_n1 = count($n1);
        $n_n2 = count($n2);
        $res = 0;
        for ($i=0; $i<$n_n1; $i++) {
            $index = array_search($n1[$i], $n2, true);
            if (false !== $index) {
                $res += min(self::MAX_PENALTY, abs($index - $i));
            } else {
                $res += self::MAX_PENALTY;
            }
        }
        $res += ($n_n2 - $n_n1) * self::MAX_PENALTY;
        return $res;
    }
}
Source at commit 196380cf92e8 created 10 years 2 months ago. By Nathan Adams, Revmoing cache for viewing source (temporarily) |
---|