pluf2

pluf2 Git Source Tree


Root/src/Pluf/Text.php

<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Utility class to clean/manipulate strings. 
 */

class Pluf_Text
{
    /**
     * Given a string, cleaned from the not interesting characters,
     * returns an array with the words as index and the number of
     * times it was in the text as the value.
     *
     * @credits Tokenizer of DokuWiki to handle Thai and CJK words.
     *          http://www.splitbrain.org/projects/dokuwiki
     *
     * @param string Cleaned, lowercased and utf-8 encoded string.
     * @param bool Remove the accents (True)
     * @return array Word and number of occurences.
     */
    public static function tokenize($string, $remove_accents=True)
    {
        if ($remove_accents) {
            $string = self::removeAccents($string);
        }
        $asian1 = '[\x{0E00}-\x{0E7F}]'; // Thai
        $asian2 = '['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']';
        $asian3 = '['. // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?';
        $asian = '(?:'.$asian1.'|'.$asian2.'|'.$asian3.')';
        $words = array();
        // handle asian chars as single words.
        $asia = @preg_replace('/('.$asian.')/u',' \1 ',$string);
        if (!is_null($asia)) {
            //will not be called if regexp failure
            $string = $asia;
        }
        $arr = preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($arr as $w) {
            $w = trim($w);
            if (isset($words[$w])) {
                $words[$w]++;
            } else {
                $words[$w] = 1;
            }
        }
        return $words;
    }

    /**
     * Clean a string from the HTML and the unnecessary
     * punctuation. Convert the string to lowercase.
     *
     * @info Require mbstring extension.
     *
     * @param string String.
     * @return string Cleaned lowercase string.
     */
    public static function cleanString($string)
    {
        $string = html_entity_decode($string, ENT_QUOTES, 'utf-8');
        $string = str_replace('<?php', '', $string);
        $string = strip_tags($string);
        $string = strtr($string, "\r\n\t", '   ');
        $string = strtr($string, 
                        '.<>,;:(){}[]\\|*@!?^_=/\'~`%$#',
                        '                            ');
        return mb_strtolower($string, 'UTF-8');
    }

    /**
     * Remove the accentuated characters.
     *
     * Requires a string in lowercase, the removal is not perfect but
     * is better than nothing.
     *
     * @param string Lowercased string in utf-8.
     * @return string String with some of the accents removed.
     */
    public static function removeAccents($string)
    {
        $map = array(
                     'à'=>'a', 'ô'=>'o', 'ď'=>'d', 'ḟ'=>'f', 'ë'=>'e',
                     'š'=>'s', 'ơ'=>'o', 'ß'=>'ss', 'ă'=>'a', 'ř'=>'r', 
                     'ț'=>'t', 'ň'=>'n', 'ā'=>'a', 'ķ'=>'k', 'ŝ'=>'s', 
                     'ỳ'=>'y', 'ņ'=>'n', 'ĺ'=>'l', 'ħ'=>'h', 'ṗ'=>'p', 
                     'ó'=>'o', 'ú'=>'u', 'ě'=>'e', 'é'=>'e', 'ç'=>'c',
                     'ẁ'=>'w', 'ċ'=>'c', 'õ'=>'o', 'ṡ'=>'s', 'ø'=>'o', 
                     'ģ'=>'g', 'ŧ'=>'t', 'ș'=>'s', 'ė'=>'e', 'ĉ'=>'c',
                     'ś'=>'s', 'î'=>'i', 'ű'=>'u', 'ć'=>'c', 'ę'=>'e', 
                     'ŵ'=>'w', 'ṫ'=>'t', 'ū'=>'u', 'č'=>'c', 'ö'=>'oe', 
                     'è'=>'e', 'ŷ'=>'y', 'ą'=>'a', 'ł'=>'l', 'ų'=>'u', 
                     'ů'=>'u', 'ş'=>'s', 'ğ'=>'g', 'ļ'=>'l', 'ƒ'=>'f', 
                     'ž'=>'z', 'ẃ'=>'w', 'ḃ'=>'b', 'å'=>'a', 'ì'=>'i', 
                     'ï'=>'i', 'ḋ'=>'d', 'ť'=>'t', 'ŗ'=>'r', 'ä'=>'ae', 
                     'í'=>'i', 'ŕ'=>'r', 'ê'=>'e', 'ü'=>'ue', 'ò'=>'o',
                     'ē'=>'e', 'ñ'=>'n', 'ń'=>'n', 'ĥ'=>'h', 'ĝ'=>'g', 
                     'đ'=>'d', 'ĵ'=>'j', 'ÿ'=>'y', 'ũ'=>'u', 'ŭ'=>'u', 
                     'ư'=>'u', 'ţ'=>'t', 'ý'=>'y', 'ő'=>'o', 'â'=>'a', 
                     'ľ'=>'l', 'ẅ'=>'w', 'ż'=>'z', 'ī'=>'i', 'ã'=>'a', 
                     'ġ'=>'g', 'ṁ'=>'m', 'ō'=>'o', 'ĩ'=>'i', 'ù'=>'u', 
                     'į'=>'i', 'ź'=>'z', 'á'=>'a', 'û'=>'u', 'þ'=>'th', 
                     'ð'=>'dh', 'æ'=>'ae', 'µ'=>'u', 'ĕ'=>'e',
                     );
        return strtr($string, $map);
    }

    /**
     * Convert a string to a list of characters.
     *
     * @param string utf-8 encoded string.
     * @return array Characters.
     */
    public static function stringToChars($string)
    {
        $chars = array();
        $strlen = mb_strlen($string, 'UTF-8');
        for ($i=0;$i<$strlen;$i++) {
            $chars[] = mb_substr($string,$i, 1, 'UTF-8');
        }
        return $chars;
    }

    /**
     * Prevent a string to be all uppercase. 
     *
     * If more than 50% of the words in the string are uppercases and
     * if the string contains more than one word, the string is
     * converted using the mb_convert_case.
     *
     * @see http://www.php.net/mb_convert_case
     *
     * @param string String to test.
     * @param int Mode to convert the string (MB_CASE_TITLE)
     * @return string Cleaned string.
     */
    public static function preventUpperCase($string, $mode=MB_CASE_TITLE)
    {
        $elts = mb_split(' ', $string);
        $n_elts = count($elts);
        if ($n_elts > 1) {
            $tot = 0;
            foreach ($elts as $elt) {
                if ($elt == '') {
                    $n_elts--;
                    continue;
                }
                if ($elt == mb_strtoupper($elt, 'UTF-8')) {
                    $tot++;
                }
            }
            if ( (float) $tot / (float) $n_elts >= 0.5) {
                return mb_convert_case(mb_strtolower($string, 'UTF-8'), 
                                       $mode, 'UTF-8');
            }
        }
        return $string;
    }

    /**
     * Simple uppercase prevention.
     *
     * Contrary to self::preventUpperCase, this method will also
     * prevent a single word to be uppercase.
     *
     * @param string String possibly in uppercase.
     * @param int Mode to convert the string (MB_CASE_TITLE)
     * @return string Mode cased if all uppercase in input.
     */
    public static function simplePreventUpperCase($string, $mode=MB_CASE_TITLE)
    {
        if ($string == mb_strtoupper($string)) {
            return mb_convert_case(mb_strtolower($string), $mode, 'UTF-8');
        }
        return $string;
    }
}

Archive Download this file

Branches

Number of commits:
Page rendered in 0.08846s using 11 queries.