srchub-old

srchub-old Mercurial Source Tree


Root/pluf/src/Pluf/Text/Stemmer/Porter.php

<?php
/* -*- tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
# ***** BEGIN LICENSE BLOCK *****
# This file is part of Plume Framework, a simple PHP Application Framework.
# Copyright (C) 2001-2007 Loic d'Anterroches and contributors.
#
# Plume Framework is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Plume Framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# ***** END LICENSE BLOCK ***** */

/**
 * Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
 *
 * All rights reserved.
 *
 * This script is free software.
 *  
 * Downloaded from: http://tartarus.org/~martin/PorterStemmer/php.txt
 * Modified to be multibyte compatible.
 */

/**
 * PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
 * were borrowed from the (broken) implementation by Jon Abernathy.
 *
 * Usage:
 *
 *  $stem = Pluf_Text_Stemmer_Porter::stem($word);
 *
 * How easy is that?
 */

class Pluf_Text_Stemmer_Porter
{
    /**
     * Regex fragment matching a single consonant.
     *
     * A "y" counts as a consonant when it follows a vowel or starts the
     * string. Only lowercase ASCII letters are listed, so upper-case
     * input matches neither the consonant nor the vowel class.
     *
     * @var string
     */
    private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';


    /**
     * Regex fragment matching a single vowel.
     *
     * A "y" counts as a vowel when it is not preceded by a vowel.
     *
     * @var string
     */
    private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';


    /**
     * Stems a word by running it through steps 1 to 5 in order.
     *
     * Words of two bytes or fewer are returned unchanged.
     *
     * @param  string $word Word to stem
     * @return string       Stemmed word
     */
    public static function stem($word)
    {
        if (strlen($word) <= 2) {
            return $word;
        }

        $word = self::step1ab($word);
        $word = self::step1c($word);
        $word = self::step2($word);
        $word = self::step3($word);
        $word = self::step4($word);
        $word = self::step5($word);

        return $word;
    }


    /**
     * Step 1a and 1b: plural "s" endings, then "eed", "ed" and "ing".
     *
     * @param  string $word Word to stem
     * @return string       Word after step 1a/1b
     */
    private static function step1ab($word)
    {
        // Part a: only the first matching ending is applied, because
        // replace() returns true as soon as the ending matches.
        if (substr($word, -1) === 's') {
            self::replace($word, 'sses', 'ss')
                || self::replace($word, 'ies', 'i')
                || self::replace($word, 'ss', 'ss')
                || self::replace($word, 's', '');
        }

        // Part b: a word ending in "eed" is handled by the first rule alone
        // (replace() returns true even when the m() condition blocks the
        // rewrite), so the "ed"/"ing" rules below are skipped for it.
        if (substr($word, -2, 1) !== 'e' || !self::replace($word, 'eed', 'ee', 0)) {
            $v = self::$regex_vowel;

            // "ing" / "ed" are removed only when the remaining stem
            // contains at least one vowel.
            $stripped = (preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', ''))
                || (preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', ''));

            if ($stripped) {
                // Tidy up the stem left behind by the removal.
                if (!self::replace($word, 'at', 'ate')
                    && !self::replace($word, 'bl', 'ble')
                    && !self::replace($word, 'iz', 'ize')) {

                    $tail = substr($word, -2);

                    if (self::doubleConsonant($word)
                        && $tail !== 'll'
                        && $tail !== 'ss'
                        && $tail !== 'zz') {
                        // Drop one letter of a doubled consonant ending.
                        $word = substr($word, 0, -1);

                    } elseif (self::m($word) === 1 && self::cvc($word)) {
                        // Short stem ending consonant-vowel-consonant: add "e".
                        $word .= 'e';
                    }
                }
            }
        }

        return $word;
    }


    /**
     * Step 1c: turns a final "y" into "i" when the rest of the word
     * contains a vowel.
     *
     * @param  string $word Word to stem
     * @return string       Word after step 1c
     */
    private static function step1c($word)
    {
        $v = self::$regex_vowel;

        if (substr($word, -1) === 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
            self::replace($word, 'y', 'i');
        }

        return $word;
    }


    /**
     * Step 2: rewrites double suffixes (e.g. "ational" to "ate") when the
     * preceding stem has m() > 0.
     *
     * The switch on the penultimate letter only narrows down which
     * endings need to be tried.
     *
     * @param  string $word Word to stem
     * @return string       Word after step 2
     */
    private static function step2($word)
    {
        switch (substr($word, -2, 1)) {
        case 'a':
            self::replace($word, 'ational', 'ate', 0)
                || self::replace($word, 'tional', 'tion', 0);
            break;

        case 'c':
            self::replace($word, 'enci', 'ence', 0)
                || self::replace($word, 'anci', 'ance', 0);
            break;

        case 'e':
            self::replace($word, 'izer', 'ize', 0);
            break;

        case 'g':
            self::replace($word, 'logi', 'log', 0);
            break;

        case 'l':
            self::replace($word, 'entli', 'ent', 0)
                || self::replace($word, 'ousli', 'ous', 0)
                || self::replace($word, 'alli', 'al', 0)
                || self::replace($word, 'bli', 'ble', 0)
                || self::replace($word, 'eli', 'e', 0);
            break;

        case 'o':
            self::replace($word, 'ization', 'ize', 0)
                || self::replace($word, 'ation', 'ate', 0)
                || self::replace($word, 'ator', 'ate', 0);
            break;

        case 's':
            self::replace($word, 'iveness', 'ive', 0)
                || self::replace($word, 'fulness', 'ful', 0)
                || self::replace($word, 'ousness', 'ous', 0)
                || self::replace($word, 'alism', 'al', 0);
            break;

        case 't':
            self::replace($word, 'biliti', 'ble', 0)
                || self::replace($word, 'aliti', 'al', 0)
                || self::replace($word, 'iviti', 'ive', 0);
            break;
        }

        return $word;
    }


    /**
     * Step 3: shortens or removes endings such as "icate", "ful" and
     * "ness" when the preceding stem has m() > 0.
     *
     * @param  string $word Word to stem
     * @return string       Word after step 3
     */
    private static function step3($word)
    {
        switch (substr($word, -2, 1)) {
        case 'a':
            self::replace($word, 'ical', 'ic', 0);
            break;

        case 's':
            self::replace($word, 'ness', '', 0);
            break;

        case 't':
            self::replace($word, 'icate', 'ic', 0)
                || self::replace($word, 'iciti', 'ic', 0);
            break;

        case 'u':
            self::replace($word, 'ful', '', 0);
            break;

        case 'v':
            self::replace($word, 'ative', '', 0);
            break;

        case 'z':
            self::replace($word, 'alize', 'al', 0);
            break;
        }

        return $word;
    }


    /**
     * Step 4: removes endings such as "ance", "ment" and "ize" when the
     * preceding stem has m() > 1.
     *
     * @param  string $word Word to stem
     * @return string       Word after step 4
     */
    private static function step4($word)
    {
        switch (substr($word, -2, 1)) {
        case 'a':
            self::replace($word, 'al', '', 1);
            break;

        case 'c':
            self::replace($word, 'ance', '', 1)
                || self::replace($word, 'ence', '', 1);
            break;

        case 'e':
            self::replace($word, 'er', '', 1);
            break;

        case 'i':
            self::replace($word, 'ic', '', 1);
            break;

        case 'l':
            self::replace($word, 'able', '', 1)
                || self::replace($word, 'ible', '', 1);
            break;

        case 'n':
            self::replace($word, 'ant', '', 1)
                || self::replace($word, 'ement', '', 1)
                || self::replace($word, 'ment', '', 1)
                || self::replace($word, 'ent', '', 1);
            break;

        case 'o':
            // "ion" is only removed when it follows "s" or "t".
            $tail = substr($word, -4);
            if ($tail === 'tion' || $tail === 'sion') {
                self::replace($word, 'ion', '', 1);
            } else {
                self::replace($word, 'ou', '', 1);
            }
            break;

        case 's':
            self::replace($word, 'ism', '', 1);
            break;

        case 't':
            self::replace($word, 'ate', '', 1)
                || self::replace($word, 'iti', '', 1);
            break;

        case 'u':
            self::replace($word, 'ous', '', 1);
            break;

        case 'v':
            self::replace($word, 'ive', '', 1);
            break;

        case 'z':
            self::replace($word, 'ize', '', 1);
            break;
        }

        return $word;
    }


    /**
     * Step 5: drops a final "e" and reduces a final "ll" to "l" on
     * sufficiently long stems.
     *
     * @param  string $word Word to stem
     * @return string       Word after step 5
     */
    private static function step5($word)
    {
        // Part a: remove a trailing "e" when the stem has m() > 1, or
        // m() == 1 and the stem does not end consonant-vowel-consonant.
        if (substr($word, -1) === 'e') {
            $stem    = substr($word, 0, -1);
            $measure = self::m($stem);

            if ($measure > 1 || ($measure === 1 && !self::cvc($stem))) {
                self::replace($word, 'e', '');
            }
        }

        // Part b: "ll" becomes "l" when m() > 1.
        if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
            $word = substr($word, 0, -1);
        }

        return $word;
    }


    /**
     * Replaces the ending $check of $str with $repl. If $m is given, the
     * part of $str before $check must have an m() count strictly greater
     * than $m for the replacement to happen.
     *
     * @param  string   $str   String to check; modified in place
     * @param  string   $check Ending to check for
     * @param  string   $repl  Replacement string
     * @param  int|null $m     Optional m() count the stem must exceed
     * @return bool            Whether the $check string was at the end
     *                         of the $str string. True does not necessarily
     *                         mean that it was replaced.
     */
    private static function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) !== $check) {
            return false;
        }

        $stem = substr($str, 0, $len);
        if ($m === null || self::m($stem) > $m) {
            $str = $stem . $repl;
        }

        return true;
    }


    /**
     * Measures the number of vowel-consonant sequences in $str. If c is
     * a consonant sequence and v a vowel sequence, and <..> indicates
     * arbitrary presence,
     *
     * <c><v>       gives 0
     * <c>vc<v>     gives 1
     * <c>vcvc<v>   gives 2
     * <c>vcvcvc<v> gives 3
     *
     * @param  string $str The string to return the m count for
     * @return int         The m count
     */
    private static function m($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        // Strip the optional leading consonant run and trailing vowel run,
        // then count the remaining vowel-run + consonant-run pairs.
        $str = preg_replace("#^$c+#", '', $str);
        $str = preg_replace("#$v+$#", '', $str);

        preg_match_all("#($v+$c+)#", $str, $matches);

        return count($matches[1]);
    }


    /**
     * Returns true/false as to whether the given string contains two
     * of the same consonant next to each other at the end of the string.
     *
     * @param  string $str String to check
     * @return bool        Result
     */
    private static function doubleConsonant($str)
    {
        $c = self::$regex_consonant;

        // Square-bracket offsets: the curly-brace form ($s{0}) was removed
        // in PHP 8.0 and makes the whole file fail to parse there.
        return preg_match('#' . $c . '{2}$#', $str, $matches)
            && $matches[0][0] === $matches[0][1];
    }


    /**
     * Checks for ending CVC sequence where second C is not W, X or Y
     *
     * @param  string $str String to check
     * @return bool        Result
     */
    private static function cvc($str)
    {
        $c = self::$regex_consonant;
        $v = self::$regex_vowel;

        return preg_match("#($c$v$c)$#", $str, $matches)
            && strlen($matches[1]) === 3
            && $matches[1][2] !== 'w'
            && $matches[1][2] !== 'x'
            && $matches[1][2] !== 'y';
    }
}

Source at commit 7d92c7ed8bd9 created 10 years 10 months ago.
By "Nathan Adams ", Adding some extension mapping to FileUtil

Archive Download this file

Branches

Tags

Page rendered in 1.10378s using 11 queries.