/**
 * Detect if a string is in a Russian charset.
 *
 * This should be used when the mb_string encoding detection is
 * failing. For example:
 *
 * <pre>
 * $encoding = mb_detect_encoding($string, mb_detect_order(), true);
 * if ($encoding == false) {
 *     $encoding = Pluf_Text_UTF8::detect_cyr_charset($string);
 * }
 * </pre>
 *
 * The detection is a scoring heuristic working on raw bytes: every
 * byte >= 128 adds 3 points to each charset for which it falls in
 * the range (mostly) holding lowercase Cyrillic letters and 1 point
 * to each charset for which it falls in the uppercase range, since
 * lowercase letters dominate ordinary text. The charset with the
 * highest total is returned.
 *
 * If the string contains no byte >= 128, all scores stay at 0 and
 * the returned name carries no information.
 *
 * @link http://forum.php.su/topic.php?forum=1&topic=1346
 *
 * @param string $str Raw byte string to inspect
 * @return string Possible Russian encoding, one of 'KOI8-R',
 *                'Windows-1251', 'CP-866', 'ISO-8859-5', 'MacCyrillic'
 */
public static function detect_cyr_charset($str)
{
    $charsets = array(
        'KOI8-R' => 0,
        'Windows-1251' => 0,
        'CP-866' => 0,
        'ISO-8859-5' => 0,
        'MacCyrillic' => 0
    );
    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
        // ord() always yields 0..255, so only the lower bound matters.
        $char = ord($str[$i]);
        // 7-bit ASCII is shared by all candidates: no information.
        if ($char < 128) {
            continue;
        }

        // CP866: lowercase is split in two runs, 160..175 and 224..241.
        if (($char > 159 && $char < 176) || ($char > 223 && $char < 242)) {
            $charsets['CP-866'] += 3;
        }
        if ($char > 127 && $char < 160) {
            $charsets['CP-866'] += 1;
        }

        // KOI8-R: lowercase 192..223 (223 is the hard sign), uppercase
        // 224..255. The boundary byte 223 must count as lowercase.
        if ($char > 191 && $char < 224) {
            $charsets['KOI8-R'] += 3;
        }
        if ($char > 223 && $char < 256) {
            $charsets['KOI8-R'] += 1;
        }

        // Windows-1251: lowercase 224..255, uppercase 192..223.
        if ($char > 223 && $char < 256) {
            $charsets['Windows-1251'] += 3;
        }
        if ($char > 191 && $char < 224) {
            $charsets['Windows-1251'] += 1;
        }

        // MacCyrillic: lowercase 222..254, uppercase 128..159.
        if ($char > 221 && $char < 255) {
            $charsets['MacCyrillic'] += 3;
        }
        if ($char > 127 && $char < 160) {
            $charsets['MacCyrillic'] += 1;
        }

        // ISO-8859-5: lowercase 208..239, uppercase 176..207.
        if ($char > 207 && $char < 240) {
            $charsets['ISO-8859-5'] += 3;
        }
        if ($char > 175 && $char < 208) {
            $charsets['ISO-8859-5'] += 1;
        }
    }
    // Highest score first; arsort() keeps the keys and rewinds the
    // internal pointer, so key() is the best-scoring charset name.
    arsort($charsets);
    return key($charsets);
}