RussianTextUtils.class.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2004-2007 by Sveta A. Smirnova                          *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU Lesser General Public License as        *
00007  *   published by the Free Software Foundation; either version 3 of the    *
00008  *   License, or (at your option) any later version.                       *
00009  *                                                                         *
00010  ***************************************************************************/
00011 /* $Id: RussianTextUtils.class.php 4687 2007-12-09 18:57:18Z voxus $ */
00012 
00016     final class RussianTextUtils extends StaticFactory
00017     {
00018         const MALE      = 0;
00019         const FEMALE    = 1;
00020         const NEUTRAL   = 2;
00021         
00022         private static $orderedSuffixes = array(
00023             self::MALE      => array('ый', 'ой', 'ий'),
00024             self::FEMALE    => array('ая', 'ья', null),
00025             self::NEUTRAL   => array('ое', 'ье', null)
00026         );
00027         
00028         private static $orderedDigits = array(
00029             'перв',
00030             'втор',
00031             'трет',
00032             'четвёрт',
00033             'пят',
00034             'шест',
00035             'седьм',
00036             'восьм',
00037             'девят',
00038             'десят'
00039         );
00040         
00041         private static $bytePrefixes = array(
00042             null, 'К', 'М', 'Г', 'Т', 'П'
00043         );
00044         
00045         private static $lettersMapping = array(
00046             'а' => 'a',        'б' => 'b',        'в' => 'v',        'г' => 'g',
00047             'д' => 'd',        'е' => 'e',        'ё' => 'jo',   'ж' => 'zh',
00048             'з' => 'z',        'и' => 'i',        'й' => 'jj',   'к' => 'k',
00049             'л' => 'l',        'м' => 'm',        'н' => 'n',        'о' => 'o',
00050             'п' => 'p',        'р' => 'r',        'с' => 's',        'т' => 't',
00051             'у' => 'u',        'ф' => 'f',        'х' => 'kh',   'ц' => 'c',
00052             'ч' => 'ch',   'ш' => 'sh',   'щ' => 'shh',  'ъ' => '\'',
00053             'ы' => 'y',        'ь' => '\'',   'э' => 'eh',   'ю' => 'ju',
00054             'я' => 'ja',
00055             
00056             'А' => 'A',        'Б' => 'B',        'В' => 'V',        'Г' => 'G',
00057             'Д' => 'D',    'Е' => 'E',        'Ё' => 'JO',   'Ж' => 'ZH',
00058             'З' => 'Z',        'И' => 'I',        'Й' => 'JJ',   'К' => 'K',
00059             'Л' => 'L',        'М' => 'M',        'Н' => 'N',        'О' => 'O',
00060             'П' => 'P',        'Р' => 'R',        'С' => 'S',        'Т' => 'T',
00061             'У' => 'U',        'Ф' => 'F',        'Х' => 'KH',   'Ц' => 'C',
00062             'Ч' => 'CH',   'Ш' => 'SH',   'Щ' => 'SHH',  'Ъ' => '\'',
00063             'Ы' => 'Y',        'Ь' => '\'',   'Э' => 'EH',   'Ю' => 'JU',
00064             'Я' => 'JA'
00065         );
00066         
00067         private static $flippedLettersMapping = array();
00068         
00069         private static $ambiguousDetection = false;
00070         
00080         public static function selectCaseForNumber($number, $cases)
00081         {
00082             if (($number % 10) == 1 && ($number % 100) != 11) {
00083                 
00084                 return $cases[0];
00085                 
00086             } elseif (
00087                 ($number % 10) > 1
00088                 && ($number % 10) < 5
00089                 && ($number < 10 || $number > 20)
00090             ) {
00091                 
00092                 return $cases[1];
00093                 
00094             } else {
00095                 return $cases[2];
00096             }
00097         }
00098         
00103         public static function getMonthInGenitiveCase($month)
00104         {
00105             static $months = array(
00106                 'января', 'февраля', 'марта', 'апреля',
00107                 'мая', 'июня', 'июля', 'августа', 'сентября',
00108                 'октября', 'ноября', 'декабря'
00109             );
00110             
00111             return $months[$month - 1];
00112         }
00113         
00114         public static function getMonthInSubjectiveCase($month)
00115         {
00116             static $months = array(
00117                 'январь', 'февраль', 'март', 'апрель',
00118                 'май', 'июнь', 'июль', 'август', 'сентябрь',
00119                 'октябрь', 'ноябрь', 'декабрь'
00120             );
00121             
00122             return $months[$month - 1];
00123         }
00124         
00125         public static function getDayOfWeek($day, $full = false)
00126         {
00127             static $weekDays = array(
00128                 'вс', 'пн', 'вт', 'ср',
00129                 'чт', 'пт', 'сб', 'вс'
00130             );
00131             
00132             static $weekDaysFull = array(
00133                 'Воскресенье', 'Понедельник', 'Вторник', 'Среда',
00134                 'Четверг', 'Пятница', 'Суббота', 'Воскресенье'
00135             );
00136             
00137             if ($full)
00138                 return $weekDaysFull[$day];
00139             else
00140                 return $weekDays[$day];
00141         }
00142         
00143         public static function getDateAsText(Timestamp $date, $todayWordNeed = true)
00144         {
00145             $dayStart = Timestamp::makeToday();
00146             $tomorrowDayStart = $dayStart->spawn('+1 day');
00147             
00148             if (
00149                 (Timestamp::compare($date, $dayStart) == 1)
00150                 && (Timestamp::compare($date, $tomorrowDayStart) == -1)
00151             )
00152                 return
00153                     (
00154                         $todayWordNeed === true
00155                             ? 'сегодня '
00156                             : null
00157                     )
00158                     .'в '
00159                     .date('G:i', $date->toStamp());
00160             
00161             $yesterdayStart = $dayStart->spawn('-1 day');
00162             
00163             if (
00164                 (Timestamp::compare($date, $yesterdayStart) == 1)
00165                 && (Timestamp::compare($date, $dayStart) == -1)
00166             )
00167                 return 'вчера в '.date('G:i', $date->toStamp());
00168             
00169             return date('j.m.Y в G:i', $date->toStamp());
00170         }
00171         
00172         public static function friendlyFileSize($size, $precision = 2)
00173         {
00174             if ($size < 1024)
00175                 return
00176                     $size.' '.self::selectCaseForNumber(
00177                         $size, array('байт', 'байта', 'байт')
00178                     );
00179             else
00180                 return TextUtils::friendlyFileSize(
00181                     $size, $precision, self::$bytePrefixes, true
00182                 ).'Б';
00183         }
00184         
00185         public static function getHumanDay(Date $date, $wordDayNeed = true)
00186         {
00187             $today      = Date::makeToday();
00188             $tomorrow   = $today->spawn('+1 day');
00189             
00190             if ($date->toDate() == $today->toDate() && $wordDayNeed == true)
00191                 return 'сегодня';
00192             elseif ($date->toDate() == $tomorrow->toDate() && $wordDayNeed == true)
00193                 return 'завтра';
00194             else
00195                 return
00196                     (int) $date->getDay()
00197                     . ' '
00198                     . RussianTextUtils::getMonthInGenitiveCase(
00199                         $date->getMonth()
00200                     );
00201         }
00202         
00203         public static function toTranslit($sourceString)
00204         {
00205             return strtr($sourceString, self::$lettersMapping);
00206         }
00207         
00208         public static function toRussian($sourceString)
00209         {
00210             if (!self::$flippedLettersMapping)
00211                 self::$flippedLettersMapping =
00212                     array_flip(self::$lettersMapping);
00213             
00214             return strtr($sourceString, self::$flippedLettersMapping);
00215         }
00216         
00221         public static function detectEncoding($data)
00222         {
00223             static $tables = array(
00224                 'KOI8-R' => array(), 'WINDOWS-1251' => array()
00225             );
00226             
00227             $table = CyrillicPairs::getTable();
00228             
00229             $score = array('UTF-8' => 0, 'KOI8-R' => 0, 'WINDOWS-1251' => 0);
00230             
00231             foreach (
00232                 preg_split('~[\.\,\-\s\:\;\?\!\'\"\(\)\d<>]~', $data) as $word
00233             ) {
00234                 for ($i = 0; $i < strlen($word) - 2; ++$i) {
00235                     foreach (array_keys($score) as $encoding) {
00236                         if ($encoding == 'UTF-8')
00237                             $pairLengthBytes = 4;
00238                         else
00239                             $pairLengthBytes = 2;
00240                         
00241                         if ($i + $pairLengthBytes >= strlen($word))
00242                             continue;
00243                         
00244                         $pair = substr($word, $i, $pairLengthBytes);
00245                         
00246                         $value = 0;
00247                         
00248                         if ($encoding === 'UTF-8') {
00249                             
00250                             if (isset($table[$pair]))
00251                                 $value = $table[$pair];
00252                             
00253                         } elseif (
00254                             isset($tables[$encoding][$pair])
00255                         ) {
00256                             $value = $tables[$encoding][$pair];
00257                         } else {
00258                             
00259                             $utf8Pair = mb_convert_encoding(
00260                                 $pair, 'UTF-8', $encoding
00261                             );
00262                             
00263                             if (isset($table[$utf8Pair])) {
00264                                 $value = $table[$utf8Pair];
00265                                 $tables[$encoding][$pair] = $table[$utf8Pair];
00266                             } else {
00267                                 $tables[$encoding][$pair] = false;
00268                             }
00269                         }
00270                         
00271                         $score[$encoding] += $value;
00272                     }
00273                     
00274                 }
00275             }
00276             
00277             $koi8Ratio =
00278                 $score['KOI8-R']
00279                 / ($score['WINDOWS-1251'] + $score['UTF-8'] + 1);
00280             
00281             $winRatio =
00282                 $score['WINDOWS-1251']
00283                 / ($score['KOI8-R'] + $score['UTF-8'] + 1);
00284             
00285             $utf8Ratio =
00286                 $score['UTF-8']
00287                 / ($score['KOI8-R'] + $score['WINDOWS-1251'] + 1);
00288             
00289             $minRatio = 1.5;
00290             $doubtRatio = 1;
00291             
00292             if (
00293                 ($koi8Ratio < $minRatio && $koi8Ratio > $doubtRatio)
00294                 || ($winRatio < $minRatio && $winRatio > $doubtRatio)
00295                 || ($utf8Ratio < $minRatio && $utf8Ratio > $doubtRatio)
00296             ) {
00297                 self::$ambiguousDetection = true;
00298             } else
00299                 self::$ambiguousDetection = false;
00300             
00301             if ($koi8Ratio > $winRatio && $koi8Ratio > $utf8Ratio)
00302                 return 'KOI8-R';
00303             
00304             if ($winRatio > $utf8Ratio)
00305                 return 'WINDOWS-1251';
00306             
00307             if ($winRatio + $koi8Ratio + $utf8Ratio > 0)
00308                 return 'UTF-8';
00309             
00310             return 'ASCII';
00311         }
00312         
00313         public static function isAmbiguousDetection()
00314         {
00315             return self::$ambiguousDetection;
00316         }
00317     }
00318 ?>

Generated on Sun Dec 9 21:56:24 2007 for onPHP by  doxygen 1.5.4