RussianTextUtils.class.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2004-2007 by Sveta A. Smirnova                          *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU Lesser General Public License as        *
00007  *   published by the Free Software Foundation; either version 3 of the    *
00008  *   License, or (at your option) any later version.                       *
00009  *                                                                         *
00010  ***************************************************************************/
00011 
00015     final class RussianTextUtils extends StaticFactory
00016     {
/**
 * Grammatical gender identifiers; they double as the keys of
 * self::$orderedSuffixes.
 */
const MALE = 0;
const FEMALE = 1;
const NEUTRAL = 2;

/**
 * Word endings grouped by gender constant (some slots are null).
 *
 * NOTE(review): not referenced by any method of this class.
 */
private static $orderedSuffixes = array(
    self::MALE => array('ый', 'ой', 'ий'),
    self::FEMALE => array('ая', 'ья', null),
    self::NEUTRAL => array('ое', 'ье', null)
);

/**
 * Stems of the Russian ordinal numerals, first through tenth.
 *
 * NOTE(review): not referenced by any method of this class.
 */
private static $orderedDigits = array(
    'перв', 'втор', 'трет', 'четвёрт', 'пят',
    'шест', 'седьм', 'восьм', 'девят', 'десят'
);

/**
 * Unit prefixes handed to TextUtils::friendlyFileSize() by
 * self::friendlyFileSize(); the leading null is the "no prefix" slot.
 */
private static $bytePrefixes = array(
    null,
    'К',
    'М',
    'Г',
    'Т',
    'П'
);

/**
 * Cyrillic letter => Latin transliteration, lowercase first and
 * uppercase second.
 *
 * The element order matters: toRussian() flips this array, and
 * array_flip() resolves duplicated values (the apostrophe) by position.
 */
private static $lettersMapping = array(
    'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g',
    'д' => 'd', 'е' => 'e', 'ё' => 'jo', 'ж' => 'zh',
    'з' => 'z', 'и' => 'i', 'й' => 'jj', 'к' => 'k',
    'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
    'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't',
    'у' => 'u', 'ф' => 'f', 'х' => 'kh', 'ц' => 'c',
    'ч' => 'ch', 'ш' => 'sh', 'щ' => 'shh', 'ъ' => '\'',
    'ы' => 'y', 'ь' => '\'', 'э' => 'eh', 'ю' => 'ju',
    'я' => 'ja',

    'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G',
    'Д' => 'D', 'Е' => 'E', 'Ё' => 'JO', 'Ж' => 'ZH',
    'З' => 'Z', 'И' => 'I', 'Й' => 'JJ', 'К' => 'K',
    'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
    'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T',
    'У' => 'U', 'Ф' => 'F', 'Х' => 'KH', 'Ц' => 'C',
    'Ч' => 'CH', 'Ш' => 'SH', 'Щ' => 'SHH', 'Ъ' => '\'',
    'Ы' => 'Y', 'Ь' => '\'', 'Э' => 'EH', 'Ю' => 'JU',
    'Я' => 'JA'
);

/**
 * Month names in the genitive case; index 0 is January.
 */
private static $monthInGenitiveCase = array(
    'января',
    'февраля',
    'марта',
    'апреля',
    'мая',
    'июня',
    'июля',
    'августа',
    'сентября',
    'октября',
    'ноября',
    'декабря'
);

/**
 * Latin => Cyrillic mapping; stays empty until toRussian() fills it.
 */
private static $flippedLettersMapping = array();

/**
 * Outcome flag of the most recent detectEncoding() call, exposed via
 * isAmbiguousDetection().
 */
private static $ambiguousDetection = false;
00075         
/**
 * Chooses the noun form that grammatically agrees with a number.
 *
 * @param int   $number the quantity; its sign does not affect the result
 * @param array $cases  three forms indexed 0..2:
 *                      [0] for 1, 21, 31, ... (but not 11),
 *                      [1] for 2-4, 22-24, ... (but not 12-14),
 *                      [2] for everything else (0, 5-20, 25-30, ...)
 * @return mixed the selected element of $cases
 */
public static function selectCaseForNumber($number, $cases)
{
    // PHP's % operator keeps the sign of the dividend (-21 % 10 === -1),
    // so the remainders are normalised; otherwise every negative number
    // would fall through to the last form.
    $lastDigit = abs($number % 10);
    $lastTwoDigits = abs($number % 100);

    if ($lastDigit == 1 && $lastTwoDigits != 11) {
        return $cases[0];
    }

    // the teens (11-19) always take the last form, hence the range check
    $isTeen = ($lastTwoDigits >= 10 && $lastTwoDigits <= 20);

    if ($lastDigit > 1 && $lastDigit < 5 && !$isTeen) {
        return $cases[1];
    }

    return $cases[2];
}
00103         
/**
 * Returns the genitive-case name of a month.
 *
 * @param int $month month number, 1 for 'января' through 12 for 'декабря'
 * @return string
 */
public static function getMonthInGenitiveCase($month)
{
    // the lookup table is zero-based
    $index = $month - 1;

    return self::$monthInGenitiveCase[$index];
}
00112         
/**
 * Reverse of getMonthInGenitiveCase(): resolves a genitive-case month
 * name to its 1-based month number. The match is exact.
 *
 * @param string $string month name, e.g. 'января'
 * @return int month number, 1..12
 * @throws MissingElementException when the name is not in the table
 */
public static function getMonthByGenitiveCase($string)
{
    // name => zero-based index, built on first call and kept afterwards
    static $indexByName = null;

    if (!$indexByName) {
        $indexByName = array_flip(self::$monthInGenitiveCase);
    }

    if (!isset($indexByName[$string])) {
        throw new MissingElementException();
    }

    return $indexByName[$string] + 1;
}
00125         
/**
 * Returns the dictionary (nominative) form of a month name.
 *
 * @param int $month month number, 1 for 'январь' through 12 for 'декабрь'
 * @return string
 */
public static function getMonthInSubjectiveCase($month)
{
    static $names = array(
        'январь',
        'февраль',
        'март',
        'апрель',
        'май',
        'июнь',
        'июль',
        'август',
        'сентябрь',
        'октябрь',
        'ноябрь',
        'декабрь'
    );

    // the table is zero-based
    $index = $month - 1;

    return $names[$index];
}
00136         
/**
 * Returns the Russian name of a week day.
 *
 * Sunday sits at both index 0 and index 7, so either numbering of
 * Sunday resolves to the same name.
 *
 * @param int  $day  day index, 0..7
 * @param bool $full truthy for the capitalised full name, otherwise
 *                   the two-letter abbreviation
 * @return string
 */
public static function getDayOfWeek($day, $full = false)
{
    static $shortNames = array(
        'вс', 'пн', 'вт', 'ср',
        'чт', 'пт', 'сб', 'вс'
    );

    static $fullNames = array(
        'Воскресенье', 'Понедельник', 'Вторник', 'Среда',
        'Четверг', 'Пятница', 'Суббота', 'Воскресенье'
    );

    $names = $full ? $fullNames : $shortNames;

    return $names[$day];
}
00154         
/**
 * Renders a timestamp as a short Russian phrase relative to today.
 *
 * - within today:     'сегодня в G:i' (or just 'в G:i')
 * - within yesterday: 'вчера в G:i'
 * - anything else:    'j.m.Y в G:i'
 *
 * Each day window is half-open: the midnight that starts a day
 * belongs to that day.
 *
 * @param Timestamp $date          moment to describe
 * @param bool      $todayWordNeed pass anything but boolean true to
 *                                 omit the word 'сегодня'
 * @return string
 */
public static function getDateAsText(Timestamp $date, $todayWordNeed = true)
{
    $dayStart = Timestamp::makeToday();
    $tomorrowDayStart = $dayStart->spawn('+1 day');

    // The lower bounds are inclusive (>= 0). A strict "== 1" check left
    // a timestamp of exactly 00:00:00 outside both the "today" and the
    // "yesterday" windows, so it was printed as a full date.
    // assumes Timestamp::compare() returns negative / zero / positive
    // — TODO confirm against Timestamp
    if (
        Timestamp::compare($date, $dayStart) >= 0
        && Timestamp::compare($date, $tomorrowDayStart) < 0
    ) {
        $prefix = ($todayWordNeed === true) ? 'сегодня ' : null;

        return $prefix.'в '.date('G:i', $date->toStamp());
    }

    $yesterdayStart = $dayStart->spawn('-1 day');

    if (
        Timestamp::compare($date, $yesterdayStart) >= 0
        && Timestamp::compare($date, $dayStart) < 0
    ) {
        return 'вчера в '.date('G:i', $date->toStamp());
    }

    return date('j.m.Y в G:i', $date->toStamp());
}
00183         
/**
 * Formats a byte count for display in Russian.
 *
 * Sizes below 1024 are printed as a plain number followed by the
 * correctly declined word 'байт'. Larger sizes are delegated to
 * TextUtils::friendlyFileSize() with the Cyrillic unit prefixes, and
 * 'Б' is appended to its result.
 *
 * @param int $size      size in bytes
 * @param int $precision passed through to TextUtils::friendlyFileSize()
 * @return string
 */
public static function friendlyFileSize($size, $precision = 2)
{
    if ($size < 1024) {
        $unit = self::selectCaseForNumber(
            $size,
            array('байт', 'байта', 'байт')
        );

        return $size.' '.$unit;
    }

    $scaled = TextUtils::friendlyFileSize(
        $size,
        $precision,
        self::$bytePrefixes,
        true
    );

    return $scaled.'Б';
}
00196         
/**
 * Describes a date as 'сегодня', 'завтра' or '<day> <month in genitive>'.
 *
 * @param Date $date        date to describe
 * @param bool $wordDayNeed when loosely equal to true, today and
 *                          tomorrow are returned as words; otherwise
 *                          the numeric form is always used
 * @return string
 */
public static function getHumanDay(Date $date, $wordDayNeed = true)
{
    $today = Date::makeToday();
    $tomorrow = $today->spawn('+1 day');

    $wordsAllowed = ($wordDayNeed == true);

    if ($date->toDate() == $today->toDate() && $wordsAllowed) {
        return 'сегодня';
    }

    if ($date->toDate() == $tomorrow->toDate() && $wordsAllowed) {
        return 'завтра';
    }

    // the int cast drops any leading zero from the day number
    $dayNumber = (int) $date->getDay();
    $monthName = self::getMonthInGenitiveCase($date->getMonth());

    return $dayNumber.' '.$monthName;
}
00214         
/**
 * Transliterates Cyrillic letters into Latin ones using
 * self::$lettersMapping; all other characters pass through unchanged.
 *
 * @param string $sourceString text containing Cyrillic letters
 * @return string
 */
public static function toTranslit($sourceString)
{
    $mapping = self::$lettersMapping;

    return strtr($sourceString, $mapping);
}
00219         
/**
 * Converts text produced by toTranslit() back into Cyrillic.
 *
 * The conversion is best-effort: four Cyrillic letters (ъ, ь, Ъ, Ь)
 * share the apostrophe as their transliteration, so that information
 * cannot be recovered. An apostrophe is restored as lowercase 'ь'.
 *
 * @param string $sourceString transliterated text
 * @return string
 */
public static function toRussian($sourceString)
{
    if (!self::$flippedLettersMapping) {
        $flipped = array_flip(self::$lettersMapping);

        // array_flip() keeps the LAST key for a duplicated value, which
        // for the apostrophe is capital 'Ь' and yields words such as
        // "денЬ". Prefer the lowercase soft sign instead.
        if (isset($flipped['\''])) {
            $flipped['\''] = 'ь';
        }

        self::$flippedLettersMapping = $flipped;
    }

    // strtr() with an array tries the longest keys first, so 'shh'
    // wins over 'sh' and 's'
    return strtr($sourceString, self::$flippedLettersMapping);
}
00228         
/**
 * Guesses which of UTF-8, KOI8-R or WINDOWS-1251 the given bytes are
 * encoded in.
 *
 * The input is split into words; every byte offset of every word
 * yields one candidate letter pair per encoding (4 bytes for UTF-8,
 * 2 bytes for the single-byte encodings). Each pair is looked up in
 * CyrillicPairs::getTable() and its value is added to that encoding's
 * score. Presumably the table maps UTF-8 letter pairs to frequency
 * weights — confirm against CyrillicPairs.
 *
 * As a side effect the flag behind isAmbiguousDetection() is updated.
 *
 * @param string $data raw bytes to inspect
 * @return string 'KOI8-R', 'WINDOWS-1251', 'UTF-8', or 'ASCII' when
 *                no pair scored at all
 */
public static function detectEncoding($data)
{
    // per-encoding cache: raw pair => table value, or false when the
    // pair is absent from the table; survives between calls
    static $tables = array(
        'KOI8-R' => array(), 'WINDOWS-1251' => array()
    );

    $table = CyrillicPairs::getTable();

    $score = array('UTF-8' => 0, 'KOI8-R' => 0, 'WINDOWS-1251' => 0);

    // byte length of a two-letter pair in each candidate encoding
    $pairLengths = array('UTF-8' => 4, 'KOI8-R' => 2, 'WINDOWS-1251' => 2);

    $words = preg_split('~[\.\,\-\s\:\;\?\!\'\"\(\)\d<>]~', $data);

    foreach ($words as $word) {
        $wordLength = strlen($word);

        for ($offset = 0; $offset < $wordLength - 2; ++$offset) {
            foreach ($pairLengths as $encoding => $pairLengthBytes) {
                // NOTE(review): with ">=" the pair that ends exactly at
                // the end of the word is never scored — confirm this
                // is intended.
                if ($offset + $pairLengthBytes >= $wordLength) {
                    continue;
                }

                $pair = substr($word, $offset, $pairLengthBytes);

                $value = 0;

                if ($encoding === 'UTF-8') {
                    if (isset($table[$pair])) {
                        $value = $table[$pair];
                    }
                } elseif (isset($tables[$encoding][$pair])) {
                    // cache hit; a cached false adds nothing to the score
                    $value = $tables[$encoding][$pair];
                } else {
                    $utf8Pair = mb_convert_encoding(
                        $pair, 'UTF-8', $encoding
                    );

                    if (isset($table[$utf8Pair])) {
                        $value = $table[$utf8Pair];
                        $tables[$encoding][$pair] = $table[$utf8Pair];
                    } else {
                        $tables[$encoding][$pair] = false;
                    }
                }

                $score[$encoding] += $value;
            }
        }
    }

    $koi8Score = $score['KOI8-R'];
    $winScore = $score['WINDOWS-1251'];
    $utf8Score = $score['UTF-8'];

    // each ratio compares one score with the other two combined; the
    // "+ 1" keeps the denominator non-zero
    $koi8Ratio = $koi8Score / ($winScore + $utf8Score + 1);
    $winRatio = $winScore / ($koi8Score + $utf8Score + 1);
    $utf8Ratio = $utf8Score / ($koi8Score + $winScore + 1);

    $minRatio = 1.5;
    $doubtRatio = 1;

    // the detection is ambiguous when any ratio lies strictly between
    // the doubt threshold and the confidence threshold
    $isAmbiguous = false;

    foreach (array($koi8Ratio, $winRatio, $utf8Ratio) as $ratio) {
        if ($ratio > $doubtRatio && $ratio < $minRatio) {
            $isAmbiguous = true;
            break;
        }
    }

    self::$ambiguousDetection = $isAmbiguous;

    if ($koi8Ratio > $winRatio && $koi8Ratio > $utf8Ratio) {
        return 'KOI8-R';
    }

    if ($winRatio > $utf8Ratio) {
        return 'WINDOWS-1251';
    }

    if ($winRatio + $koi8Ratio + $utf8Ratio > 0) {
        return 'UTF-8';
    }

    return 'ASCII';
}
00325         
/**
 * Reports the ambiguity flag recorded by the most recent
 * detectEncoding() call; false before the first call.
 *
 * @return bool
 */
public static function isAmbiguousDetection()
{
    $isAmbiguous = self::$ambiguousDetection;

    return $isAmbiguous;
}
00330     }
00331 ?>