RussianTypograph.class.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2006-2009 by Konstantin V. Arkhipov                     *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU Lesser General Public License as        *
00007  *   published by the Free Software Foundation; either version 3 of the    *
00008  *   License, or (at your option) any later version.                       *
00009  *                                                                         *
00010  ***************************************************************************/
00011 
00017     final class RussianTypograph extends BaseFilter
00018     {
00019         const MAGIC_DELIMITER = '<>'; // brilliant!
00020         
00021         private static $symbols =
00022             array(
00023                 ' '        => ' ', // bovm
00024                 ' < '   => ' &lt; ',
00025                 ' > '   => ' &gt; ',
00026                 '…'       => '&hellip;',
00027                 '...'   => '&hellip;',
00028                 '™'       => '&trade;',
00029                 '(tm)'  => '&trade;',
00030                 '(TM)'  => '&trade;',
00031                 '©'        => '&copy;',
00032                 '(c)'   => '&copy;',
00033                 '(C)'   => '&copy;',
00034                 '№'       => '&#8470;',
00035                 '—'       => '&mdash;',
00036                 '–'       => '&mdash;',
00037                 '«'        => '&laquo;',
00038                 '»'        => '&raquo;',
00039                 '„'       => '&bdquo;',
00040                 '“'       => '&ldquo;',
00041                 '•'       => '&bull;',
00042                 '®'        => '&reg;',
00043                 '¼'        => '&frac14;',
00044                 '½'        => '&frac12;',
00045                 '¾'        => '&frac34;',
00046                 '±'        => '&plusmn;',
00047                 '+/-'   => '&plusmn;',
00048                 '!='    => '&ne;',
00049                 '<>'    => '&ne;',
00050                 
00051                 // just to avoid regexp's
00052                 ' 1/4'  => ' &frac14;',
00053                 ' 1/2'  => ' &frac12;',
00054                 ' 3/4'  => ' &frac34;',
00055                 '1/4 '  => '&frac14; ',
00056                 '1/2 '  => '&frac12; ',
00057                 '3/4 '  => '&frac34; '
00058             );
00059         
00060         private static $from = array(
00061             '~\-{2,}~',                         // --
00062             '~([\w\pL\pP]+)\s+\-\s+~u',         // foo - bar
00063             '~(\s)\s*~u',                       // n -> 2 whitespaces to process short strings (bar to a foo)
00064             '~([\s\pP]|^)([\w\pL]{1,2})\s~Uu',  // bar a foo | bar to a foo
00065             '~(&nbsp;|\s)\s+~u',                // compress whitespaces
00066             '~\"(.*)\"~e',                      // "qu"o"te"
00067             '~\"([^\s]*)\"~',                   // "quote"
00068             '~\"([^\s]*)\s+([^\s\.]*)\"~',      // "quote quote"
00069             '~([\w\pL\']+)~eu'                  // rock'n'roll
00070         );
00071         
00072         private static $to = array(
00073             '-',
00074             '$1&nbsp;&#151; ',
00075             '$1$1',
00076             '$1$2&nbsp;',
00077             '$1',
00078             '\'&laquo;\'.$this->innerQuotes(\'$1\').\'&raquo;\'',
00079             '&laquo;$1&raquo;',
00080             '&laquo;$1 $2&raquo;',
00081             'str_replace("\'", \'&#146;\', \'$1\')'
00082         );
00083         
00087         public static function me()
00088         {
00089             return Singleton::getInstance(__CLASS__);
00090         }
00091         
00092         public function apply($value)
00093         {
00094             if (!$value = trim(strtr($value, self::$symbols)))
00095                 return null;
00096             
00097             $list =
00098                 preg_split(
00099                     '~([^<>]*)(?![^<]*?>)~',
00100                     $value,
00101                     null,
00102                     PREG_SPLIT_DELIM_CAPTURE
00103                         | PREG_SPLIT_NO_EMPTY
00104                         | PREG_SPLIT_OFFSET_CAPTURE
00105                 );
00106             
00107             $tags = array();
00108             $text = null;
00109             
00110             foreach ($list as $row) {
00111                 $string = $row[0];
00112                 if (
00113                     (strpos($string, '<') === false)
00114                     && (strpos($string, '>') === false)
00115                 ) {
00116                     $text .= $string;
00117                 } else {
00118                     $tags[] = $string;
00119                     $text .= self::MAGIC_DELIMITER;
00120                 }
00121             }
00122             
00123             $text = $this->typographize($text);
00124             
00125             if ($tags) {
00126                 $i = 0;
00127                 $out = null;
00128                 
00129                 foreach (explode(self::MAGIC_DELIMITER, $text) as $chunk) {
00130                     $out .= $chunk;
00131                     
00132                     if (isset($tags[$i]))
00133                         $out .= $tags[$i++];
00134                 }
00135                 
00136                 return $out;
00137             }
00138             
00139             return CompressWhitespaceFilter::me()->apply($text);
00140         }
00141         
00142         private function typographize($text)
00143         {
00144             if (mb_strlen($text) < 2)
00145                 return $text;
00146             
00147             return
00148                 preg_replace(
00149                     self::$from,
00150                     self::$to,
00151                     stripslashes($text)
00152                 );
00153         }
00154         
00155         private function innerQuotes($text)
00156         {
00157             return
00158                 preg_replace(
00159                     array(
00160                         '~&laquo;(.*)&raquo;~U',
00161                         '~\"(.*)\"~U',
00162                     ),
00163                     '&#132;$1&#147;',
00164                     stripslashes($text)
00165                 );
00166         }
00167     }
00168 ?>