TidyValidator.class.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2007 by Sergey M. Skachkov                              *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU Lesser General Public License as        *
00007  *   published by the Free Software Foundation; either version 3 of the    *
00008  *   License, or (at your option) any later version.                       *
00009  *                                                                         *
00010  ***************************************************************************/
00011 
/**
 * Validates and repairs an HTML fragment with the tidy extension.
 *
 * The fragment is wrapped between a configurable document header and a
 * closing "</body></html>", parsed by tidy, repaired, and the resulting
 * <body> contents are stored back as the content. Tidy's diagnostics are
 * kept as a message string whose line numbers are shifted so that they
 * refer to lines of the fragment instead of lines of the wrapped document.
 *
 * Setters return $this so calls can be chained.
 */
final class TidyValidator
{
    // Fragment to validate; overwritten with the repaired markup by
    // validateContent().
    private $content        = null;

    // Diagnostics produced by the last validateContent() run, or null.
    private $messages       = null;

    // Counters reported by tidy for the last run; null before any run.
    private $errorCount     = null;
    private $warningCount   = null;

    // Options handed to tidy_parse_string() as its configuration.
    private $config             = array(
        'output-xhtml'      => true,
        'doctype'           => 'strict',
        'wrap'              => 0,
        'quote-marks'       => true,
        'drop-empty-paras'  => true
    );

    // Markup placed before the fragment; must leave a <body> element open
    // because validateContent() appends "</body></html>".
    private $header         = '
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml">
        <head>
            <title></title>
        </head>
        <body>';

    // Number of "\n"-separated lines in the header above (the leading empty
    // line counts); subtracted from the line numbers tidy reports.
    private $headerLines        = 7;

    // Encoding name passed to tidy_parse_string().
    private $encoding       = 'utf8';

    /**
     * Static factory, convenient for call chaining.
     *
     * @return TidyValidator
     */
    public static function create()
    {
        return new self;
    }

    /**
     * Sets the fragment to validate.
     *
     * @param string $content markup that goes inside <body>
     * @return TidyValidator
     */
    public function setContent($content)
    {
        $this->content = $content;

        return $this;
    }

    /**
     * @return string|null the fragment; after validateContent() has run,
     *                     the repaired fragment
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * @return string|null newline-separated diagnostics of the last run,
     *                     null when there were none
     */
    public function getMessages()
    {
        return $this->messages;
    }

    /**
     * Replaces (does not merge with) the tidy configuration.
     *
     * @param array $config tidy options, passed to tidy_parse_string() as is
     * @return TidyValidator
     */
    public function setConfig($config)
    {
        $this->config = $config;

        return $this;
    }

    /**
     * @return array current tidy configuration
     */
    public function getConfig()
    {
        return $this->config;
    }

    /**
     * Sets the markup that precedes the fragment and recomputes the line
     * offset used to translate tidy's line numbers.
     *
     * @param string $header document preamble ending with an open <body>
     * @return TidyValidator
     */
    public function setHeader($header)
    {
        $this->header = $header;
        $this->headerLines = count(explode("\n", $header));

        return $this;
    }

    /**
     * @return string markup placed before the fragment
     */
    public function getHeader()
    {
        return $this->header;
    }

    /**
     * @param string $encoding encoding name handed to tidy_parse_string()
     * @return TidyValidator
     */
    public function setEncoding($encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }

    /**
     * @return string encoding name handed to tidy_parse_string()
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * @return int|null error count of the last run, null before any run
     */
    public function getErrorCount()
    {
        return $this->errorCount;
    }

    /**
     * @return int|null warning count of the last run, null before any run
     */
    public function getWarningCount()
    {
        return $this->warningCount;
    }

    /**
     * Runs tidy over the content and stores the outcome on this object.
     *
     * Steps: wrap the fragment in header and footer, parse it, record the
     * error and warning counts, collect diagnostics, repair the document,
     * take what is inside <body>, replace a fixed set of typographic
     * characters with HTML entities, and finally compare the fragment
     * before and after (ignoring whitespace) to warn when tidy changed it.
     *
     * When $content is empty and no content has been set before, nothing
     * is done.
     *
     * Assert is defined outside this file; it presumably raises when the
     * condition is false (tidy could not parse the input, or the repaired
     * document has no <body>...</body>).
     *
     * @param string|null $content fragment to validate; when empty, the
     *                             content set earlier is used
     * @return TidyValidator
     */
    public function validateContent($content = null)
    {
        // NOTE(review): the en dash is mapped to &mdash;, not &ndash; --
        // kept as is, confirm whether that is intentional.
        static $symbols = array(
            '…'       => '&hellip;',
            '™'       => '&trade;',
            '©'        => '&copy;',
            '№'       => '&#8470;',
            '—'       => '&mdash;',
            '–'       => '&mdash;',
            '«'        => '&laquo;',
            '»'        => '&raquo;',
            '„'       => '&bdquo;',
            '“'       => '&ldquo;',
            '•'       => '&bull;',
            '®'        => '&reg;',
            '¼'        => '&frac14;',
            '½'        => '&frac12;',
            '¾'        => '&frac34;',
            '±'        => '&plusmn;'
        );

        if ($content) {
            $this->setContent($content);
        } elseif (!$this->getContent()) {
            return $this;
        }

        $original = $this->getContent();

        $tidy = tidy_parse_string(
            $this->getHeader()."\n".$original."\n</body></html>",
            $this->getConfig(),
            $this->getEncoding()
        );

        // tidy_parse_string() returns false on failure; stop here rather
        // than feeding false to the tidy_* calls below.
        Assert::isTrue($tidy !== false);

        $this->errorCount = tidy_error_count($tidy);
        $this->warningCount = tidy_warning_count($tidy);

        // Read the diagnostics before repairing, as the original code did.
        $out = $this->formatErrorBuffer(tidy_get_error_buffer($tidy));

        $tidy->cleanRepair();

        $matches = array();

        preg_match('/<body>(.*)<\/body>/s', tidy_get_output($tidy), $matches);

        Assert::isTrue(isset($matches[1]));

        $repaired = strtr($matches[1], $symbols);

        if (
            $this->whitespaceFreeChecksum($original)
            !== $this->whitespaceFreeChecksum($repaired)
        ) {
            $openingTag = '<[\t ]*p[\t ]*>';
            $closingTag = '<[\t ]*\/[\t ]*p[\t ]*>';

            $paragraphsChanged =
                (
                    $this->countTags($openingTag, $original)
                    != $this->countTags($openingTag, $repaired)
                ) || (
                    $this->countTags($closingTag, $original)
                    != $this->countTags($closingTag, $repaired)
                );

            if ($paragraphsChanged) {
                $out =
                    ($out === null ? '' : $out."\n\n")
                    .'Paragraphs have been changed, please review content';
            } elseif ($out === null) {
                // Only mention the generic change when tidy itself had
                // nothing to report.
                $out = 'Content has been changed, please review';
            }
        }

        $this->messages = $out;
        $this->content = $repaired;

        return $this;
    }

    /**
     * Turns tidy's raw error buffer into the message string.
     *
     * The buffer is HTML-escaped and split into lines. Lines of the form
     * "line N column ..." get N reduced by the number of header lines, so
     * the number refers to the fragment. Other non-blank lines are kept
     * unchanged; blank lines are dropped.
     *
     * @param string|false|null $rawMessages result of tidy_get_error_buffer()
     * @return string|null lines joined with "\n", null when nothing remains
     */
    private function formatErrorBuffer($rawMessages)
    {
        if (empty($rawMessages)) {
            return null;
        }

        $lines = array();

        foreach (explode("\n", htmlspecialchars($rawMessages)) as $string) {
            $parts = array();

            if (preg_match('/^line (\d+) column (.*)$/s', $string, $parts)) {
                $lines[] =
                    'line '
                    .((int) $parts[1] - $this->headerLines)
                    .' column '.$parts[2];
            } elseif (trim($string) !== '') {
                $lines[] = $string;
            }
        }

        return $lines ? implode("\n", $lines) : null;
    }

    /**
     * CRC32 of the text with tabs, line breaks, NUL bytes and spaces
     * removed, used to compare markup regardless of formatting.
     *
     * @param string $text
     * @return int
     */
    private function whitespaceFreeChecksum($text)
    {
        return crc32(preg_replace('/[\t\n\r\0 ]/', '', $text));
    }

    /**
     * Counts case-insensitive matches of a tag pattern in a text.
     *
     * @param string $tag  regular expression body without delimiters; "/"
     *                     must be escaped by the caller
     * @param string $text text to search
     * @return int number of matches, 0 when none
     */
    private function countTags($tag, $text)
    {
        $matches = array();

        if (preg_match_all("/$tag/i", $text, $matches))
            return count($matches[0]);

        return 0;
    }
}
00289 ?>