HtmlTokenizer.class.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2007-2008 by Ivan Y. Khvostishkov                       *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU Lesser General Public License as        *
00007  *   published by the Free Software Foundation; either version 3 of the    *
00008  *   License, or (at your option) any later version.                       *
00009  *                                                                         *
00010  ***************************************************************************/
00011 
00015     final class HtmlTokenizer
00016     {
// --- tokenizer states ------------------------------------------------
const INITIAL_STATE = 1;
const START_TAG_STATE = 2;
const END_TAG_STATE = 3;
const INSIDE_TAG_STATE = 4;
const ATTR_NAME_STATE = 5;
const WAITING_EQUAL_SIGN_STATE = 6;
const ATTR_VALUE_STATE = 7;

const CDATA_STATE = 8; // <![CDATA[ ... ]]>
const COMMENT_STATE = 9; // <!-- ... -->
const INLINE_TAG_STATE = 10; // script, style
const EXTERNAL_TAG_STATE = 11; // <?php ... ? >
const DOCTYPE_TAG_STATE = 12;

// terminal state: nextToken() returns null once it is reached
const FINAL_STATE = 42;

// --- regex character classes used by the is*Char()/isValidId() helpers --
const SPACER_MASK = '[ \r\n\t]';
const ID_FIRST_CHAR_MASK = '[A-Za-z]';
const ID_CHAR_MASK = '[-_:.A-Za-z0-9]';

// tag ids checked by isInlineTag()
private $inlineTags = array('style', 'script', 'textarea');

// input the characters are read from
private $stream = null;

// current character; null once the stream is exhausted
private $char = null;

// for logging: position bookkeeping maintained by getNextChar()
private $line = 1;
private $linePosition = 1;
private $previousChar = null;

// snapshot taken by mark() and restored by reset()
private $mark = null;

// current state of the machine, advanced by nextToken()
private $state = self::INITIAL_STATE;

// every token published by makeTag(), in order
private $tags = array();
// returned by getErrors()
private $errors = array();

// text collected outside of tags until dumpBuffer() turns it into Cdata
private $buffer = null;

// id of the tag currently being read
private $tagId = null;

// token under construction / token finished by the current nextToken()
// call / token returned by the previous nextToken() call
private $tag = null;
private $completeTag = null;
private $previousTag = null;

// attribute currently being read
private $attrName = null;
private $attrValue = null;
// NOTE(review): never assigned in the visible part of the class;
// presumably the quote character attrValueState() is inside of - confirm
private $insideQuote = null;

// NOTE(review): not referenced in the visible part of the class
private $substringFound = false;

// options, changed through the fluent setters of the same names
private $suppressWhitespaces = false;
private $lowercaseAttributes = false;
private $lowercaseTags = false;
00072         
/**
 * Binds the tokenizer to its input and loads the first character.
 *
 * @param InputStream $stream source of the characters to tokenize
 */
public function __construct(InputStream $stream)
{
    $this->stream = $stream;

    // prime $this->char so that the state handlers start with a
    // current character (or null for an empty stream)
    $this->getNextChar();
}
00079         
/**
 * Static factory, equivalent to calling the constructor directly.
 *
 * @param InputStream $stream source of the characters to tokenize
 * @return HtmlTokenizer
 */
public static function create(InputStream $stream)
{
    $tokenizer = new self($stream);

    return $tokenizer;
}
00087         
/**
 * Switches whitespace suppression on or off.
 *
 * When the flag is on, makeTag() runs every Cdata token through
 * removeWhitespaces() before publishing it.
 *
 * @param bool $isSuppressWhitespaces checked with Assert::isBoolean()
 * @return HtmlTokenizer $this, for call chaining
 */
public function suppressWhitespaces($isSuppressWhitespaces)
{
    Assert::isBoolean($isSuppressWhitespaces);

    $this->suppressWhitespaces = $isSuppressWhitespaces;

    return $this;
}
00099         
/**
 * Stores the "lowercase attribute names" option.
 *
 * NOTE(review): nothing in the visible part of the class reads this
 * flag - confirm whether it is honoured anywhere.
 *
 * @param bool $isLowercaseAttributes checked with Assert::isBoolean()
 * @return HtmlTokenizer $this, for call chaining
 */
public function lowercaseAttributes($isLowercaseAttributes)
{
    Assert::isBoolean($isLowercaseAttributes);

    $this->lowercaseAttributes = $isLowercaseAttributes;

    return $this;
}
00111         
/**
 * Stores the "lowercase tag ids" option.
 *
 * When the flag is on, createOpenTag() lowercases valid open-tag ids and
 * dumpEndTag() lowercases end-tag ids.
 *
 * @param bool $isLowercaseTags checked with Assert::isBoolean()
 * @return HtmlTokenizer $this, for call chaining
 */
public function lowercaseTags($isLowercaseTags)
{
    Assert::isBoolean($isLowercaseTags);

    $this->lowercaseTags = $isLowercaseTags;

    return $this;
}
00123         
/**
 * Runs the state machine until one token is complete or input ends.
 *
 * @return mixed the token published by makeTag() during this call, or
 *               null when the machine finished without producing one
 * @throws WrongStateException when the final state is reached while a
 *                             character is still pending
 */
public function nextToken()
{
    if ($this->state == self::FINAL_STATE) {
        return null;
    }

    $this->completeTag = null;

    // the state is not final and no token is complete at this point,
    // so at least one step is always executed
    do {
        $this->state = $this->handleState();
    } while ($this->state != self::FINAL_STATE && !$this->completeTag);

    $finished = ($this->state == self::FINAL_STATE);

    if ($finished && $this->char !== null) {
        throw new WrongStateException('state machine is broken');
    }

    // remembered so that handleState() can detect inline tags
    $this->previousTag = $this->completeTag;

    return $this->completeTag;
}
00144         
/**
 * Exposes the collected problems.
 *
 * @return array current content of $this->errors
 */
public function getErrors()
{
    $collected = $this->errors;

    return $collected;
}
00149         
/**
 * Tells whether $char may start a tag or attribute id.
 *
 * The tokenizer represents end of input as null and does ask this
 * question at that point (a '<' directly before end of input), so null
 * is answered explicitly instead of being handed to preg_match().
 *
 * @param string|null $char character to test
 * @return bool true when $char matches ID_FIRST_CHAR_MASK
 */
public static function isIdFirstChar($char)
{
    if ($char === null) {
        return false;
    }

    return (preg_match('/'.self::ID_FIRST_CHAR_MASK.'/', $char) > 0);
}
00154         
/**
 * Tells whether $char may appear inside a tag or attribute id.
 *
 * null (the tokenizer's end-of-input marker) is answered explicitly
 * instead of being handed to preg_match().
 *
 * @param string|null $char character to test
 * @return bool true when $char matches ID_CHAR_MASK
 */
public static function isIdChar($char)
{
    if ($char === null) {
        return false;
    }

    return (preg_match('/'.self::ID_CHAR_MASK.'/', $char) > 0);
}
00159         
/**
 * Validates a complete tag or attribute id.
 *
 * @param string $id candidate id
 * @return bool true when $id is one ID_FIRST_CHAR_MASK character
 *              followed by any number of ID_CHAR_MASK characters
 */
public static function isValidId($id)
{
    $pattern = '/^'.self::ID_FIRST_CHAR_MASK.self::ID_CHAR_MASK.'*$/';

    return (preg_match($pattern, $id) > 0);
}
00169         
/**
 * Tells whether $char is a spacer (space, CR, LF or tab).
 *
 * null (the tokenizer's end-of-input marker) is answered explicitly
 * instead of being handed to preg_match().
 *
 * @param string|null $char character to test
 * @return bool true when $char matches SPACER_MASK
 */
public static function isSpacerChar($char)
{
    if ($char === null) {
        return false;
    }

    return (preg_match('/'.self::SPACER_MASK.'/', $char) > 0);
}
00174         
/**
 * Collapses the leading and the trailing spacer run of a Cdata token.
 *
 * Each run is replaced by a single space (not removed). The Cdata is
 * modified in place unless the result is empty.
 *
 * @param Cdata $cdata token to normalize
 * @return Cdata|null the same instance with new data, or null when the
 *                    resulting string is '' or null
 */
public static function removeWhitespaces(Cdata $cdata)
{
    // patterns are applied in order: leading run first, then trailing
    $edges = array(
        '/^'.self::SPACER_MASK.'+/',
        '/'.self::SPACER_MASK.'+$/'
    );

    $collapsed = preg_replace($edges, ' ', $cdata->getData());

    if ($collapsed === null || $collapsed === '') {
        return null;
    }

    $cdata->setData($collapsed);

    return $cdata;
}
00198         
/**
 * Tells whether $id names one of the tags listed in $this->inlineTags.
 *
 * HTML tag names are case-insensitive and tag ids are only lowercased
 * by the tokenizer when the lowercaseTags option is on, so the id is
 * lowercased here before the (strict) lookup.
 *
 * @param string $id tag id
 * @return bool true for an inline tag; false for anything else,
 *              including non-string input
 */
public function isInlineTag($id)
{
    if (!is_string($id)) {
        return false;
    }

    return in_array(strtolower($id), $this->inlineTags, true);
}
00203         
/**
 * Lowercases $string when $ignoreCase is set, returns it as is otherwise.
 *
 * dumpEndTag() may pass a null id (for "</>"); the cast keeps the
 * previous result ('') without handing null to strtolower().
 *
 * @param string|null $string value to convert
 * @param bool $ignoreCase whether to lowercase
 * @return string|null
 */
private static function optionalLowercase($string, $ignoreCase)
{
    if (!$ignoreCase) {
        return $string;
    }

    return strtolower((string) $string);
}
00211         
/**
 * Reads the next character from the stream into $this->char and keeps
 * the line/column bookkeeping up to date.
 *
 * CR, LF and CRLF each count as one line break. The LF that completes
 * a CRLF pair changes neither the line nor the column, so columns are
 * the same for LF-only and CRLF input.
 *
 * @return string|null the character read, or null at end of input
 */
private function getNextChar()
{
    $this->char = $this->stream->read(1);

    if ($this->char === null) {
        return null;
    }

    $isCr = ($this->char == "\r");
    $isLf = ($this->char == "\n");

    if ($isCr || ($isLf && $this->previousChar != "\r")) {
        ++$this->line;
        $this->linePosition = 1;
    } elseif (!$isLf) {
        ++$this->linePosition;
    }
    // remaining case: LF right after CR - the break was already counted

    $this->previousChar = $this->char;

    return $this->char;
}
00233         
/**
 * Consumes up to $count characters, starting with the current one.
 *
 * @param int $count maximum number of characters to take
 * @return string|null the characters taken; null when none were taken
 *                     (end of input or $count <= 0)
 */
private function getChars($count)
{
    $taken = null;

    for ($left = $count; $left > 0 && $this->char !== null; --$left) {
        $taken .= $this->char;

        $this->getNextChar();
    }

    return $taken;
}
00248         
/**
 * Remembers the current reading position so that reset() can return
 * to it: the tokenizer's own bookkeeping plus the stream's mark.
 *
 * @return HtmlTokenizer $this
 */
private function mark()
{
    $snapshot = array(
        $this->char,
        $this->previousChar,
        $this->line,
        $this->linePosition
    );

    $this->mark = $snapshot;

    $this->stream->mark();

    return $this;
}
00263         
/**
 * Returns to the position remembered by the last mark() call.
 *
 * Asserts (Assert::isNotNull) that mark() has been called before.
 *
 * @return HtmlTokenizer $this
 */
private function reset()
{
    Assert::isNotNull($this->mark);

    // same element order as written by mark()
    $saved = $this->mark;

    $this->char = $saved[0];
    $this->previousChar = $saved[1];
    $this->line = $saved[2];
    $this->linePosition = $saved[3];

    $this->stream->reset();

    return $this;
}
00280         
/**
 * Advances the input by $count characters, discarding them.
 *
 * @param int $count number of getNextChar() calls to make
 * @return HtmlTokenizer $this
 */
private function skip($count)
{
    $remaining = $count;

    while ($remaining > 0) {
        $this->getNextChar();

        --$remaining;
    }

    return $this;
}
00291         
/**
 * Peeks at the stream without consuming anything.
 *
 * Uses the stream's own mark()/reset() pair around the read.
 *
 * @param int $count number of characters requested from the stream
 * @return mixed whatever $this->stream->read($count) returned
 */
private function lookAhead($count)
{
    $this->stream->mark();

    $peeked = $this->stream->read($count);

    $this->stream->reset();

    return $peeked;
}
00302         
/**
 * Consumes $string if the input continues with it.
 *
 * On a mismatch the position saved at entry is restored, so nothing is
 * consumed (including the spacers skipped on request).
 *
 * @param string $string text expected at the current position
 * @param bool $skipSpaces whether to skip leading spacer characters first
 * @return bool true when $string was found and consumed
 */
private function skipString($string, $skipSpaces = false)
{
    $this->mark();

    while (
        $skipSpaces
        && $this->char !== null
        && self::isSpacerChar($this->char)
    ) {
        $this->getNextChar();
    }

    $found = ($this->getChars(strlen($string)) === $string);

    if (!$found) {
        $this->reset();
    }

    return $found;
}
00324         
/**
 * Finishes the token under construction.
 *
 * The token is appended to $this->tags and published as
 * $this->completeTag, except for one case: whitespace suppression is
 * on, the token is a Cdata and removeWhitespaces() returns null for it.
 * removeWhitespaces() is only called in that suppression case and
 * rewrites the Cdata in place.
 *
 * Asserts that a token exists and that no attribute or quote is pending.
 *
 * @return HtmlTokenizer $this
 */
private function makeTag()
{
    Assert::isNotNull($this->tag);

    Assert::isNull($this->attrName);
    Assert::isNull($this->attrValue);

    Assert::isNull($this->insideQuote);

    // short-circuit order matters: removeWhitespaces() has side effects
    $dropped =
        $this->suppressWhitespaces
        && $this->tag instanceof Cdata
        && (self::removeWhitespaces($this->tag) === null);

    if (!$dropped) {
        $this->completeTag = $this->tag;
        $this->tags[] = $this->completeTag;
    }

    $this->tag = null;
    $this->tagId = null;

    return $this;
}
00348         
/**
 * Makes $tag the token under construction and gives it the id that
 * has been accumulated in $this->tagId, which is then cleared.
 *
 * Asserts that no token is under construction and that an id exists.
 *
 * @param SgmlTag $tag freshly created tag object
 * @return mixed the value returned by $tag->setId(), now in $this->tag
 */
private function setupTag(SgmlTag $tag)
{
    Assert::isNull($this->tag);
    Assert::isNotNull($this->tagId);

    $id = $this->tagId;

    $this->tag = $tag->setId($id);

    $this->tagId = null;

    return $this->tag;
}
00363         
/**
 * Executes the handler of the current state.
 *
 * INITIAL_STATE has two handlers: inlineTagState() when the previously
 * returned token is an SgmlOpenTag whose id is an inline tag,
 * outsideTagState() otherwise.
 *
 * @return int the next state, as returned by the handler
 * @throws WrongStateException for a state that has no handler here
 */
private function handleState()
{
    if ($this->state == self::INITIAL_STATE) {
        $previous = $this->previousTag;

        $afterInlineTag =
            $previous instanceof SgmlOpenTag
            && $this->isInlineTag($previous->getId());

        return
            $afterInlineTag
                ? $this->inlineTagState()
                : $this->outsideTagState();
    }

    $handlers = array(
        self::START_TAG_STATE           => 'startTagState',
        self::END_TAG_STATE             => 'endTagState',
        self::INSIDE_TAG_STATE          => 'insideTagState',
        self::ATTR_NAME_STATE           => 'attrNameState',
        self::WAITING_EQUAL_SIGN_STATE  => 'waitingEqualSignState',
        self::ATTR_VALUE_STATE          => 'attrValueState',
        self::CDATA_STATE               => 'cdataState',
        self::COMMENT_STATE             => 'commentState',
        self::EXTERNAL_TAG_STATE        => 'externalTagState',
        self::DOCTYPE_TAG_STATE         => 'doctypeTagState'
    );

    if (isset($handlers[$this->state])) {
        $handler = $handlers[$this->state];

        return $this->$handler();
    }

    throw new WrongStateException('state machine is broken');
}
00410         
/**
 * Turns the collected text, if any, into a Cdata token.
 *
 * Does nothing when the buffer is null; otherwise the buffer is wrapped
 * into a Cdata, cleared, and the token is finished through makeTag().
 *
 * @return HtmlTokenizer $this
 */
private function dumpBuffer()
{
    if ($this->buffer === null) {
        return $this;
    }

    $this->tag = Cdata::create()->setData($this->buffer);

    $this->buffer = null;

    $this->makeTag();

    return $this;
}
00426         
/**
 * Recognizes the openers of CDATA sections and comments.
 *
 * Expects the current character to be the one right after '<'. A
 * recognized opener is consumed by skipString().
 *
 * @return int|null CDATA_STATE for "![CDATA[", COMMENT_STATE for "!--",
 *                  null when neither follows
 */
private function checkSpecialTagState()
{
    if ($this->char != '!') {
        return null;
    }

    if ($this->skipString('![CDATA[')) {
        return self::CDATA_STATE;
    }

    if ($this->skipString('!--')) {
        return self::COMMENT_STATE;
    }

    return null;
}
00445         
00446         // INITIAL_STATE
/**
 * INITIAL_STATE handler: collects text until a tag starts.
 *
 * Text goes to $this->buffer. On '<' the following character decides:
 *  - a letter, '?' or '!': the buffer is flushed as Cdata; CDATA and
 *    comment openers switch to their own states, anything else starts
 *    a tag id and switches to START_TAG_STATE;
 *  - '/': the buffer is flushed, END_TAG_STATE follows;
 *  - anything else (digit, non-latin letter, spacer, '>', another '<',
 *    end of input): a warning is recorded and the '<' becomes text.
 *    The character after it is NOT consumed but examined again, so a
 *    tag directly following a stray '<' (as in "<<b>") is still seen.
 *
 * @return int next state; FINAL_STATE when the input is exhausted
 */
private function outsideTagState()
{
    Assert::isNull($this->tag);
    Assert::isNull($this->tagId);

    Assert::isNull($this->attrName);
    Assert::isNull($this->attrValue);

    Assert::isNull($this->insideQuote);

    while ($this->char !== null) {

        if ($this->char != '<') {
            $this->buffer .= $this->char;
            $this->getNextChar();

            continue;
        }

        // '<' found, classify by the character behind it
        $this->getNextChar();

        $startsTag =
            // null means end of input and must not reach the regex
            ($this->char !== null && self::isIdFirstChar($this->char))
            || $this->char == '?'
            || $this->char == '!';

        if ($startsTag) {
            $this->dumpBuffer();

            // TODO: handle at start tag state
            $specialTagState = $this->checkSpecialTagState();

            if ($specialTagState !== null) {
                // comment, cdata
                return $specialTagState;
            }

            $this->tagId = $this->char;

            $this->getNextChar();

            return self::START_TAG_STATE;
        }

        if ($this->char == '/') {
            // </

            $this->dumpBuffer();

            $this->getNextChar();

            return self::END_TAG_STATE;
        }

        // <2, <[non-latin letter], <[space], <>, <<, <[eof]

        $this->warning(
            'incorrect start-tag, treating it as cdata'
        );

        // only the '<' is text for sure; the current character is
        // left for the next iteration (it may be another '<')
        $this->buffer .= '<';
    }

    $this->dumpBuffer();

    return self::FINAL_STATE;
}
00519         
/**
 * Starts an SgmlOpenTag token with the accumulated tag id.
 *
 * An invalid id is reported through error() and used unchanged; a valid
 * id is lowercased when the lowercaseTags option is on.
 *
 * @return mixed the tag under construction, as returned by setupTag()
 */
private function createOpenTag()
{
    if (self::isValidId($this->tagId)) {

        if ($this->lowercaseTags) {
            $this->tagId = strtolower($this->tagId);
        }

    } else {
        $this->error("tag id '{$this->tagId}' is invalid");
    }

    return $this->setupTag(SgmlOpenTag::create());
}
00532         
00533         // START_TAG_STATE
/**
 * START_TAG_STATE handler: reads the rest of a tag id.
 *
 * Entered with the first id character already stored in $this->tagId.
 *  - '>' ends the tag: an open tag is published, INITIAL_STATE follows;
 *  - a spacer ends the id: ids starting with '?' (except "?xml") become
 *    an SgmlIgnoredTag with end mark '?' (EXTERNAL_TAG_STATE), the id
 *    "!DOCTYPE" (any case) becomes an SgmlIgnoredTag
 *    (DOCTYPE_TAG_STATE), every other id becomes an open tag
 *    (INSIDE_TAG_STATE). The spacer is consumed only in the last case;
 *  - "/>" publishes an empty open tag, INITIAL_STATE follows;
 *  - any other character is appended to the id.
 *
 * @return int next state; FINAL_STATE when input ends inside the id
 */
private function startTagState()
{
    Assert::isNull($this->tag);
    Assert::isNotNull($this->tagId); // strlen(tagId) == 1

    Assert::isNull($this->attrName);
    Assert::isNull($this->attrValue);

    Assert::isNull($this->insideQuote);

    while ($this->char !== null) {

        if ($this->char == '>') {
            // <b>, <divgarbage>

            $this->createOpenTag();

            $this->makeTag();

            $this->getNextChar();

            return self::INITIAL_STATE;
        }

        if (self::isSpacerChar($this->char)) {
            // <p[space], <divgarbage[space], <?php[space],
            // <?xml[space], <!DOCTYPE[space]

            $isExternal =
                ($this->tagId[0] == '?')
                && ($this->tagId != '?xml');

            if ($isExternal) {
                $this->setupTag(
                    SgmlIgnoredTag::create()->
                    setEndMark('?')
                );

                // the spacer stays unread for the external tag handler
                return self::EXTERNAL_TAG_STATE;
            }

            if (strtoupper($this->tagId) == '!DOCTYPE') {
                $this->setupTag(SgmlIgnoredTag::create());

                // the spacer stays unread for the doctype handler
                return self::DOCTYPE_TAG_STATE;
            }

            $this->createOpenTag();

            $this->getNextChar();

            return self::INSIDE_TAG_STATE;
        }

        $current = $this->char;

        $this->getNextChar();

        if ($current == '/' && $this->char == '>') {
            // <br/>

            $this->createOpenTag()->setEmpty(true);

            $this->makeTag();

            $this->getNextChar();

            return self::INITIAL_STATE;
        }

        $this->tagId .= $current;
    }

    // ... <tag[end-of-file]

    $this->error('unexpected end of file, tag id is incomplete');

    $this->createOpenTag();

    $this->makeTag();

    return self::FINAL_STATE;
}
00618         
/**
 * Publishes an SgmlEndTag token for the accumulated tag id.
 *
 * A missing id ("</>") is recorded as a warning, an id that fails
 * isValidId() as an error; the token is published in both cases. The
 * emptiness test is explicit so that the id "0" is validated (and
 * reported as invalid) instead of being mistaken for a missing id.
 * The id is lowercased when the lowercaseTags option is on.
 *
 * @return HtmlTokenizer $this
 */
private function dumpEndTag()
{
    if ($this->tagId === null || $this->tagId === '') {
        // </>
        $this->warning('empty end-tag, storing with empty id');

    } elseif (!self::isValidId($this->tagId)) {

        $this->error("end-tag id '{$this->tagId}' is invalid");
    }

    $this->tag = SgmlEndTag::create()->
        setId(
            self::optionalLowercase($this->tagId, $this->lowercaseTags)
        );

    $this->makeTag();

    return $this;
}
00642         
00643         // END_TAG_STATE
/**
 * END_TAG_STATE handler: reads an end tag after "</" up to '>'.
 *
 * Characters up to the first spacer form the tag id; everything between
 * that spacer and the closing '>' is discarded.
 *
 * @return int INITIAL_STATE after '>', FINAL_STATE when input ends first
 *             (an error is recorded and the end tag is still published)
 */
private function endTagState()
{
    Assert::isNull($this->tag);

    Assert::isTrue(
        $this->tagId === null
        || $this->char == '>'
        || self::isSpacerChar($this->char)
    );

    Assert::isNull($this->attrName);
    Assert::isNull($this->attrValue);

    Assert::isNull($this->insideQuote);

    $idFinished = false;

    while ($this->char !== null) {

        if ($this->char == '>') {

            $this->dumpEndTag();

            $this->getNextChar();

            return self::INITIAL_STATE;
        }

        if (!$idFinished) {

            if (self::isSpacerChar($this->char)) {
                // most browsers parse end-tag until next '>' char
                $idFinished = true;
            } else {
                $this->tagId .= $this->char;
            }
        }

        $this->getNextChar();
    }

    // ... </[end-of-file], </sometag[eof]

    // NOTE: opera treats </[eof] as cdata, firefox as tag
    $this->error("unexpected end of file, end-tag is incomplete");

    $this->dumpEndTag();

    return self::FINAL_STATE;
}
00701         
00702         // INSIDE_TAG_STATE
/**
 * INSIDE_TAG_STATE handler: between attributes of an open tag.
 *
 * Spacers are skipped, then the first other character decides:
 *  - '>' publishes the tag, INITIAL_STATE follows;
 *  - '=' records an error (attribute without a name) and switches to
 *    ATTR_VALUE_STATE;
 *  - "/>" marks the tag empty and publishes it, INITIAL_STATE follows;
 *  - anything else becomes the first character of an attribute name,
 *    ATTR_NAME_STATE follows.
 *
 * @return int next state; FINAL_STATE when input ends (an error is
 *             recorded and the incomplete tag is published)
 */
private function insideTagState()
{
    Assert::isNull($this->tagId);

    Assert::isNull($this->attrName);
    Assert::isNull($this->attrValue);

    Assert::isNotNull($this->tag);
    Assert::isTrue($this->tag instanceof SgmlOpenTag);

    Assert::isNull($this->insideQuote);

    while (
        $this->char !== null
        && self::isSpacerChar($this->char)
    ) {
        $this->getNextChar();
    }

    if ($this->char === null) {
        // <tag [eof], <tag id=val [eof]

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    if ($this->char == '>') {
        // <tag ... >

        $this->makeTag();

        $this->getNextChar();

        return self::INITIAL_STATE;
    }

    if ($this->char == '=') {

        // most browsers' behaviour
        $this->error(
            'unexpected equal sign, attr name considered empty'
        );

        $this->getNextChar();

        // call?
        return self::ATTR_VALUE_STATE;
    }

    $first = $this->char;

    $this->getNextChar();

    if ($first == '/' && $this->char == '>') {
        // <tag />, <tag id=value />

        $this->tag->setEmpty(true);

        $this->makeTag();

        $this->getNextChar();

        return self::INITIAL_STATE;
    }

    $this->attrName = $first;

    // call?
    return self::ATTR_NAME_STATE;
}
00774         
/**
 * Stores the pending attribute on the tag under construction and
 * clears $this->attrName / $this->attrValue.
 *
 * A present name is validated: an invalid one is recorded as an error
 * and kept as is, a valid one is lowercased. The presence test is
 * explicit so that the name "0" is validated (and reported as invalid)
 * rather than skipped. A null or empty value is recorded as a warning.
 *
 * NOTE(review): valid names are lowercased regardless of the
 * lowercaseAttributes option; kept that way because honouring the
 * option would change the default output - confirm the intent.
 *
 * @return HtmlTokenizer $this
 */
private function dumpAttribute()
{
    if ($this->attrName !== null && $this->attrName !== '') {

        if (!self::isValidId($this->attrName))
            $this->error("attribute name '{$this->attrName}' is invalid");
        else
            $this->attrName = strtolower($this->attrName);

    }

    if ($this->attrValue === null || $this->attrValue === '')
        $this->warning("empty value for attr == '{$this->attrName}'");

    $this->tag->setAttribute($this->attrName, $this->attrValue);

    $this->attrName = $this->attrValue = null;

    return $this;
}
00798         
00799         // ATTR_NAME_STATE
/**
 * ATTR_NAME_STATE handler: reads the rest of an attribute name.
 *
 * Entered with the first name character already in $this->attrName.
 *  - a spacer ends the name, WAITING_EQUAL_SIGN_STATE follows;
 *  - '>' stores the attribute and publishes the tag (INITIAL_STATE);
 *  - '=' ends the name, sets the value to '' and switches to
 *    ATTR_VALUE_STATE;
 *  - "/>" marks the tag empty, stores the attribute and publishes the
 *    tag (INITIAL_STATE);
 *  - any other character is appended to the name.
 *
 * @return int next state; FINAL_STATE when input ends inside the name
 */
private function attrNameState()
{
    Assert::isNotNull($this->tag);
    Assert::isTrue($this->tag instanceof SgmlOpenTag);

    Assert::isNotNull($this->attrName); // length == 1
    Assert::isNull($this->attrValue);

    Assert::isNull($this->insideQuote);

    while ($this->char !== null) {

        if (self::isSpacerChar($this->char)) {
            // <tag attr[space]

            $this->getNextChar();

            // call?
            return self::WAITING_EQUAL_SIGN_STATE;
        }

        if ($this->char == '>') {
            // <tag attr>

            $this->dumpAttribute();

            $this->makeTag();

            $this->getNextChar();

            return self::INITIAL_STATE;
        }

        if ($this->char == '=') {
            // <tag id=

            $this->getNextChar();

            // empty string, not null, to be sure that value needed
            $this->attrValue = '';

            // call?
            return self::ATTR_VALUE_STATE;
        }

        $current = $this->char;

        $this->getNextChar();

        if ($current == '/' && $this->char == '>') {
            // <option attr=value checked/>

            $this->tag->setEmpty(true);

            $this->dumpAttribute();

            $this->makeTag();

            $this->getNextChar();

            return self::INITIAL_STATE;
        }

        $this->attrName .= $current;
    }

    // <tag i[eof]

    // NOTE: opera treats it as cdata, firefox does not
    $this->dumpAttribute();

    $this->error('unexpected end of file, incomplete tag stored');

    $this->makeTag();

    return self::FINAL_STATE;
}
00877         
00878         // WAITING_EQUAL_SIGN_STATE
/**
 * WAITING_EQUAL_SIGN_STATE handler: after "attr[space]".
 *
 * Spacers are skipped, then the first other character decides:
 *  - '=' sets the value to '' and switches to ATTR_VALUE_STATE;
 *  - anything else means the attribute has no value: it is stored and
 *    the character is left for INSIDE_TAG_STATE.
 *
 * @return int next state; FINAL_STATE when input ends (the attribute is
 *             stored, an error recorded and the tag published)
 */
private function waitingEqualSignState()
{
    Assert::isNotNull($this->tag);
    Assert::isTrue($this->tag instanceof SgmlOpenTag);
    Assert::isNull($this->tagId);
    Assert::isNotNull($this->attrName);
    Assert::isNull($this->attrValue);

    Assert::isNull($this->insideQuote);

    // <tag attr[space*]
    while (
        $this->char !== null
        && self::isSpacerChar($this->char)
    ) {
        $this->getNextChar();
    }

    if ($this->char === null) {
        // <tag id[space*][eof]

        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    if ($this->char == '=') {

        $this->getNextChar();

        // empty string, not null, to be sure that value needed
        $this->attrValue = '';

        // call?
        return self::ATTR_VALUE_STATE;
    }

    // <tag attr x, <tag attr >

    $this->dumpAttribute();

    return self::INSIDE_TAG_STATE;
}
00925         
00926         // ATTR_VALUE_STATE
00927         private function attrValueState()
00928         {
00929             Assert::isNull($this->tagId);
00930             
00931             Assert::isNotNull($this->tag);
00932             Assert::isTrue($this->tag instanceof SgmlOpenTag);
00933             
00934             while ($this->char !== null) {
00935                 
00936                 if (!$this->insideQuote && self::isSpacerChar($this->char)) {
00937                     $this->getNextChar();
00938                     
00939                     if ($this->attrValue !== null && $this->attrValue !== '') {
00940                         // NOTE: "0" is accepted value
00941                         // <tag id=unquottedValue[space]
00942                         
00943                         $this->dumpAttribute();
00944                         
00945                         return self::INSIDE_TAG_STATE;
00946                     }
00947                     
00948                     // <tag id=[space*]
00949                     continue;
00950                     
00951                 } elseif (!$this->insideQuote && $this->char == '>') {
00952                     // <tag id=value>, <a href=catalog/>
00953                     
00954                     $this->dumpAttribute();
00955                     
00956                     $this->makeTag();
00957                     
00958                     $this->getNextChar();
00959                     
00960                     return self::INITIAL_STATE;
00961                     
00962                 } else {
00963                     if (
00964                         $this->char == '"' || $this->char == "'"
00965                         || $this->char == $this->insideQuote // may be '>'
00966                     ) {
00967                         if (!$this->insideQuote) {
00968                             
00969                             $this->insideQuote = $this->char;
00970                             
00971                             $this->getNextChar();
00972                             
00973                             // a place to rollback if second quote will not be
00974                             // found.
00975                             $this->mark();
00976                             
00977                             continue;
00978                             
00979                         } elseif ($this->char == $this->insideQuote) {
00980                             // attr = "value", attr='value', attr='value>([^']*)
00981                             
00982                             $this->dumpAttribute();
00983                             
00984                             $this->getNextChar();
00985                             
00986                             if ($this->insideQuote == '>') {
00987                                 $this->insideQuote = null;
00988                                 
00989                                 $this->makeTag();
00990                                 
00991                                 return self::INITIAL_STATE;
00992                                 
00993                             } else {
00994                                 $this->insideQuote = null;
00995                                 
00996                                 return self::INSIDE_TAG_STATE;
00997                             }
00998                         }
00999                     }
01000                     
01001                     $this->attrValue .= $this->char;
01002                     
01003                     if ($this->insideQuote && $this->char == '\\')
01004                         $this->attrValue .= $this->getNextChar();
01005                     
01006                     $this->getNextChar();
01007                 }
01008             }
01009             
01010             if ($this->insideQuote) {
01011                 // <tag id="...[eof]
01012                 //
01013                 // NOTE: firefox rolls back to the first > after quote.
01014                 // Opera consideres incomplete tag as cdata.
01015                 // we act as ff does.
01016                 
01017                 $this->reset();
01018                 
01019                 $this->warning(
01020                     "unclosed quoted value for attr == '{$this->attrName}',"
01021                     ." rolling back and searching '>'"
01022                 );
01023                 
01024                 $this->attrValue = null;
01025                 $this->insideQuote = '>';
01026                 
01027                 // call?
01028                 // TODO: possible infinite loop?
01029                 return self::ATTR_VALUE_STATE;
01030             }
01031             
01032             // <tag id=[space*][eof], <tag id=val[eof]
01033             
01034             $this->dumpAttribute();
01035             
01036             $this->error('unexpected end of file, incomplete tag stored');
01037             
01038             $this->makeTag();
01039             
01040             return self::FINAL_STATE;
01041         }
01042         
01043         // INLINE_TAG_STATE:
01044         private function inlineTagState()
01045         {
01046             // <script ...>X<-- we are here
01047             
01048             Assert::isNull($this->buffer);
01049             
01050             Assert::isNull($this->tag);
01051             Assert::isNull($this->tagId);
01052             
01053             $startTag = $this->previousTag->getId();
01054             
01055             if ($this->char === null) {
01056                 $this->error('unexpected eof inside inline tag');
01057                 
01058                 return self::FINAL_STATE;
01059             }
01060             
01061             $this->buffer = null;
01062             
01063             if ($startTag == 'style' || $startTag == 'script') {
01070                 if ($this->skipString('<!--', true))
01071                     $this->buffer = '<!--'.$this->getComment().'-->';
01072             }
01073             
01074             $endTag = '</'.$startTag;
01075             
01076             while ($this->char !== null) {
01077                 $this->buffer .= $this->getContentToSubstring($endTag, true);
01078                 
01079                 if ($this->char === null) {
01080                     // </script not found, or found </script[eof]
01081                     
01082                     break;
01083                     
01084                 } elseif (
01085                     $this->char === '>' || self::isSpacerChar($this->char)
01086                 ) {
01087                     // </script>, </script[space]
01088                     
01089                     $this->dumpBuffer();
01090                     
01091                     $this->tagId = $startTag;
01092                     
01093                     return self::END_TAG_STATE;
01094                 }
01095                 
01096                 // </script[any-other-char]
01097                 
01098                 $this->buffer .= $endTag.$this->char;
01099                 
01100                 $this->getNextChar();
01101             }
01102             
01103             $this->dumpBuffer();
01104             
01105             $this->error(
01106                 "end-tag for inline tag == '{$startTag}' not found"
01107             );
01108             
01109             return self::FINAL_STATE;
01110         }
01111         
01112         // CDATA_STATE
01113         private function cdataState()
01114         {
01115             Assert::isNull($this->tag);
01116             Assert::isNull($this->tagId);
01117             
01118             $content = $this->getContentToSubstring(']]>');
01119             
01120             $this->tag =
01121                 Cdata::create()->
01122                 setData($content)->
01123                 setStrict(true);
01124             
01125             $this->makeTag();
01126             
01127             if (!$this->substringFound) {
01128                 
01129                 $this->error('unexpected end-of-file inside cdata tag');
01130                 
01131                 return self::FINAL_STATE;
01132             }
01133             
01134             return self::INITIAL_STATE;
01135         }
01136         
01137         private function getComment()
01138         {
01139             $this->mark();
01140             
01141             $result = $this->getContentToSubstring('-->');
01142             
01143             if (!$this->substringFound) {
01144                 $this->reset();
01145                 
01146                 $this->error(
01147                     'unexpected end-of-file inside comment tag,'
01148                     ." trying to find '>'"
01149                 );
01150                 
01151                 $result = $this->getContentToSubstring('>');
01152                 
01153                 if (!$this->substringFound)
01154                     $this->error(
01155                         "end-tag '>' not found,"
01156                         .' treating all remaining content as cdata'
01157                     );
01158             }
01159             
01160             return $result;
01161         }
01162         
01163         // COMMENT_STATE
01164         private function commentState()
01165         {
01166             Assert::isNull($this->tag);
01167             Assert::isNull($this->tagId);
01168             
01169             $content = $this->getComment();
01170             
01171             $this->tag =
01172                 SgmlIgnoredTag::comment()->
01173                 setCdata(
01174                     Cdata::create()->setData($content)
01175                 );
01176             
01177             $this->makeTag();
01178             
01179             return self::INITIAL_STATE;
01180         }
01181         
01182         // EXTERNAL_TAG_STATE:
01183         private function externalTagState()
01184         {
01185             Assert::isTrue($this->tag instanceof SgmlIgnoredTag);
01186             
01187             $this->mark();
01188             
01189             $content = $this->getContentToSubstring('?>');
01190             
01191             if (!$this->substringFound) {
01192                 $this->reset();
01193                 
01194                 $this->error(
01195                     'unexpected end-of-file inside external tag,'
01196                     ." trying to find '>'"
01197                 );
01198                 
01199                 $content = $this->getContentToSubstring('>');
01200                 
01201                 if (!$this->substringFound)
01202                     $this->error(
01203                         "end-tag '>' not found,"
01204                         .' treating all remaining content as cdata'
01205                     );
01206             }
01207             
01208             $this->tag->setCdata(Cdata::create()->setData($content));
01209             
01210             $this->makeTag();
01211             
01212             return self::INITIAL_STATE;
01213         }
01214         
01215         // DOCTYPE_TAG_STATE:
01216         private function doctypeTagState()
01217         {
01218             // TODO: use DoctypeTag and parse it correctly as Opera does and
01219             // Firefox does not.
01220             Assert::isTrue($this->tag instanceof SgmlIgnoredTag);
01221             
01222             $content = $this->getContentToSubstring('>');
01223             
01224             if (!$this->substringFound)
01225                 $this->error('unexpected end-of-file inside doctype tag');
01226             
01227             $this->tag->setCdata(Cdata::create()->setData($content));
01228             
01229             $this->makeTag();
01230             
01231             return self::INITIAL_STATE;
01232         }
01233         
01239         private function getContentToSubstring($substring, $ignoreCase = false)
01240         {
01241             $this->substringFound = false;
01242             
01243             $substringLength = strlen($substring);
01244             
01245             $prefixTable = array(1 => 0);
01246             $buffer = $substring."\x00";
01247             $i = 0;
01248             
01249             while ($this->char !== null) {
01250                 
01251                 if ($i < $substringLength)
01252                     $char = $buffer[$i + 1];
01253                 else {
01254                     $char = $this->char;
01255                     $buffer .= $char;
01256                     $this->getNextChar();
01257                 }
01258                 
01259                 $maxLength = $prefixTable[$i + 1];
01260                 
01261                 $char = self::optionalLowercase($char, $ignoreCase);
01262                 
01263                 while (
01264                     self::optionalLowercase($buffer[$maxLength], $ignoreCase)
01265                         !== $char
01266                     && $maxLength > 0
01267                 ) {
01268                     $maxLength = $prefixTable[$maxLength];
01269                 }
01270                 
01271                 ++$i;
01272                 
01273                 $prefixTable[$i + 1] =
01274                     (
01275                         self::optionalLowercase($buffer[$maxLength], $ignoreCase)
01276                             === $char
01277                     )
01278                         ? $maxLength + 1
01279                         : 0;
01280                 
01281                 if (
01282                     $i > $substringLength + 1
01283                     && $prefixTable[$i + 1] == $substringLength
01284                 ) {
01285                     $this->substringFound = true;
01286                     
01287                     break;
01288                 }
01289             }
01290             
01291             if (!$this->substringFound)
01292                 return substr(
01293                     $buffer, $substringLength + 1
01294                 );
01295             else
01296                 return substr(
01297                     $buffer, $substringLength + 1, $i - 2 * $substringLength
01298                 );
01299         }
01300         
01301         private function getTextualPosition()
01302         {
01303             return
01304                 "line {$this->line}, position {$this->linePosition}"
01305                 .(
01306                     $this->tag && $this->tag->getId()
01307                         ? ", in tag '{$this->tag->getId()}'"
01308                         : null
01309                 );
01310         }
01311         
01315         private function warning($message)
01316         {
01317             $this->errors[] =
01318                 "warning at {$this->getTextualPosition()}: $message";
01319             
01320             return $this;
01321         }
01322         
01326         private function error($message)
01327         {
01328             $this->errors[] =
01329                 "error at {$this->getTextualPosition()}: $message";
01330             
01331             return $this;
01332         }
01333     }
01334 ?>