00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
/**
 * Fault-tolerant, streaming tokenizer for HTML/SGML-like markup.
 *
 * The tokenizer pulls characters one at a time from an InputStream and, on
 * every nextToken() call, drives a small state machine until exactly one
 * token is complete. Tokens are Cdata, SgmlOpenTag, SgmlEndTag or
 * SgmlIgnoredTag (comments, doctype, processing instructions) instances.
 *
 * Malformed markup does not abort tokenizing: problems are recorded as
 * human-readable messages (see getErrors()) and a best-effort token is
 * produced instead.
 */
final class HtmlTokenizer
{
    // States of the state machine; dispatched by handleState().
    const INITIAL_STATE = 1;
    const START_TAG_STATE = 2;
    const END_TAG_STATE = 3;
    const INSIDE_TAG_STATE = 4;
    const ATTR_NAME_STATE = 5;
    const WAITING_EQUAL_SIGN_STATE = 6;
    const ATTR_VALUE_STATE = 7;

    const CDATA_STATE = 8;
    const COMMENT_STATE = 9;
    // Declared but never dispatched: the content of inline tags is handled
    // from INITIAL_STATE by looking at the previously returned token.
    const INLINE_TAG_STATE = 10;
    const EXTERNAL_TAG_STATE = 11;
    const DOCTYPE_TAG_STATE = 12;

    const FINAL_STATE = 42;

    // PCRE character classes (without delimiters) used by the static helpers.
    const SPACER_MASK = '[ \r\n\t]';
    const ID_FIRST_CHAR_MASK = '[A-Za-z]';
    const ID_CHAR_MASK = '[-_:.A-Za-z0-9]';

    // Lower-case ids of tags whose content is passed through as raw cdata.
    private $inlineTags = array('style', 'script', 'textarea');

    // Source of characters; must provide read(), mark() and reset().
    private $stream = null;

    // Current (already read, not yet processed) character; null means EOF.
    private $char = null;

    // Position bookkeeping, used only for diagnostics.
    private $line = 1;
    private $linePosition = 1;
    private $previousChar = null;

    // Snapshot of (char, previousChar, line, linePosition) taken by mark().
    private $mark = null;

    private $state = self::INITIAL_STATE;

    // Every token produced so far, and every warning/error message.
    private $tags = array();
    private $errors = array();

    // Character data collected outside of tags, not yet turned into Cdata.
    private $buffer = null;

    // Tag id being collected before the tag object exists.
    private $tagId = null;

    // Tag under construction / token finished by the current nextToken()
    // call / token returned by the previous nextToken() call.
    private $tag = null;
    private $completeTag = null;
    private $previousTag = null;

    // Attribute under construction. $insideQuote holds the opening quote
    // character, or '>' while recovering from an unclosed quoted value.
    private $attrName = null;
    private $attrValue = null;
    private $insideQuote = null;

    // Result flag of the last getContentToSubstring() call.
    private $substringFound = false;

    private $suppressWhitespaces = false;
    private $lowercaseAttributes = false;
    private $lowercaseTags = false;

    /**
     * Binds the tokenizer to a stream and reads the first character.
     *
     * @param InputStream $stream character source
     */
    public function __construct(InputStream $stream)
    {
        $this->stream = $stream;

        $this->getNextChar();
    }

    /**
     * Static factory, equivalent to the constructor.
     *
     * @param InputStream $stream character source
     * @return HtmlTokenizer
     */
    public static function create(InputStream $stream)
    {
        return new self($stream);
    }

    /**
     * When enabled, every Cdata token is passed through removeWhitespaces()
     * and dropped if that call returns null.
     *
     * @param bool $isSuppressWhitespaces
     * @return HtmlTokenizer this instance
     */
    public function suppressWhitespaces($isSuppressWhitespaces)
    {
        Assert::isBoolean($isSuppressWhitespaces);

        $this->suppressWhitespaces = $isSuppressWhitespaces;

        return $this;
    }

    /**
     * Stores the "lowercase attributes" option.
     *
     * NOTE(review): nothing in this class reads the option; dumpAttribute()
     * lower-cases every valid attribute name unconditionally.
     *
     * @param bool $isLowercaseAttributes
     * @return HtmlTokenizer this instance
     */
    public function lowercaseAttributes($isLowercaseAttributes)
    {
        Assert::isBoolean($isLowercaseAttributes);

        $this->lowercaseAttributes = $isLowercaseAttributes;

        return $this;
    }

    /**
     * When enabled, valid open-tag ids and all end-tag ids are lower-cased.
     *
     * @param bool $isLowercaseTags
     * @return HtmlTokenizer this instance
     */
    public function lowercaseTags($isLowercaseTags)
    {
        Assert::isBoolean($isLowercaseTags);

        $this->lowercaseTags = $isLowercaseTags;

        return $this;
    }

    /**
     * Produces the next token.
     *
     * @return mixed the next token object, or null once the input is
     *               exhausted
     * @throws WrongStateException if the machine stopped before the end of
     *                             the input
     */
    public function nextToken()
    {
        if ($this->state == self::FINAL_STATE) {
            return null;
        }

        $this->completeTag = null;

        // makeTag() sets completeTag, which ends this loop.
        while ($this->state != self::FINAL_STATE && !$this->completeTag) {
            $this->state = $this->handleState();
        }

        if ($this->state == self::FINAL_STATE && $this->char !== null) {
            throw new WrongStateException('state machine is broken');
        }

        $this->previousTag = $this->completeTag;

        return $this->completeTag;
    }

    /**
     * @return array all warning and error messages recorded so far
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Tells whether a single character may start an id.
     *
     * @param string|null $char single character; null (EOF) yields false
     * @return bool
     */
    public static function isIdFirstChar($char)
    {
        // Explicit null guard: passing null to preg_match() is deprecated.
        return $char !== null
            && preg_match('/'.self::ID_FIRST_CHAR_MASK.'/', $char) > 0;
    }

    /**
     * Tells whether a single character may appear inside an id.
     *
     * @param string|null $char single character; null (EOF) yields false
     * @return bool
     */
    public static function isIdChar($char)
    {
        return $char !== null
            && preg_match('/'.self::ID_CHAR_MASK.'/', $char) > 0;
    }

    /**
     * Tells whether the whole string is a well-formed tag/attribute id.
     *
     * @param string|null $id
     * @return bool
     */
    public static function isValidId($id)
    {
        if ($id === null) {
            return false;
        }

        // The D modifier keeps '$' from matching before a trailing newline,
        // which would otherwise accept ids such as "a\n".
        $matches = preg_match(
            '/^'.self::ID_FIRST_CHAR_MASK.self::ID_CHAR_MASK.'*$/D',
            $id
        );

        return ($matches > 0);
    }

    /**
     * Tells whether a single character is a space, CR, LF or tab.
     *
     * @param string|null $char single character; null (EOF) yields false
     * @return bool
     */
    public static function isSpacerChar($char)
    {
        return $char !== null
            && preg_match('/'.self::SPACER_MASK.'/', $char) > 0;
    }

    /**
     * Collapses the leading and the trailing run of spacer characters of a
     * Cdata to a single space each, modifying the Cdata in place.
     *
     * NOTE(review): because runs are replaced by ' ' rather than removed, a
     * whitespace-only Cdata becomes ' ' and is kept; null is returned only
     * when the data is empty to begin with. Confirm this is intended.
     *
     * @param Cdata $cdata
     * @return Cdata|null the same Cdata, or null if its data is empty
     */
    public static function removeWhitespaces(Cdata $cdata)
    {
        $string = $cdata->getData();

        $string = preg_replace(
            '/^'.self::SPACER_MASK.'+/',
            ' ',
            $string
        );

        $string = preg_replace(
            '/'.self::SPACER_MASK.'+$/',
            ' ',
            $string
        );

        if ($string === '' || $string === null) {
            return null;
        }

        $cdata->setData($string);

        return $cdata;
    }

    /**
     * Tells whether the content of the given tag is raw text (style, script,
     * textarea). Tag names are compared case-insensitively and strictly, so
     * 'SCRIPT' matches and non-string ids never match by type juggling.
     *
     * @param string|null $id tag id
     * @return bool
     */
    public function isInlineTag($id)
    {
        return in_array(strtolower((string) $id), $this->inlineTags, true);
    }

    /**
     * @param string $string
     * @param bool $ignoreCase whether to lower-case the string
     * @return string $string itself, or its lower-cased copy
     */
    private static function optionalLowercase($string, $ignoreCase)
    {
        if (!$ignoreCase) {
            return $string;
        }

        return strtolower($string);
    }

    /**
     * Reads the next character into $this->char and updates the line and
     * column counters. "\r", "\n" and "\r\n" each count as one line break.
     *
     * @return string|null the character read, or null at end of input
     */
    private function getNextChar()
    {
        $this->char = $this->stream->read(1);

        if ($this->char === null) {
            return null;
        }

        if ($this->char == "\n" && $this->previousChar == "\r") {
            // Second half of a CRLF pair: the line was already counted on
            // "\r"; keep the column reset so CRLF and LF input report the
            // same positions.
            $this->linePosition = 1;
        } elseif ($this->char == "\n" || $this->char == "\r") {
            ++$this->line;
            $this->linePosition = 1;
        } else {
            ++$this->linePosition;
        }

        $this->previousChar = $this->char;

        return $this->char;
    }

    /**
     * Consumes up to $count characters, starting with the current one.
     *
     * @param int $count
     * @return string|null the consumed characters; null if none were consumed
     */
    private function getChars($count)
    {
        $result = null;

        while ($this->char !== null && $count > 0) {
            $result .= $this->char;

            $this->getNextChar();

            --$count;
        }

        return $result;
    }

    /**
     * Remembers the current position (tokenizer counters and stream) so
     * that reset() can return to it.
     *
     * @return HtmlTokenizer this instance
     */
    private function mark()
    {
        $this->mark = array(
            $this->char, $this->previousChar,
            $this->line, $this->linePosition
        );

        $this->stream->mark();

        return $this;
    }

    /**
     * Returns to the position remembered by the last mark() call.
     *
     * @return HtmlTokenizer this instance
     */
    private function reset()
    {
        Assert::isNotNull($this->mark);

        list (
            $this->char, $this->previousChar,
            $this->line, $this->linePosition
        ) = $this->mark;

        $this->stream->reset();

        return $this;
    }

    /**
     * Advances the input by $count characters.
     *
     * @param int $count
     * @return HtmlTokenizer this instance
     */
    private function skip($count)
    {
        for ($i = 0; $i < $count; ++$i) {
            $this->getNextChar();
        }

        return $this;
    }

    /**
     * Peeks at the $count characters that follow the current character
     * ($this->char itself is not part of the result).
     *
     * NOTE(review): this uses the stream's own mark, so it presumably
     * invalidates a position stored earlier through mark(); confirm before
     * combining the two.
     *
     * @param int $count
     * @return mixed whatever the stream's read() returns
     */
    private function lookAhead($count)
    {
        $this->stream->mark();

        $result = $this->stream->read($count);

        $this->stream->reset();

        return $result;
    }

    /**
     * Consumes $string if the input continues with exactly that string.
     * On a mismatch nothing is consumed (skipped spaces included).
     *
     * @param string $string expected text
     * @param bool $skipSpaces whether to skip leading spacer characters
     * @return bool whether the string was found and consumed
     */
    private function skipString($string, $skipSpaces = false)
    {
        $this->mark();

        if ($skipSpaces) {
            while (
                $this->char !== null
                && self::isSpacerChar($this->char)
            ) {
                $this->getNextChar();
            }
        }

        $length = strlen($string);

        if ($this->getChars($length) === $string) {
            return true;
        }

        $this->reset();

        return false;
    }

    /**
     * Publishes the tag under construction as the completed token, unless
     * it is a Cdata dropped by whitespace suppression, and clears the
     * construction slots.
     *
     * @return HtmlTokenizer this instance
     */
    private function makeTag()
    {
        Assert::isNotNull($this->tag);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        if (
            !$this->suppressWhitespaces
            || !$this->tag instanceof Cdata
            || (self::removeWhitespaces($this->tag) !== null)
        ) {
            $this->tags[] = $this->completeTag = $this->tag;
        }

        $this->tagId = $this->tag = null;

        return $this;
    }

    /**
     * Makes $tag the tag under construction, giving it the collected id.
     *
     * @param SgmlTag $tag fresh tag object
     * @return SgmlTag whatever the tag's setId() returns
     */
    private function setupTag(SgmlTag $tag)
    {
        Assert::isNull($this->tag);
        Assert::isNotNull($this->tagId);

        $this->tag = $tag->setId($this->tagId);

        $this->tagId = null;

        return $this->tag;
    }

    /**
     * Runs the handler of the current state.
     *
     * @return int the next state
     * @throws WrongStateException for a state without a handler
     */
    private function handleState()
    {
        switch ($this->state) {
            case self::INITIAL_STATE:
                // Right after <style>, <script> or <textarea> the content
                // is raw text, not markup.
                if (
                    $this->previousTag instanceof SgmlOpenTag
                    && $this->isInlineTag($this->previousTag->getId())
                ) {
                    return $this->inlineTagState();
                }

                return $this->outsideTagState();

            case self::START_TAG_STATE:
                return $this->startTagState();

            case self::END_TAG_STATE:
                return $this->endTagState();

            case self::INSIDE_TAG_STATE:
                return $this->insideTagState();

            case self::ATTR_NAME_STATE:
                return $this->attrNameState();

            case self::WAITING_EQUAL_SIGN_STATE:
                return $this->waitingEqualSignState();

            case self::ATTR_VALUE_STATE:
                return $this->attrValueState();

            case self::CDATA_STATE:
                return $this->cdataState();

            case self::COMMENT_STATE:
                return $this->commentState();

            case self::EXTERNAL_TAG_STATE:
                return $this->externalTagState();

            case self::DOCTYPE_TAG_STATE:
                return $this->doctypeTagState();
        }

        throw new WrongStateException('state machine is broken');
    }

    /**
     * Turns the collected character data, if any, into a Cdata token.
     *
     * @return HtmlTokenizer this instance
     */
    private function dumpBuffer()
    {
        if ($this->buffer !== null) {
            $this->tag = Cdata::create()->setData($this->buffer);

            $this->buffer = null;

            $this->makeTag();
        }

        return $this;
    }

    /**
     * Recognizes and consumes the openers "![CDATA[" and "!--" (the '<'
     * has already been consumed).
     *
     * @return int|null the state to switch to, or null if neither matched
     */
    private function checkSpecialTagState()
    {
        if ($this->char != '!') {
            return null;
        }

        $specialStartTags = array(
            '![CDATA[' => self::CDATA_STATE,
            '!--' => self::COMMENT_STATE
        );

        foreach ($specialStartTags as $tag => $state) {
            if ($this->skipString($tag)) {
                return $state;
            }
        }

        return null;
    }

    /**
     * INITIAL_STATE handler: collects character data until a tag starts.
     *
     * @return int the next state
     */
    private function outsideTagState()
    {
        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {
            if ($this->char != '<') {
                $this->buffer .= $this->char;
                $this->getNextChar();
            } else {
                $this->getNextChar();

                if (
                    self::isIdFirstChar($this->char)
                    || $this->char == '?' || $this->char == '!'
                ) {
                    // A tag, processing instruction, comment, cdata section
                    // or doctype begins; flush the text collected so far.
                    $this->dumpBuffer();

                    $specialTagState = $this->checkSpecialTagState();

                    if ($specialTagState !== null) {
                        return $specialTagState;
                    }

                    $this->tagId = $this->char;

                    $this->getNextChar();

                    return self::START_TAG_STATE;
                } elseif ($this->char == '/') {
                    // "</": an end-tag begins.
                    $this->dumpBuffer();

                    $this->getNextChar();

                    return self::END_TAG_STATE;
                } else {
                    // A lone '<' is kept as text.
                    $this->warning(
                        'incorrect start-tag, treating it as cdata'
                    );

                    $this->buffer .= '<'.$this->char;

                    $this->getNextChar();

                    continue;
                }

                Assert::isUnreachable();
            }
        }

        $this->dumpBuffer();

        return self::FINAL_STATE;
    }

    /**
     * Validates (and optionally lower-cases) the collected tag id and
     * starts an open tag with it.
     *
     * @return SgmlOpenTag the tag under construction
     */
    private function createOpenTag()
    {
        if (!self::isValidId($this->tagId)) {
            $this->error("tag id '{$this->tagId}' is invalid");
        } elseif ($this->lowercaseTags) {
            $this->tagId = strtolower($this->tagId);
        }

        return $this->setupTag(SgmlOpenTag::create());
    }

    /**
     * START_TAG_STATE handler: collects the id of an opening tag.
     *
     * @return int the next state
     */
    private function startTagState()
    {
        Assert::isNull($this->tag);
        Assert::isNotNull($this->tagId);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {
            if ($this->char == '>') {
                // "<b>": tag without attributes.
                $this->createOpenTag();

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            } elseif (self::isSpacerChar($this->char)) {
                // The id is complete; decide what kind of tag it names.
                // "<?xml " is treated as an ordinary open tag.
                $externalTag =
                    ($this->tagId[0] == '?')
                    && ($this->tagId != '?xml');

                $doctypeTag = (strtoupper($this->tagId) == '!DOCTYPE');

                if ($externalTag) {
                    $this->setupTag(
                        SgmlIgnoredTag::create()->
                            setEndMark('?')
                    );

                    // The spacer is left in the input on purpose.
                    return self::EXTERNAL_TAG_STATE;
                } elseif ($doctypeTag) {
                    $this->setupTag(SgmlIgnoredTag::create());

                    return self::DOCTYPE_TAG_STATE;
                }

                $this->createOpenTag();

                $this->getNextChar();

                return self::INSIDE_TAG_STATE;
            } else {
                $char = $this->char;

                $this->getNextChar();

                if ($char == '/' && $this->char == '>') {
                    // "<br/>": empty tag without attributes.
                    $this->createOpenTag()->setEmpty(true);

                    $this->makeTag();

                    $this->getNextChar();

                    return self::INITIAL_STATE;
                }

                $this->tagId .= $char;
            }
        }

        // End of input in the middle of the tag id.
        $this->error('unexpected end of file, tag id is incomplete');

        $this->createOpenTag();

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * Builds and publishes an end-tag from the collected id, reporting an
     * empty or invalid id.
     *
     * @return HtmlTokenizer this instance
     */
    private function dumpEndTag()
    {
        if (!$this->tagId) {
            $this->warning('empty end-tag, storing with empty id');
        } elseif (!self::isValidId($this->tagId)) {
            $this->error("end-tag id '{$this->tagId}' is invalid");
        }

        $this->tag = SgmlEndTag::create()->
            setId(
                self::optionalLowercase($this->tagId, $this->lowercaseTags)
            );

        $this->makeTag();

        return $this;
    }

    /**
     * END_TAG_STATE handler: collects the id of an end-tag up to '>'.
     * Everything between the first spacer and '>' is discarded.
     *
     * @return int the next state
     */
    private function endTagState()
    {
        Assert::isNull($this->tag);

        // A pre-set id is only legal when coming from inlineTagState(),
        // which stops right before '>' or a spacer.
        Assert::isTrue(
            $this->tagId === null
            || $this->char == '>'
            || self::isSpacerChar($this->char)
        );

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        $eatingGarbage = false;

        while ($this->char !== null) {
            if ($this->char == '>') {
                $this->dumpEndTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            } elseif ($eatingGarbage) {
                $this->getNextChar();

                continue;
            } elseif (self::isSpacerChar($this->char)) {
                // The id ended; ignore the rest up to '>'.
                $eatingGarbage = true;

                $this->getNextChar();

                continue;
            }

            $this->tagId .= $this->char;

            $this->getNextChar();
        }

        // End of input before '>'.
        $this->error("unexpected end of file, end-tag is incomplete");

        $this->dumpEndTag();

        return self::FINAL_STATE;
    }

    /**
     * INSIDE_TAG_STATE handler: between attributes of an open tag.
     *
     * @return int the next state
     */
    private function insideTagState()
    {
        Assert::isNull($this->tagId);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {
            if (self::isSpacerChar($this->char)) {
                $this->getNextChar();
            } elseif ($this->char == '>') {
                // The tag is closed.
                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            } elseif ($this->char == '=') {
                // "<a =value": a value without a name.
                $this->error(
                    'unexpected equal sign, attr name considered empty'
                );

                $this->getNextChar();

                return self::ATTR_VALUE_STATE;
            } else {
                $char = $this->char;

                $this->getNextChar();

                if ($char == '/' && $this->char == '>') {
                    // "<a ... />": empty tag.
                    $this->tag->setEmpty(true);

                    $this->makeTag();

                    $this->getNextChar();

                    return self::INITIAL_STATE;
                }

                // First character of an attribute name.
                $this->attrName = $char;

                return self::ATTR_NAME_STATE;
            }
        }

        // End of input inside the tag.
        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * Stores the collected attribute on the tag under construction and
     * clears the attribute slots. Valid names are lower-cased; invalid
     * names and empty values are reported.
     *
     * NOTE(review): the lowercaseAttributes option is not consulted here;
     * lower-casing is unconditional. Left as is to keep the output stable.
     *
     * @return HtmlTokenizer this instance
     */
    private function dumpAttribute()
    {
        if ($this->attrName) {
            if (!self::isValidId($this->attrName)) {
                $this->error("attribute name '{$this->attrName}' is invalid");
            } else {
                $this->attrName = strtolower($this->attrName);
            }
        }

        if ($this->attrValue === null || $this->attrValue === '') {
            $this->warning("empty value for attr == '{$this->attrName}'");
        }

        $this->tag->setAttribute($this->attrName, $this->attrValue);

        $this->attrName = $this->attrValue = null;

        return $this;
    }

    /**
     * ATTR_NAME_STATE handler: collects an attribute name.
     *
     * @return int the next state
     */
    private function attrNameState()
    {
        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);

        Assert::isNotNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {
            if (self::isSpacerChar($this->char)) {
                // The name ended; '=' may still follow after spaces.
                $this->getNextChar();

                return self::WAITING_EQUAL_SIGN_STATE;
            } elseif ($this->char == '>') {
                // "<a name>": attribute without a value closes the tag.
                $this->dumpAttribute();

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            } elseif ($this->char == '=') {
                $this->getNextChar();

                // An empty string (not null) marks "value expected".
                $this->attrValue = '';

                return self::ATTR_VALUE_STATE;
            } else {
                $char = $this->char;

                $this->getNextChar();

                if ($char == '/' && $this->char == '>') {
                    // "<a name/>": empty tag.
                    $this->tag->setEmpty(true);

                    $this->dumpAttribute();

                    $this->makeTag();

                    $this->getNextChar();

                    return self::INITIAL_STATE;
                }

                $this->attrName .= $char;
            }
        }

        // End of input inside the attribute name.
        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * WAITING_EQUAL_SIGN_STATE handler: after an attribute name and at
     * least one spacer, decides whether a value follows.
     *
     * @return int the next state
     */
    private function waitingEqualSignState()
    {
        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);
        Assert::isNull($this->tagId);
        Assert::isNotNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {
            if (self::isSpacerChar($this->char)) {
                $this->getNextChar();
            } elseif ($this->char == '=') {
                $this->getNextChar();

                $this->attrValue = '';

                return self::ATTR_VALUE_STATE;
            } else {
                // Anything else starts the next attribute (or ends the
                // tag), so the current attribute has no value.
                $this->dumpAttribute();

                return self::INSIDE_TAG_STATE;
            }
        }

        // End of input while waiting for '='.
        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * ATTR_VALUE_STATE handler: collects a quoted or unquoted value.
     *
     * If a quoted value is never closed, the input is rolled back to the
     * character after the opening quote and scanned once more with '>' as
     * the terminator. If that second scan also reaches the end of input,
     * the tag is stored as is (the rollback is never repeated, so the
     * machine always terminates).
     *
     * @return int the next state
     */
    private function attrValueState()
    {
        Assert::isNull($this->tagId);

        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);

        while ($this->char !== null) {
            if (!$this->insideQuote && self::isSpacerChar($this->char)) {
                $this->getNextChar();

                if ($this->attrValue !== null && $this->attrValue !== '') {
                    // A spacer ends a non-empty unquoted value.
                    $this->dumpAttribute();

                    return self::INSIDE_TAG_STATE;
                }

                // Spacers before the value are skipped.
                continue;
            } elseif (!$this->insideQuote && $this->char == '>') {
                // '>' ends an unquoted value and the tag.
                $this->dumpAttribute();

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            } else {
                if (
                    $this->char == '"' || $this->char == "'"
                    || $this->char == $this->insideQuote
                ) {
                    if (!$this->insideQuote) {
                        $this->insideQuote = $this->char;

                        $this->getNextChar();

                        // Rollback point for an unclosed quote.
                        $this->mark();

                        continue;
                    } elseif ($this->char == $this->insideQuote) {
                        // Closing quote (or '>' in recovery mode).
                        $this->dumpAttribute();

                        $this->getNextChar();

                        if ($this->insideQuote == '>') {
                            $this->insideQuote = null;

                            $this->makeTag();

                            return self::INITIAL_STATE;
                        } else {
                            $this->insideQuote = null;

                            return self::INSIDE_TAG_STATE;
                        }
                    }
                }

                $this->attrValue .= $this->char;

                // Inside quotes a backslash takes the next character
                // verbatim, so an escaped quote does not end the value.
                if ($this->insideQuote && $this->char == '\\') {
                    $this->attrValue .= $this->getNextChar();
                }

                $this->getNextChar();
            }
        }

        if ($this->insideQuote && $this->insideQuote != '>') {
            // Unclosed quote: go back and look for '>' instead.
            $this->reset();

            $this->warning(
                "unclosed quoted value for attr == '{$this->attrName}',"
                ." rolling back and searching '>'"
            );

            $this->attrValue = null;
            $this->insideQuote = '>';

            return self::ATTR_VALUE_STATE;
        }

        // End of input in an unquoted value, or the recovery scan found no
        // '>' either. Leaving recovery mode here is what prevents an
        // endless rollback loop.
        $this->insideQuote = null;

        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * Handles the raw content that follows an inline open tag (style,
     * script, textarea): everything up to the matching end-tag, compared
     * case-insensitively, becomes one Cdata token.
     *
     * @return int the next state
     */
    private function inlineTagState()
    {
        Assert::isNull($this->buffer);

        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        $startTag = $this->previousTag->getId();

        if ($this->char === null) {
            $this->error('unexpected eof inside inline tag');

            return self::FINAL_STATE;
        }

        $this->buffer = null;

        $startTagLower = strtolower($startTag);

        if ($startTagLower == 'style' || $startTagLower == 'script') {
            // A leading "<!-- ... -->" wrapper is taken as a whole, so an
            // end-tag look-alike inside it is not mistaken for the end.
            if ($this->skipString('<!--', true)) {
                $this->buffer = '<!--'.$this->getComment().'-->';
            }
        }

        $endTag = '</'.$startTag;

        while ($this->char !== null) {
            $this->buffer .= $this->getContentToSubstring($endTag, true);

            if ($this->char === null) {
                break;
            } elseif (
                $this->char === '>' || self::isSpacerChar($this->char)
            ) {
                // A real end-tag: hand over to the end-tag handler with
                // the id already known.
                $this->dumpBuffer();

                $this->tagId = $startTag;

                return self::END_TAG_STATE;
            }

            // Only a prefix of a longer name; keep it as content.
            $this->buffer .= $endTag.$this->char;

            $this->getNextChar();
        }

        $this->dumpBuffer();

        $this->error(
            "end-tag for inline tag == '{$startTag}' not found"
        );

        return self::FINAL_STATE;
    }

    /**
     * CDATA_STATE handler: content of "<![CDATA[ ... ]]>" as strict Cdata.
     *
     * @return int the next state
     */
    private function cdataState()
    {
        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        $content = $this->getContentToSubstring(']]>');

        $this->tag =
            Cdata::create()->
                setData($content)->
                setStrict(true);

        $this->makeTag();

        if (!$this->substringFound) {
            $this->error('unexpected end-of-file inside cdata tag');

            return self::FINAL_STATE;
        }

        return self::INITIAL_STATE;
    }

    /**
     * Reads a comment body up to "-->". If that terminator is missing, the
     * input is rolled back and the body is read up to the first '>'.
     *
     * @return string the comment body without the terminator
     */
    private function getComment()
    {
        $this->mark();

        $result = $this->getContentToSubstring('-->');

        if (!$this->substringFound) {
            $this->reset();

            $this->error(
                'unexpected end-of-file inside comment tag,'
                ." trying to find '>'"
            );

            $result = $this->getContentToSubstring('>');

            if (!$this->substringFound) {
                $this->error(
                    "end-tag '>' not found,"
                    .' treating all remaining content as cdata'
                );
            }
        }

        return $result;
    }

    /**
     * COMMENT_STATE handler: produces a comment token.
     *
     * @return int the next state
     */
    private function commentState()
    {
        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        $content = $this->getComment();

        $this->tag =
            SgmlIgnoredTag::comment()->
                setCdata(
                    Cdata::create()->setData($content)
                );

        $this->makeTag();

        return self::INITIAL_STATE;
    }

    /**
     * EXTERNAL_TAG_STATE handler: body of "<?name ... ?>", with a fallback
     * to the first '>' when "?>" is missing.
     *
     * @return int the next state
     */
    private function externalTagState()
    {
        Assert::isTrue($this->tag instanceof SgmlIgnoredTag);

        $this->mark();

        $content = $this->getContentToSubstring('?>');

        if (!$this->substringFound) {
            $this->reset();

            $this->error(
                'unexpected end-of-file inside external tag,'
                ." trying to find '>'"
            );

            $content = $this->getContentToSubstring('>');

            if (!$this->substringFound) {
                $this->error(
                    "end-tag '>' not found,"
                    .' treating all remaining content as cdata'
                );
            }
        }

        $this->tag->setCdata(Cdata::create()->setData($content));

        $this->makeTag();

        return self::INITIAL_STATE;
    }

    /**
     * DOCTYPE_TAG_STATE handler: body of "<!DOCTYPE ... >".
     *
     * @return int the next state
     */
    private function doctypeTagState()
    {
        Assert::isTrue($this->tag instanceof SgmlIgnoredTag);

        $content = $this->getContentToSubstring('>');

        if (!$this->substringFound) {
            $this->error('unexpected end-of-file inside doctype tag');
        }

        $this->tag->setCdata(Cdata::create()->setData($content));

        $this->makeTag();

        return self::INITIAL_STATE;
    }

    /**
     * Consumes input up to and including the first occurrence of
     * $substring and returns what preceded it. $this->substringFound tells
     * whether the terminator was seen; if not, all remaining input is
     * consumed and returned.
     *
     * The search computes the Knuth-Morris-Pratt prefix function over
     * needle . "\x00" . consumed text, so each input character is read
     * once. (Assumes the needle itself contains no "\x00" byte.)
     *
     * @param string $substring terminator to look for
     * @param bool $ignoreCase compare with both sides lower-cased
     * @return string the content before the terminator
     */
    private function getContentToSubstring($substring, $ignoreCase = false)
    {
        $this->substringFound = false;

        $substringLength = strlen($substring);

        // $prefixTable[$k] is the prefix-function value for the first $k
        // characters of $buffer.
        $prefixTable = array(1 => 0);
        $buffer = $substring."\x00";
        $i = 0;

        while ($this->char !== null) {
            if ($i < $substringLength) {
                // Still pre-processing the needle and its separator.
                $char = $buffer[$i + 1];
            } else {
                $char = $this->char;
                $buffer .= $char;
                $this->getNextChar();
            }

            $maxLength = $prefixTable[$i + 1];

            $char = self::optionalLowercase($char, $ignoreCase);

            while (
                self::optionalLowercase($buffer[$maxLength], $ignoreCase)
                    !== $char
                && $maxLength > 0
            ) {
                $maxLength = $prefixTable[$maxLength];
            }

            ++$i;

            $prefixTable[$i + 1] =
                (
                    self::optionalLowercase($buffer[$maxLength], $ignoreCase)
                        === $char
                )
                    ? $maxLength + 1
                    : 0;

            // A full-length value counts once at least one input character
            // has been consumed ($i > needle length); this also catches a
            // one-character needle matching the very first input character.
            if (
                $i > $substringLength
                && $prefixTable[$i + 1] == $substringLength
            ) {
                $this->substringFound = true;

                break;
            }
        }

        if (!$this->substringFound) {
            return substr(
                $buffer, $substringLength + 1
            );
        }

        // Strip the needle, the separator and the matched terminator.
        return substr(
            $buffer, $substringLength + 1, $i - 2 * $substringLength
        );
    }

    /**
     * @return string the current position (and tag id, if a tag with an id
     *                is under construction) for diagnostics
     */
    private function getTextualPosition()
    {
        return
            "line {$this->line}, position {$this->linePosition}"
            .(
                $this->tag && $this->tag->getId()
                    ? ", in tag '{$this->tag->getId()}'"
                    : null
            );
    }

    /**
     * Records a warning message.
     *
     * @param string $message
     * @return HtmlTokenizer this instance
     */
    private function warning($message)
    {
        $this->errors[] =
            "warning at {$this->getTextualPosition()}: $message";

        return $this;
    }

    /**
     * Records an error message.
     *
     * @param string $message
     * @return HtmlTokenizer this instance
     */
    private function error($message)
    {
        $this->errors[] =
            "error at {$this->getTextualPosition()}: $message";

        return $this;
    }
}
01334 ?>