00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
/**
 * Pull-style HTML/SGML tokenizer built as a hand-written state machine.
 *
 * Characters are read one at a time from an InputStream. Every call to
 * nextToken() runs the machine until one complete token (open tag, end tag,
 * ignored tag or character data) has been assembled. Malformed input never
 * aborts tokenization: problems are recorded as human-readable strings that
 * can be fetched with getErrors().
 */
final class HtmlTokenizer
{
    // States of the tokenizer's state machine.
    const INITIAL_STATE = 1;
    const START_TAG_STATE = 2;
    const END_TAG_STATE = 3;
    const INSIDE_TAG_STATE = 4;
    const ATTR_NAME_STATE = 5;
    const WAITING_EQUAL_SIGN_STATE = 6;
    const ATTR_VALUE_STATE = 7;

    const CDATA_STATE = 8;
    const COMMENT_STATE = 9;
    const INLINE_TAG_STATE = 10;
    const EXTERNAL_TAG_STATE = 11;
    const DOCTYPE_TAG_STATE = 12;

    const FINAL_STATE = 42;

    // Regular-expression character classes used by the static helpers.
    const SPACER_MASK = '[ \r\n\t]';
    const ID_FIRST_CHAR_MASK = '[A-Za-z]';
    const ID_CHAR_MASK = '[-_:.A-Za-z0-9]';

    // Tags whose content is read verbatim up to the matching end-tag.
    private $inlineTags = array('style', 'script', 'textarea');

    private $stream = null;

    // Current (not yet consumed) character; null means end of input.
    private $char = null;

    // Position bookkeeping used only for diagnostics.
    private $line = 1;
    private $linePosition = 1;
    private $previousChar = null;

    // Saved array(char, previousChar, line, linePosition), see mark()/reset().
    private $mark = null;

    private $state = self::INITIAL_STATE;

    // Every token emitted so far, and every warning/error message.
    private $tags = array();
    private $errors = array();

    // Pending character data collected outside of tags.
    private $buffer = null;

    // Id of the tag being read, before the tag object exists.
    private $tagId = null;

    private $tag = null;
    private $completeTag = null;
    private $previousTag = null;

    private $attrName = null;
    private $attrValue = null;

    // Opening quote of the attribute value being read, or '>' during the
    // roll-back pass for an unclosed quote, or null when unquoted.
    private $insideQuote = null;

    // Result flag of the last getContentToSubstring() call.
    private $substringFound = false;

    private $suppressWhitespaces = false;
    private $lowercaseAttributes = false;
    private $lowercaseTags = false;

    /**
     * Binds the tokenizer to a stream and pre-reads the first character.
     */
    public function __construct(InputStream $stream)
    {
        $this->stream = $stream;

        $this->getNextChar();
    }

    /**
     * Static factory, equivalent to `new HtmlTokenizer($stream)`.
     *
     * @return HtmlTokenizer
     */
    public static function create(InputStream $stream)
    {
        return new self($stream);
    }

    /**
     * Enables or disables whitespace normalisation of character-data tokens
     * (see makeTag() and removeWhitespaces()).
     *
     * @param bool $isSuppressWhitespaces
     * @return HtmlTokenizer
     */
    public function suppressWhitespaces($isSuppressWhitespaces)
    {
        Assert::isBoolean($isSuppressWhitespaces);

        $this->suppressWhitespaces = $isSuppressWhitespaces;

        return $this;
    }

    /**
     * Stores the "lowercase attribute names" flag.
     *
     * NOTE(review): nothing in this class reads the flag; dumpAttribute()
     * lowercases every valid attribute name unconditionally. The behaviour
     * is kept as is so that existing output does not change.
     *
     * @param bool $isLowercaseAttributes
     * @return HtmlTokenizer
     */
    public function lowercaseAttributes($isLowercaseAttributes)
    {
        Assert::isBoolean($isLowercaseAttributes);

        $this->lowercaseAttributes = $isLowercaseAttributes;

        return $this;
    }

    /**
     * Enables or disables lowercasing of valid tag ids.
     *
     * @param bool $isLowercaseTags
     * @return HtmlTokenizer
     */
    public function lowercaseTags($isLowercaseTags)
    {
        Assert::isBoolean($isLowercaseTags);

        $this->lowercaseTags = $isLowercaseTags;

        return $this;
    }

    /**
     * Runs the state machine until the next token is complete.
     *
     * @return mixed the next token object, or null when the input is exhausted
     * @throws WrongStateException if the final state is reached while
     *         unread input remains
     */
    public function nextToken()
    {
        if ($this->state == self::FINAL_STATE) {
            return null;
        }

        $this->completeTag = null;

        while ($this->state != self::FINAL_STATE && !$this->completeTag) {
            $this->state = $this->handleState();
        }

        if ($this->state == self::FINAL_STATE && $this->char !== null) {
            throw new WrongStateException('state machine is broken');
        }

        // remembered so the next call can detect content of inline tags
        $this->previousTag = $this->completeTag;

        return $this->completeTag;
    }

    /**
     * @return array warning and error messages collected so far
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Tells whether the string contains a character allowed to start an id.
     * The pattern is not anchored; callers pass a single character.
     */
    public static function isIdFirstChar($char)
    {
        // cast: null (end of input) must not reach preg_match as a subject
        return (preg_match('/'.self::ID_FIRST_CHAR_MASK.'/', (string) $char) > 0);
    }

    /**
     * Tells whether the string contains a character allowed inside an id.
     * The pattern is not anchored; callers pass a single character.
     */
    public static function isIdChar($char)
    {
        return (preg_match('/'.self::ID_CHAR_MASK.'/', (string) $char) > 0);
    }

    /**
     * Tells whether the whole string is a valid tag/attribute identifier:
     * a letter followed by any number of id characters.
     */
    public static function isValidId($id)
    {
        $pattern = '/^'.self::ID_FIRST_CHAR_MASK.self::ID_CHAR_MASK.'*$/';

        return (preg_match($pattern, (string) $id) > 0);
    }

    /**
     * Tells whether the string contains a space, CR, LF or tab.
     * The pattern is not anchored; callers pass a single character.
     */
    public static function isSpacerChar($char)
    {
        return (preg_match('/'.self::SPACER_MASK.'/', (string) $char) > 0);
    }

    /**
     * Collapses the leading and the trailing whitespace run of the character
     * data into a single space each and stores the result back.
     *
     * Note that whitespace-only data therefore becomes ' ' rather than an
     * empty string, so null is returned only for empty data (or when
     * preg_replace fails and yields null).
     *
     * @return Cdata|null the same object, or null when nothing is left
     */
    public static function removeWhitespaces(Cdata $cdata)
    {
        $string = $cdata->getData();

        $string = preg_replace('/^'.self::SPACER_MASK.'+/', ' ', $string);
        $string = preg_replace('/'.self::SPACER_MASK.'+$/', ' ', $string);

        if ($string === '' || $string === null) {
            return null;
        }

        $cdata->setData($string);

        return $cdata;
    }

    /**
     * Tells whether content of the given tag is read verbatim
     * (style, script, textarea). The comparison is case-sensitive.
     */
    public function isInlineTag($id)
    {
        // strict: a non-string id must never match by type juggling
        return in_array($id, $this->inlineTags, true);
    }

    /**
     * Returns the string lowercased when $ignoreCase is set, untouched
     * otherwise. Null is passed through unchanged.
     */
    private static function optionalLowercase($string, $ignoreCase)
    {
        if (!$ignoreCase || $string === null) {
            return $string;
        }

        return strtolower($string);
    }

    /**
     * Reads the next character from the stream into $this->char and keeps
     * the line/position counters up to date.
     *
     * @return string|null the new current character, null at end of input
     */
    private function getNextChar()
    {
        $this->char = $this->stream->read(1);

        if ($this->char === null) {
            return null;
        }

        // "\r", and "\n" not preceded by "\r", start a new line,
        // so a "\r\n" pair is counted once
        $isNewLine =
            ($this->char == "\n" && $this->previousChar != "\r")
            || $this->char == "\r";

        if ($isNewLine) {
            ++$this->line;
            $this->linePosition = 1;
        } else {
            ++$this->linePosition;
        }

        $this->previousChar = $this->char;

        return $this->char;
    }

    /**
     * Consumes up to $count characters, starting with the current one.
     *
     * @return string|null consumed characters, null if nothing was consumed
     */
    private function getChars($count)
    {
        $result = null;

        while ($this->char !== null && $count > 0) {
            $result .= $this->char;

            $this->getNextChar();

            --$count;
        }

        return $result;
    }

    /**
     * Remembers the current character, position counters and stream
     * position so that reset() can return to this point.
     *
     * @return HtmlTokenizer
     */
    private function mark()
    {
        $this->mark = array(
            $this->char, $this->previousChar,
            $this->line, $this->linePosition
        );

        $this->stream->mark();

        return $this;
    }

    /**
     * Returns to the point remembered by the last mark() call.
     *
     * @return HtmlTokenizer
     */
    private function reset()
    {
        Assert::isNotNull($this->mark);

        list (
            $this->char, $this->previousChar,
            $this->line, $this->linePosition
        ) = $this->mark;

        $this->stream->reset();

        return $this;
    }

    /**
     * Advances the input by $count characters.
     *
     * @return HtmlTokenizer
     */
    private function skip($count)
    {
        for ($i = 0; $i < $count; ++$i) {
            $this->getNextChar();
        }

        return $this;
    }

    /**
     * Reads $count characters from the stream without consuming them.
     * The current character ($this->char) is not part of the result.
     */
    private function lookAhead($count)
    {
        $this->stream->mark();

        $result = $this->stream->read($count);

        $this->stream->reset();

        return $result;
    }

    /**
     * Consumes $string if the input continues with exactly that text,
     * optionally after leading whitespace; otherwise leaves the input
     * where it was.
     *
     * @return bool true when the string was found and consumed
     */
    private function skipString($string, $skipSpaces = false)
    {
        $this->mark();

        if ($skipSpaces) {
            while ($this->char !== null && self::isSpacerChar($this->char)) {
                $this->getNextChar();
            }
        }

        if ($this->getChars(strlen($string)) === $string) {
            return true;
        }

        $this->reset();

        return false;
    }

    /**
     * Publishes the tag under construction as the complete token, unless
     * whitespace suppression is on and it is character data that
     * removeWhitespaces() reduced to nothing.
     *
     * @return HtmlTokenizer
     */
    private function makeTag()
    {
        Assert::isNotNull($this->tag);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        if (
            !$this->suppressWhitespaces
            || !$this->tag instanceof Cdata
            || (self::removeWhitespaces($this->tag) !== null)
        ) {
            $this->tags[] = $this->completeTag = $this->tag;
        }

        $this->tagId = $this->tag = null;

        return $this;
    }

    /**
     * Makes the given tag object the tag under construction and moves the
     * pending tag id into it.
     *
     * @return SgmlTag
     */
    private function setupTag(SgmlTag $tag)
    {
        Assert::isNull($this->tag);
        Assert::isNotNull($this->tagId);

        $this->tag = $tag->setId($this->tagId);

        $this->tagId = null;

        return $this->tag;
    }

    /**
     * Dispatches to the handler of the current state.
     *
     * @return int the next state
     * @throws WrongStateException for a state without a handler
     */
    private function handleState()
    {
        switch ($this->state) {
            case self::INITIAL_STATE:
                // right after an opening style/script/textarea tag the
                // content is read verbatim
                if (
                    $this->previousTag instanceof SgmlOpenTag
                    && $this->isInlineTag($this->previousTag->getId())
                ) {
                    return $this->inlineTagState();
                }

                return $this->outsideTagState();

            case self::START_TAG_STATE:
                return $this->startTagState();

            case self::END_TAG_STATE:
                return $this->endTagState();

            case self::INSIDE_TAG_STATE:
                return $this->insideTagState();

            case self::ATTR_NAME_STATE:
                return $this->attrNameState();

            case self::WAITING_EQUAL_SIGN_STATE:
                return $this->waitingEqualSignState();

            case self::ATTR_VALUE_STATE:
                return $this->attrValueState();

            case self::CDATA_STATE:
                return $this->cdataState();

            case self::COMMENT_STATE:
                return $this->commentState();

            case self::EXTERNAL_TAG_STATE:
                return $this->externalTagState();

            case self::DOCTYPE_TAG_STATE:
                return $this->doctypeTagState();
        }

        throw new WrongStateException('state machine is broken');
    }

    /**
     * Emits the pending character data, if any, as a Cdata token.
     *
     * @return HtmlTokenizer
     */
    private function dumpBuffer()
    {
        if ($this->buffer !== null) {
            $this->tag = Cdata::create()->setData($this->buffer);

            $this->buffer = null;

            $this->makeTag();
        }

        return $this;
    }

    /**
     * Called right after '<' when the current character is '!': consumes
     * the "![CDATA[" or "!--" opener if present.
     *
     * @return int|null the state to switch to, null for no special tag
     */
    private function checkSpecialTagState()
    {
        if ($this->char != '!') {
            return null;
        }

        $specialStartTags = array(
            '![CDATA[' => self::CDATA_STATE,
            '!--' => self::COMMENT_STATE
        );

        foreach ($specialStartTags as $opener => $state) {
            if ($this->skipString($opener)) {
                return $state;
            }
        }

        return null;
    }

    /**
     * INITIAL_STATE handler: collects character data until a tag starts.
     */
    private function outsideTagState()
    {
        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {

            if ($this->char != '<') {
                $this->buffer .= $this->char;
                $this->getNextChar();

                continue;
            }

            // look at the character following '<'
            $this->getNextChar();

            if (
                self::isIdFirstChar($this->char)
                || $this->char == '?' || $this->char == '!'
            ) {
                $this->dumpBuffer();

                // "<![CDATA[" and "<!--" have their own states
                $specialTagState = $this->checkSpecialTagState();

                if ($specialTagState !== null) {
                    return $specialTagState;
                }

                $this->tagId = $this->char;

                $this->getNextChar();

                return self::START_TAG_STATE;

            } elseif ($this->char == '/') {
                // "</": an end-tag begins
                $this->dumpBuffer();

                $this->getNextChar();

                return self::END_TAG_STATE;

            } else {
                // '<' followed by something that cannot start a tag
                $this->warning(
                    'incorrect start-tag, treating it as cdata'
                );

                $this->buffer .= '<'.$this->char;

                $this->getNextChar();

                continue;
            }

            Assert::isUnreachable();
        }

        $this->dumpBuffer();

        return self::FINAL_STATE;
    }

    /**
     * Validates the pending tag id (lowercasing it when requested and
     * valid) and starts a new open tag with it.
     *
     * @return SgmlOpenTag
     */
    private function createOpenTag()
    {
        if (!self::isValidId($this->tagId)) {
            $this->error("tag id '{$this->tagId}' is invalid");
        } elseif ($this->lowercaseTags) {
            $this->tagId = strtolower($this->tagId);
        }

        return $this->setupTag(SgmlOpenTag::create());
    }

    /**
     * START_TAG_STATE handler: reads the rest of the tag id.
     */
    private function startTagState()
    {
        Assert::isNull($this->tag);
        Assert::isNotNull($this->tagId);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {

            if ($this->char == '>') {
                // "<b>": tag without attributes
                $this->createOpenTag();

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            }

            if (self::isSpacerChar($this->char)) {
                // the id is complete; "<?...", except "<?xml", and
                // "<!DOCTYPE" are read as ignored tags, and the spacer is
                // left unconsumed for their states
                $externalTag =
                    ($this->tagId[0] == '?')
                    && ($this->tagId != '?xml');

                $doctypeTag = (strtoupper($this->tagId) == '!DOCTYPE');

                if ($externalTag) {
                    $this->setupTag(
                        SgmlIgnoredTag::create()->
                            setEndMark('?')
                    );

                    return self::EXTERNAL_TAG_STATE;
                }

                if ($doctypeTag) {
                    $this->setupTag(SgmlIgnoredTag::create());

                    return self::DOCTYPE_TAG_STATE;
                }

                $this->createOpenTag();

                $this->getNextChar();

                return self::INSIDE_TAG_STATE;
            }

            $char = $this->char;

            $this->getNextChar();

            if ($char == '/' && $this->char == '>') {
                // "<br/>": empty tag without attributes
                $this->createOpenTag()->setEmpty(true);

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;
            }

            $this->tagId .= $char;
        }

        // end of input in the middle of the tag id
        $this->error('unexpected end of file, tag id is incomplete');

        $this->createOpenTag();

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * Emits an end tag built from the pending tag id, reporting an empty
     * or invalid id.
     *
     * @return HtmlTokenizer
     */
    private function dumpEndTag()
    {
        // explicit checks: the id '0' is not empty, it is merely invalid
        if ($this->tagId === null || $this->tagId === '') {

            $this->warning('empty end-tag, storing with empty id');

        } elseif (!self::isValidId($this->tagId)) {

            $this->error("end-tag id '{$this->tagId}' is invalid");
        }

        $this->tag = SgmlEndTag::create()->
            setId(
                self::optionalLowercase($this->tagId, $this->lowercaseTags)
            );

        $this->makeTag();

        return $this;
    }

    /**
     * END_TAG_STATE handler: reads the end-tag id; everything between the
     * first whitespace and the closing '>' is discarded.
     */
    private function endTagState()
    {
        Assert::isNull($this->tag);

        // a preset id is only passed in by inlineTagState(), which stops
        // at '>' or at whitespace
        Assert::isTrue(
            $this->tagId === null
            || $this->char == '>'
            || self::isSpacerChar($this->char)
        );

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        $eatingGarbage = false;

        while ($this->char !== null) {

            if ($this->char == '>') {

                $this->dumpEndTag();

                $this->getNextChar();

                return self::INITIAL_STATE;

            } elseif ($eatingGarbage) {

                $this->getNextChar();

                continue;

            } elseif (self::isSpacerChar($this->char)) {
                // the id ended; skip whatever precedes '>'
                $eatingGarbage = true;

                $this->getNextChar();

                continue;
            }

            $this->tagId .= $this->char;

            $this->getNextChar();
        }

        $this->error("unexpected end of file, end-tag is incomplete");

        $this->dumpEndTag();

        return self::FINAL_STATE;
    }

    /**
     * INSIDE_TAG_STATE handler: between attributes of an open tag.
     */
    private function insideTagState()
    {
        Assert::isNull($this->tagId);

        Assert::isNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {

            if (self::isSpacerChar($this->char)) {

                $this->getNextChar();

            } elseif ($this->char == '>') {
                // end of the tag
                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;

            } elseif ($this->char == '=') {
                // value without a name
                $this->error(
                    'unexpected equal sign, attr name considered empty'
                );

                $this->getNextChar();

                return self::ATTR_VALUE_STATE;

            } else {

                $char = $this->char;

                $this->getNextChar();

                if ($char == '/' && $this->char == '>') {
                    // "<br />": empty tag
                    $this->tag->setEmpty(true);

                    $this->makeTag();

                    $this->getNextChar();

                    return self::INITIAL_STATE;
                }

                // first character of an attribute name
                $this->attrName = $char;

                return self::ATTR_NAME_STATE;
            }
        }

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * Stores the pending attribute in the tag under construction.
     *
     * A valid name is always lowercased (the lowercaseAttributes flag is
     * not consulted); an invalid name is reported and stored as read.
     *
     * @return HtmlTokenizer
     */
    private function dumpAttribute()
    {
        // explicit checks: an attribute named '0' still gets validated
        if ($this->attrName !== null && $this->attrName !== '') {

            if (!self::isValidId($this->attrName)) {
                $this->error("attribute name '{$this->attrName}' is invalid");
            } else {
                $this->attrName = strtolower($this->attrName);
            }
        }

        if ($this->attrValue === null || $this->attrValue === '') {
            $this->warning("empty value for attr == '{$this->attrName}'");
        }

        $this->tag->setAttribute($this->attrName, $this->attrValue);

        $this->attrName = $this->attrValue = null;

        return $this;
    }

    /**
     * ATTR_NAME_STATE handler: reads the rest of an attribute name.
     */
    private function attrNameState()
    {
        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);

        Assert::isNotNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {

            if (self::isSpacerChar($this->char)) {
                // the name ended; an equal sign may still follow
                $this->getNextChar();

                return self::WAITING_EQUAL_SIGN_STATE;

            } elseif ($this->char == '>') {
                // attribute without a value closes the tag
                $this->dumpAttribute();

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;

            } elseif ($this->char == '=') {

                $this->getNextChar();

                // empty string, not null: the equal sign has been seen
                $this->attrValue = '';

                return self::ATTR_VALUE_STATE;

            } else {

                $char = $this->char;

                $this->getNextChar();

                if ($char == '/' && $this->char == '>') {
                    // "<option selected/>"
                    $this->tag->setEmpty(true);

                    $this->dumpAttribute();

                    $this->makeTag();

                    $this->getNextChar();

                    return self::INITIAL_STATE;
                }

                $this->attrName .= $char;
            }
        }

        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * WAITING_EQUAL_SIGN_STATE handler: an attribute name was followed by
     * whitespace; decides whether a value follows.
     */
    private function waitingEqualSignState()
    {
        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);
        Assert::isNull($this->tagId);
        Assert::isNotNull($this->attrName);
        Assert::isNull($this->attrValue);

        Assert::isNull($this->insideQuote);

        while ($this->char !== null) {

            if (self::isSpacerChar($this->char)) {

                $this->getNextChar();

            } elseif ($this->char == '=') {

                $this->getNextChar();

                // empty string, not null: the equal sign has been seen
                $this->attrValue = '';

                return self::ATTR_VALUE_STATE;

            } else {
                // no equal sign: the attribute has no value; the current
                // character is left for insideTagState()
                $this->dumpAttribute();

                return self::INSIDE_TAG_STATE;
            }
        }

        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * ATTR_VALUE_STATE handler: reads a quoted or unquoted attribute value.
     *
     * When a quoted value is still open at end of input, the handler rolls
     * back to the character after the opening quote once and re-reads the
     * value up to the first '>'. If that pass reaches end of input too, the
     * tag is finalized with whatever was read.
     */
    private function attrValueState()
    {
        Assert::isNull($this->tagId);

        Assert::isNotNull($this->tag);
        Assert::isTrue($this->tag instanceof SgmlOpenTag);

        while ($this->char !== null) {

            if (!$this->insideQuote && self::isSpacerChar($this->char)) {
                $this->getNextChar();

                if ($this->attrValue !== null && $this->attrValue !== '') {
                    // whitespace ends a non-empty unquoted value
                    $this->dumpAttribute();

                    return self::INSIDE_TAG_STATE;
                }

                // whitespace before the value is skipped
                continue;

            } elseif (!$this->insideQuote && $this->char == '>') {
                // '>' outside quotes ends both the value and the tag
                $this->dumpAttribute();

                $this->makeTag();

                $this->getNextChar();

                return self::INITIAL_STATE;

            } else {
                if (
                    $this->char == '"' || $this->char == "'"
                    || $this->char == $this->insideQuote
                ) {
                    if (!$this->insideQuote) {
                        // opening quote
                        $this->insideQuote = $this->char;

                        $this->getNextChar();

                        // roll-back point for an unclosed quote
                        $this->mark();

                        continue;

                    } elseif ($this->char == $this->insideQuote) {
                        // closing quote, or '>' during the roll-back pass
                        $this->dumpAttribute();

                        $this->getNextChar();

                        if ($this->insideQuote == '>') {
                            $this->insideQuote = null;

                            $this->makeTag();

                            return self::INITIAL_STATE;

                        } else {
                            $this->insideQuote = null;

                            return self::INSIDE_TAG_STATE;
                        }
                    }
                }

                $this->attrValue .= $this->char;

                // inside quotes a backslash takes the next character along
                if ($this->insideQuote && $this->char == '\\') {
                    $this->attrValue .= $this->getNextChar();
                }

                $this->getNextChar();
            }
        }

        if ($this->insideQuote !== null && $this->insideQuote !== '>') {
            // unclosed quote: roll back and read up to '>' instead
            $this->reset();

            $this->warning(
                "unclosed quoted value for attr == '{$this->attrName}',"
                ." rolling back and searching '>'"
            );

            $this->attrValue = null;
            $this->insideQuote = '>';

            return self::ATTR_VALUE_STATE;
        }

        // Either an unquoted value ran into end of input, or the roll-back
        // pass found no '>'. Rolling back again would repeat forever, so
        // the tag is stored as it is.
        $this->insideQuote = null;

        $this->dumpAttribute();

        $this->error('unexpected end of file, incomplete tag stored');

        $this->makeTag();

        return self::FINAL_STATE;
    }

    /**
     * Reads the content of a style/script/textarea element verbatim, up to
     * its end-tag (matched case-insensitively).
     */
    private function inlineTagState()
    {
        Assert::isNull($this->buffer);

        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        $startTag = $this->previousTag->getId();

        if ($this->char === null) {
            $this->error('unexpected eof inside inline tag');

            return self::FINAL_STATE;
        }

        $this->buffer = null;

        if ($startTag == 'style' || $startTag == 'script') {
            // a leading "<!-- ... -->" block is copied as a whole, so an
            // end-tag inside it does not terminate the content; whitespace
            // in front of it is dropped
            if ($this->skipString('<!--', true)) {
                $this->buffer = '<!--'.$this->getComment().'-->';
            }
        }

        $endTag = '</'.$startTag;

        while ($this->char !== null) {
            $this->buffer .= $this->getContentToSubstring($endTag, true);

            if ($this->char === null) {
                // end of input right after the content
                break;

            } elseif (
                $this->char === '>' || self::isSpacerChar($this->char)
            ) {
                // a real end-tag: endTagState() finishes it
                $this->dumpBuffer();

                $this->tagId = $startTag;

                return self::END_TAG_STATE;
            }

            // e.g. "</scripts": merely a prefix of another id, still content
            $this->buffer .= $endTag.$this->char;

            $this->getNextChar();
        }

        $this->dumpBuffer();

        $this->error(
            "end-tag for inline tag == '{$startTag}' not found"
        );

        return self::FINAL_STATE;
    }

    /**
     * CDATA_STATE handler: reads a "<![CDATA[ ... ]]>" section.
     */
    private function cdataState()
    {
        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        $content = $this->getContentToSubstring(']]>');

        $this->tag =
            Cdata::create()->
                setData($content)->
                setStrict(true);

        $this->makeTag();

        if (!$this->substringFound) {

            $this->error('unexpected end-of-file inside cdata tag');

            return self::FINAL_STATE;
        }

        return self::INITIAL_STATE;
    }

    /**
     * Reads comment content up to "-->". When that terminator is missing,
     * rolls back and reads up to the first '>' instead.
     *
     * @return string comment content without the terminator
     */
    private function getComment()
    {
        $this->mark();

        $result = $this->getContentToSubstring('-->');

        if (!$this->substringFound) {
            $this->reset();

            $this->error(
                'unexpected end-of-file inside comment tag,'
                ." trying to find '>'"
            );

            $result = $this->getContentToSubstring('>');

            if (!$this->substringFound) {
                $this->error(
                    "end-tag '>' not found,"
                    .' treating all remaining content as cdata'
                );
            }
        }

        return $result;
    }

    /**
     * COMMENT_STATE handler: emits a comment as an ignored tag.
     */
    private function commentState()
    {
        Assert::isNull($this->tag);
        Assert::isNull($this->tagId);

        $content = $this->getComment();

        $this->tag =
            SgmlIgnoredTag::comment()->
                setCdata(
                    Cdata::create()->setData($content)
                );

        $this->makeTag();

        return self::INITIAL_STATE;
    }

    /**
     * EXTERNAL_TAG_STATE handler: reads the body of a "<? ... ?>" tag,
     * falling back to the first '>' when "?>" is missing.
     */
    private function externalTagState()
    {
        Assert::isTrue($this->tag instanceof SgmlIgnoredTag);

        $this->mark();

        $content = $this->getContentToSubstring('?>');

        if (!$this->substringFound) {
            $this->reset();

            $this->error(
                'unexpected end-of-file inside external tag,'
                ." trying to find '>'"
            );

            $content = $this->getContentToSubstring('>');

            if (!$this->substringFound) {
                $this->error(
                    "end-tag '>' not found,"
                    .' treating all remaining content as cdata'
                );
            }
        }

        $this->tag->setCdata(Cdata::create()->setData($content));

        $this->makeTag();

        return self::INITIAL_STATE;
    }

    /**
     * DOCTYPE_TAG_STATE handler: reads the body of "<!DOCTYPE ... >".
     */
    private function doctypeTagState()
    {
        Assert::isTrue($this->tag instanceof SgmlIgnoredTag);

        $content = $this->getContentToSubstring('>');

        if (!$this->substringFound) {
            $this->error('unexpected end-of-file inside doctype tag');
        }

        $this->tag->setCdata(Cdata::create()->setData($content));

        $this->makeTag();

        return self::INITIAL_STATE;
    }

    /**
     * Consumes input up to and including the first occurrence of
     * $substring and returns the text that preceded it. When the input
     * ends first, everything that was read is returned.
     * $this->substringFound tells which of the two happened.
     *
     * The search computes a prefix function over the string
     * "<substring>\x00<input>": $prefixTable[$k] is the length of the
     * longest proper prefix of the first $k characters that is also their
     * suffix. A value equal to the substring length means the input read
     * so far ends with the substring.
     *
     * @param string $substring  terminator to look for
     * @param bool   $ignoreCase compare case-insensitively
     * @return string
     */
    private function getContentToSubstring($substring, $ignoreCase = false)
    {
        $this->substringFound = false;

        $substringLength = strlen($substring);

        $prefixTable = array(1 => 0);
        $buffer = $substring."\x00";
        $i = 0;

        while ($this->char !== null) {

            if ($i < $substringLength) {
                // still pre-processing the pattern and its separator;
                // nothing is consumed from the input yet
                $char = $buffer[$i + 1];
            } else {
                $char = $this->char;
                $buffer .= $char;
                $this->getNextChar();
            }

            $maxLength = $prefixTable[$i + 1];

            while (
                self::optionalLowercase($buffer[$maxLength], $ignoreCase)
                    !== self::optionalLowercase($char, $ignoreCase)
                && $maxLength > 0
            ) {
                $maxLength = $prefixTable[$maxLength];
            }

            ++$i;

            $prefixTable[$i + 1] =
                (
                    self::optionalLowercase($buffer[$maxLength], $ignoreCase)
                        === self::optionalLowercase($char, $ignoreCase)
                )
                    ? $maxLength + 1
                    : 0;

            // $i > $substringLength means at least one input character
            // has been consumed. (Requiring two, as before, missed a
            // one-character terminator at the very first position.)
            if (
                $i > $substringLength
                && $prefixTable[$i + 1] == $substringLength
            ) {
                $this->substringFound = true;

                break;
            }
        }

        if (!$this->substringFound) {
            return substr($buffer, $substringLength + 1);
        }

        // drop the pattern, the separator and the matched terminator
        return substr(
            $buffer, $substringLength + 1, $i - 2 * $substringLength
        );
    }

    /**
     * @return string current line/position and, if a tag with an id is
     *         under construction, that id
     */
    private function getTextualPosition()
    {
        return
            "line {$this->line}, position {$this->linePosition}"
            .(
                $this->tag && $this->tag->getId()
                    ? ", in tag '{$this->tag->getId()}'"
                    : null
            );
    }

    /**
     * Records a warning message.
     *
     * @return HtmlTokenizer
     */
    private function warning($message)
    {
        $this->errors[] =
            "warning at {$this->getTextualPosition()}: $message";

        return $this;
    }

    /**
     * Records an error message.
     *
     * @return HtmlTokenizer
     */
    private function error($message)
    {
        $this->errors[] =
            "error at {$this->getTextualPosition()}: $message";

        return $this;
    }
}
01333 ?>