HtmlTokenizer.class.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2007 by Ivan Y. Khvostishkov                            *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU Lesser General Public License as        *
00007  *   published by the Free Software Foundation; either version 3 of the    *
00008  *   License, or (at your option) any later version.                       *
00009  *                                                                         *
00010  ***************************************************************************/
00011 /* $Id: HtmlTokenizer.class.php 4687 2007-12-09 18:57:18Z voxus $ */
00012 
00016     final class HtmlTokenizer
00017     {
00018         const INITIAL_STATE             = 1;
00019         const START_TAG_STATE           = 2;
00020         const END_TAG_STATE             = 3;
00021         const INSIDE_TAG_STATE          = 4;
00022         const ATTR_NAME_STATE           = 5;
00023         const WAITING_EQUAL_SIGN_STATE  = 6;
00024         const ATTR_VALUE_STATE          = 7;
00025         
00026         const CDATA_STATE           = 8; // <![CDATA[ ... ]]>
00027         const COMMENT_STATE         = 9; // <!-- ... -->
00028         const INLINE_TAG_STATE      = 10; // script, style
00029         const EXTERNAL_TAG_STATE    = 11; // <?php ... ? >
00030         const DOCTYPE_TAG_STATE     = 12;
00031         
00032         const FINAL_STATE           = 42;
00033         
00034         const SPACER_MASK           = '[ \r\n\t]';
00035         const ID_FIRST_CHAR_MASK    = '[A-Za-z]';
00036         const ID_CHAR_MASK          = '[-_:.A-Za-z0-9]';
00037         
00038         private $inlineTags         = array('style', 'script', 'textarea');
00039         
00040         private $stream     = null;
00041         
00042         private $char       = null;
00043         
00044         // for logging
00045         private $line           = 1;
00046         private $linePosition   = 1;
00047         private $previousChar   = null;
00048         
00049         private $mark       = null;
00050         
00051         private $state      = self::INITIAL_STATE;
00052         
00053         private $tags       = array();
00054         private $errors     = array();
00055         
00056         private $buffer     = null;
00057         
00058         private $tagId      = null;
00059         
00060         private $tag            = null;
00061         private $completeTag    = null;
00062         private $previousTag    = null;
00063         
00064         private $attrName       = null;
00065         private $attrValue      = null;
00066         private $insideQuote    = null;
00067         
00068         private $substringFound = false;
00069         
00070         private $suppressWhitespaces    = false;
00071         private $lowercaseAttributes    = false;
00072         private $lowercaseTags          = false;
00073         
00074         public function __construct(InputStream $stream)
00075         {
00076             $this->stream = $stream;
00077             
00078             $this->getNextChar();
00079         }
00080         
00084         public static function create(InputStream $stream)
00085         {
00086             return new self($stream);
00087         }
00088         
00092         public function suppressWhitespaces($isSuppressWhitespaces)
00093         {
00094             Assert::isBoolean($isSuppressWhitespaces);
00095             
00096             $this->suppressWhitespaces = $isSuppressWhitespaces;
00097             
00098             return $this;
00099         }
00100         
00104         public function lowercaseAttributes($isLowercaseAttributes)
00105         {
00106             Assert::isBoolean($isLowercaseAttributes);
00107             
00108             $this->lowercaseAttributes = $isLowercaseAttributes;
00109             
00110             return $this;
00111         }
00112         
00116         public function lowercaseTags($isLowercaseTags)
00117         {
00118             Assert::isBoolean($isLowercaseTags);
00119             
00120             $this->lowercaseTags = $isLowercaseTags;
00121             
00122             return $this;
00123         }
00124         
00128         public function nextToken()
00129         {
00130             if ($this->state == self::FINAL_STATE)
00131                 return null;
00132             
00133             $this->completeTag = null;
00134             
00135             while ($this->state != self::FINAL_STATE && !$this->completeTag)
00136                 $this->state = $this->handleState();
00137             
00138             if ($this->state == self::FINAL_STATE && $this->char !== null)
00139                 throw new WrongStateException('state machine is broken');
00140             
00141             $this->previousTag = $this->completeTag;
00142             
00143             return $this->completeTag;
00144         }
00145         
00146         public function getErrors()
00147         {
00148             return $this->errors;
00149         }
00150         
00151         public static function isIdFirstChar($char)
00152         {
00153             return (preg_match('/'.self::ID_FIRST_CHAR_MASK.'/', $char) > 0);
00154         }
00155         
00156         public static function isIdChar($char)
00157         {
00158             return (preg_match('/'.self::ID_CHAR_MASK.'/', $char) > 0);
00159         }
00160         
00161         public static function isValidId($id)
00162         {
00163             $matches = preg_match(
00164                 '/^'.self::ID_FIRST_CHAR_MASK.self::ID_CHAR_MASK.'*$/',
00165                 $id
00166             );
00167             
00168             return ($matches > 0);
00169         }
00170         
00171         public static function isSpacerChar($char)
00172         {
00173             return (preg_match('/'.self::SPACER_MASK.'/', $char) > 0);
00174         }
00175         
00176         public static function removeWhitespaces(Cdata $cdata)
00177         {
00178             $string = $cdata->getData();
00179             
00180             $string = preg_replace(
00181                 '/^'.self::SPACER_MASK.'+/',
00182                 ' ',
00183                 $string
00184             );
00185             
00186             $string = preg_replace(
00187                 '/'.self::SPACER_MASK.'+$/',
00188                 ' ',
00189                 $string
00190             );
00191             
00192             if ($string === '' || $string === null)
00193                 return null;
00194             
00195             $cdata->setData($string);
00196             
00197             return $cdata;
00198         }
00199         
00200         public function isInlineTag($id)
00201         {
00202             return in_array($id, $this->inlineTags);
00203         }
00204         
00205         private static function optionalLowercase($string, $ignoreCase)
00206         {
00207             if (!$ignoreCase)
00208                 return $string;
00209             else
00210                 return strtolower($string);
00211         }
00212         
00213         private function getNextChar()
00214         {
00215             $this->char = $this->stream->read(1);
00216             
00217             if ($this->char === null)
00218                 return null;
00219             
00220             if (
00221                 $this->char == "\n" && $this->previousChar != "\r"
00222                 || $this->char == "\r"
00223             ) {
00224                 ++$this->line;
00225                 $this->linePosition = 1;
00226             } else {
00227                 ++$this->linePosition;
00228             }
00229             
00230             $this->previousChar = $this->char;
00231             
00232             return $this->char;
00233         }
00234         
00235         private function getChars($count)
00236         {
00237             $result = null;
00238             
00239             while ($this->char !== null && $count > 0) {
00240                 $result .= $this->char;
00241                 
00242                 $this->getNextChar();
00243                 
00244                 --$count;
00245             }
00246             
00247             return $result;
00248         }
00249         
00253         private function mark()
00254         {
00255             $this->mark = array(
00256                 $this->char, $this->previousChar,
00257                 $this->line, $this->linePosition
00258             );
00259             
00260             $this->stream->mark();
00261             
00262             return $this;
00263         }
00264         
00268         private function reset()
00269         {
00270             Assert::isNotNull($this->mark);
00271             
00272             list (
00273                 $this->char, $this->previousChar,
00274                 $this->line, $this->linePosition
00275             ) = $this->mark;
00276             
00277             $this->stream->reset();
00278             
00279             return $this;
00280         }
00281         
00285         private function skip($count)
00286         {
00287             for ($i = 0; $i < $count; ++$i)
00288                 $this->getNextChar();
00289             
00290             return $this;
00291         }
00292         
00293         private function lookAhead($count)
00294         {
00295             $this->stream->mark();
00296             
00297             $result = $this->stream->read($count);
00298             
00299             $this->stream->reset();
00300             
00301             return $result;
00302         }
00303         
00304         private function skipString($string, $skipSpaces = false)
00305         {
00306             $this->mark();
00307             
00308             if ($skipSpaces) {
00309                 while (
00310                     $this->char !== null
00311                     && self::isSpacerChar($this->char)
00312                 )
00313                     $this->getNextChar();
00314             }
00315             
00316             $length = strlen($string);
00317             
00318             if ($this->getChars($length) === $string)
00319                 return true;
00320             
00321             $this->reset();
00322             
00323             return false;
00324         }
00325         
00329         private function makeTag()
00330         {
00331             Assert::isNotNull($this->tag);
00332             
00333             Assert::isNull($this->attrName);
00334             Assert::isNull($this->attrValue);
00335             
00336             Assert::isNull($this->insideQuote);
00337             
00338             if (
00339                 !$this->suppressWhitespaces
00340                 || !$this->tag instanceof Cdata
00341                 || (self::removeWhitespaces($this->tag) !== null)
00342             )
00343                 $this->tags[] = $this->completeTag = $this->tag;
00344             
00345             $this->tagId = $this->tag = null;
00346             
00347             return $this;
00348         }
00349         
00353         private function setupTag(SgmlTag $tag)
00354         {
00355             Assert::isNull($this->tag);
00356             Assert::isNotNull($this->tagId);
00357             
00358             $this->tag = $tag->setId($this->tagId);
00359             
00360             $this->tagId = null;
00361             
00362             return $this->tag;
00363         }
00364         
00365         private function handleState()
00366         {
00367             switch ($this->state) {
00368                 case self::INITIAL_STATE:
00369                     
00370                     if (
00371                         $this->previousTag instanceof SgmlOpenTag
00372                         && $this->isInlineTag($this->previousTag->getId())
00373                     )
00374                         return $this->inlineTagState();
00375                     else
00376                         return $this->outsideTagState();
00377                     
00378                 case self::START_TAG_STATE:
00379                     return $this->startTagState();
00380                     
00381                 case self::END_TAG_STATE:
00382                     return $this->endTagState();
00383                     
00384                 case self::INSIDE_TAG_STATE:
00385                     return $this->insideTagState();
00386                     
00387                 case self::ATTR_NAME_STATE:
00388                     return $this->attrNameState();
00389                     
00390                 case self::WAITING_EQUAL_SIGN_STATE:
00391                     return $this->waitingEqualSignState();
00392                     
00393                 case self::ATTR_VALUE_STATE:
00394                     return $this->attrValueState();
00395                     
00396                 case self::CDATA_STATE:
00397                     return $this->cdataState();
00398                     
00399                 case self::COMMENT_STATE:
00400                     return $this->commentState();
00401                     
00402                 case self::EXTERNAL_TAG_STATE:
00403                     return $this->externalTagState();
00404                 
00405                 case self::DOCTYPE_TAG_STATE:
00406                     return $this->doctypeTagState();
00407             }
00408             
00409             throw new WrongStateException('state machine is broken');
00410         }
00411         
00415         private function dumpBuffer()
00416         {
00417             if ($this->buffer !== null) {
00418                 $this->tag = Cdata::create()->setData($this->buffer);
00419                 
00420                 $this->buffer = null;
00421                 
00422                 $this->makeTag();
00423             }
00424             
00425             return $this;
00426         }
00427         
00428         private function checkSpecialTagState()
00429         {
00430             if ($this->char != '!')
00431                 return null;
00432             
00433             $specialStartTags = array(
00434                 '![CDATA['  => self::CDATA_STATE,
00435                 '!--'       => self::COMMENT_STATE
00436             );
00437             
00438             foreach ($specialStartTags as $tag => $state) {
00439                 
00440                 if ($this->skipString($tag))
00441                     return $state;
00442             }
00443             
00444             return null;
00445         }
00446         
00447         // INITIAL_STATE
00448         private function outsideTagState()
00449         {
00450             Assert::isNull($this->tag);
00451             Assert::isNull($this->tagId);
00452             
00453             Assert::isNull($this->attrName);
00454             Assert::isNull($this->attrValue);
00455             
00456             Assert::isNull($this->insideQuote);
00457             
00458             while ($this->char !== null) {
00459                 
00460                 if ($this->char != '<') {
00461                     
00462                     $this->buffer .= $this->char;
00463                     $this->getNextChar();
00464                     
00465                 } else {
00466                     
00467                     $this->getNextChar();
00468                     
00469                     if (
00470                         self::isIdFirstChar($this->char)
00471                         || $this->char == '?' || $this->char == '!'
00472                     ) {
00473                         $this->dumpBuffer();
00474                         
00475                         // TODO: handle at start tag state
00476                         $specialTagState = $this->checkSpecialTagState();
00477                         
00478                         if ($specialTagState !== null) {
00479                             // comment, cdata
00480                             return $specialTagState;
00481                         }
00482                         
00483                         $this->tagId = $this->char;
00484                         
00485                         $this->getNextChar();
00486                         
00487                         return self::START_TAG_STATE;
00488                         
00489                     } elseif ($this->char == '/') {
00490                         // </
00491                         
00492                         $this->dumpBuffer();
00493                         
00494                         $this->getNextChar();
00495                         
00496                         return self::END_TAG_STATE;
00497                         
00498                     } else {
00499                         // <2, <ф, <[space], <>, <[eof]
00500                         
00501                         $this->warning(
00502                             'incorrect start-tag, treating it as cdata'
00503                         );
00504                         
00505                         $this->buffer .= '<'.$this->char;
00506                         
00507                         $this->getNextChar();
00508                         
00509                         continue;
00510                     }
00511                     
00512                     Assert::isUnreachable();
00513                 }
00514             }
00515             
00516             $this->dumpBuffer();
00517             
00518             return self::FINAL_STATE;
00519         }
00520         
00524         private function createOpenTag()
00525         {
00526             if (!self::isValidId($this->tagId))
00527                 $this->error("tag id '{$this->tagId}' is invalid");
00528             elseif ($this->lowercaseTags)
00529                 $this->tagId = strtolower($this->tagId);
00530             
00531             return $this->setupTag(SgmlOpenTag::create());
00532         }
00533         
00534         // START_TAG_STATE
00535         private function startTagState()
00536         {
00537             Assert::isNull($this->tag);
00538             Assert::isNotNull($this->tagId); // strlen(tagId) == 1
00539             
00540             Assert::isNull($this->attrName);
00541             Assert::isNull($this->attrValue);
00542             
00543             Assert::isNull($this->insideQuote);
00544             
00545             while ($this->char !== null) {
00546                 
00547                 if ($this->char == '>') {
00548                     // <b>, <divмусор>
00549                     
00550                     $this->createOpenTag();
00551                     
00552                     $this->makeTag();
00553                     
00554                     $this->getNextChar();
00555                     
00556                     return self::INITIAL_STATE;
00557                     
00558                 } elseif (self::isSpacerChar($this->char)) {
00559                     // <p[space], <divмусор[space], <?php[space],
00560                     // <?xml[space], <!DOCTYPE[space]
00561                     
00562                     $externalTag =
00563                         ($this->tagId[0] == '?')
00564                         && ($this->tagId != '?xml');
00565                     
00566                     $doctypeTag = (strtoupper($this->tagId) == '!DOCTYPE');
00567                     
00568                     if ($externalTag) {
00569                         $this->setupTag(
00570                             SgmlIgnoredTag::create()->
00571                             setEndMark('?')
00572                         );
00573                     } elseif ($doctypeTag) {
00574                         $this->setupTag(SgmlIgnoredTag::create());
00575                     } else
00576                         $this->createOpenTag();
00577                     
00578                     if ($externalTag)
00579                         return self::EXTERNAL_TAG_STATE;
00580                     elseif ($doctypeTag)
00581                         return self::DOCTYPE_TAG_STATE;
00582                     else {
00583                         // don't eating spacer for external and doctype tags
00584                         $this->getNextChar();
00585                         
00586                         return self::INSIDE_TAG_STATE;
00587                     }
00588                 } else {
00589                     $char = $this->char;
00590                     
00591                     $this->getNextChar();
00592                     
00593                     if ($char == '/' && $this->char == '>') {
00594                         // <br/>
00595                         
00596                         $this->createOpenTag()->setEmpty(true);
00597                         
00598                         $this->makeTag();
00599                         
00600                         $this->getNextChar();
00601                         
00602                         return self::INITIAL_STATE;
00603                     }
00604                     
00605                     $this->tagId .= $char;
00606                 }
00607             }
00608             
00609             // ... <tag[end-of-file]
00610             
00611             $this->error('unexpected end of file, tag id is incomplete');
00612             
00613             $this->createOpenTag();
00614             
00615             $this->makeTag();
00616             
00617             return self::FINAL_STATE;
00618         }
00619         
00623         private function dumpEndTag()
00624         {
00625             if (!$this->tagId) {
00626                 // </>
00627                 $this->warning('empty end-tag, storing with empty id');
00628                 
00629             } elseif (!self::isValidId($this->tagId)) {
00630                 
00631                 $this->error("end-tag id '{$this->tagId}' is invalid");
00632             }
00633             
00634             $this->tag = SgmlEndTag::create()->
00635                 setId(
00636                     self::optionalLowercase($this->tagId, $this->lowercaseTags)
00637                 );
00638             
00639             $this->makeTag();
00640             
00641             return $this;
00642         }
00643         
00644         // END_TAG_STATE
00645         private function endTagState()
00646         {
00647             Assert::isNull($this->tag);
00648             
00649             Assert::isTrue(
00650                 $this->tagId === null
00651                 || $this->char == '>'
00652                 || self::isSpacerChar($this->char)
00653             );
00654             
00655             Assert::isNull($this->attrName);
00656             Assert::isNull($this->attrValue);
00657             
00658             Assert::isNull($this->insideQuote);
00659             
00660             $eatingGarbage = false;
00661             
00662             while ($this->char !== null) {
00663                 
00664                 if ($this->char == '>') {
00665                     
00666                     $this->dumpEndTag();
00667                     
00668                     $this->getNextChar();
00669                     
00670                     return self::INITIAL_STATE;
00671                     
00672                 } elseif ($eatingGarbage) {
00673                     
00674                     $this->getNextChar();
00675                     
00676                     continue;
00677                     
00678                 } elseif (self::isSpacerChar($this->char)) {
00679                     // most browsers parse end-tag until next '>' char
00680                     
00681                     $eatingGarbage = true;
00682                     
00683                     $this->getNextChar();
00684                     
00685                     continue;
00686                 }
00687                 
00688                 $this->tagId .= $this->char;
00689                 
00690                 $this->getNextChar();
00691             }
00692             
00693             // ... </[end-of-file], </sometag[eof]
00694             
00695             // NOTE: opera treats </[eof] as cdata, firefox as tag
00696             $this->error("unexpected end of file, end-tag is incomplete");
00697             
00698             $this->dumpEndTag();
00699             
00700             return self::FINAL_STATE;
00701         }
00702         
00703         // INSIDE_TAG_STATE
00704         private function insideTagState()
00705         {
00706             Assert::isNull($this->tagId);
00707             
00708             Assert::isNull($this->attrName);
00709             Assert::isNull($this->attrValue);
00710             
00711             Assert::isNotNull($this->tag);
00712             Assert::isTrue($this->tag instanceof SgmlOpenTag);
00713             
00714             Assert::isNull($this->insideQuote);
00715             
00716             while ($this->char !== null) {
00717                 
00718                 if (self::isSpacerChar($this->char)) {
00719                     $this->getNextChar();
00720                     
00721                 } elseif ($this->char == '>') {
00722                     // <tag ... >
00723                     
00724                     $this->makeTag();
00725                     
00726                     $this->getNextChar();
00727                     
00728                     return self::INITIAL_STATE;
00729                     
00730                 } elseif ($this->char == '=') {
00731                     
00732                     // most browsers' behaviour
00733                     $this->error(
00734                         'unexpected equal sign, attr name considered empty'
00735                     );
00736                     
00737                     $this->getNextChar();
00738                     
00739                     // call?
00740                     return self::ATTR_VALUE_STATE;
00741                     
00742                 } else {
00743                     
00744                     $char = $this->char;
00745                     
00746                     $this->getNextChar();
00747                     
00748                     if ($char == '/' && $this->char == '>') {
00749                         // <tag />, <tag id=value />
00750                         
00751                         $this->tag->setEmpty(true);
00752                         
00753                         $this->makeTag();
00754                         
00755                         $this->getNextChar();
00756                         
00757                         return self::INITIAL_STATE;
00758                     }
00759                     
00760                     $this->attrName = $char;
00761                     
00762                     // call?
00763                     return self::ATTR_NAME_STATE;
00764                 }
00765             }
00766             
00767             // <tag [eof], <tag id=val [eof]
00768             
00769             $this->error('unexpected end of file, incomplete tag stored');
00770             
00771             $this->makeTag();
00772                 
00773             return self::FINAL_STATE;
00774         }
00775         
00779         private function dumpAttribute()
00780         {
00781             if ($this->attrName) {
00782                 
00783                 if (!self::isValidId($this->attrName))
00784                     $this->error("attribute name '{$this->attrName}' is invalid");
00785                 else
00786                     $this->attrName = strtolower($this->attrName);
00787                 
00788             }
00789             
00790             if ($this->attrValue === null || $this->attrValue === '')
00791                 $this->warning("empty value for attr == '{$this->attrName}'");
00792             
00793             $this->tag->setAttribute($this->attrName, $this->attrValue);
00794             
00795             $this->attrName = $this->attrValue = null;
00796             
00797             return $this;
00798         }
00799         
00800         // ATTR_NAME_STATE
00801         private function attrNameState()
00802         {
00803             Assert::isNotNull($this->tag);
00804             Assert::isTrue($this->tag instanceof SgmlOpenTag);
00805             
00806             Assert::isNotNull($this->attrName); // length == 1
00807             Assert::isNull($this->attrValue);
00808             
00809             Assert::isNull($this->insideQuote);
00810             
00811             while ($this->char !== null) {
00812                 
00813                 if (self::isSpacerChar($this->char)) {
00814                     // <tag attr[space]
00815                     
00816                     $this->getNextChar();
00817                     
00818                     // call?
00819                     return self::WAITING_EQUAL_SIGN_STATE;
00820                     
00821                 } elseif ($this->char == '>') {
00822                     // <tag attr>
00823                     
00824                     $this->dumpAttribute();
00825                     
00826                     $this->makeTag();
00827                     
00828                     $this->getNextChar();
00829                     
00830                     return self::INITIAL_STATE;
00831                     
00832                 } elseif ($this->char == '=') {
00833                     // <tag id=
00834                     
00835                     $this->getNextChar();
00836                     
00837                     // empty string, not null, to be sure that value needed
00838                     $this->attrValue = '';
00839                     
00840                     // call?
00841                     return self::ATTR_VALUE_STATE;
00842                     
00843                 } else {
00844                     
00845                     $char = $this->char;
00846                     
00847                     $this->getNextChar();
00848                     
00849                     if ($char == '/' && $this->char == '>') {
00850                         // <option attr=value checked/>
00851                         
00852                         $this->tag->setEmpty(true);
00853                         
00854                         $this->dumpAttribute();
00855                         
00856                         $this->makeTag();
00857                         
00858                         $this->getNextChar();
00859                         
00860                         return self::INITIAL_STATE;
00861                     }
00862                     
00863                     $this->attrName .= $char;
00864                 }
00865             }
00866             
00867             // <tag i[eof]
00868             
00869             // NOTE: opera treats it as cdata, firefox does not
00870             $this->dumpAttribute();
00871             
00872             $this->error('unexpected end of file, incomplete tag stored');
00873             
00874             $this->makeTag();
00875             
00876             return self::FINAL_STATE;
00877         }
00878         
00879         // WAITING_EQUAL_SIGN_STATE
00880         private function waitingEqualSignState()
00881         {
00882             Assert::isNotNull($this->tag);
00883             Assert::isTrue($this->tag instanceof SgmlOpenTag);
00884             Assert::isNull($this->tagId);
00885             Assert::isNotNull($this->attrName);
00886             Assert::isNull($this->attrValue);
00887             
00888             Assert::isNull($this->insideQuote);
00889             
00890             while ($this->char !== null) {
00891                 
00892                 if (self::isSpacerChar($this->char)) {
00893                     // <tag attr[space*]
00894                     
00895                     $this->getNextChar();
00896                     
00897                 } elseif ($this->char == '=') {
00898                     
00899                     $this->getNextChar();
00900                     
00901                     // empty string, not null, to be sure that value needed
00902                     $this->attrValue = '';
00903                     
00904                     // call?
00905                     return self::ATTR_VALUE_STATE;
00906                     
00907                 } else {
00908                     // <tag attr x, <tag attr >
00909                     
00910                     $this->dumpAttribute();
00911                     
00912                     return self::INSIDE_TAG_STATE;
00913                 }
00914             }
00915             
00916             // <tag id[space*][eof]
00917             
00918             $this->dumpAttribute();
00919             
00920             $this->error('unexpected end of file, incomplete tag stored');
00921             
00922             $this->makeTag();
00923             
00924             return self::FINAL_STATE;
00925         }
00926         
00927         // ATTR_VALUE_STATE
00928         private function attrValueState()
00929         {
00930             Assert::isNull($this->tagId);
00931             
00932             Assert::isNotNull($this->tag);
00933             Assert::isTrue($this->tag instanceof SgmlOpenTag);
00934             
00935             while ($this->char !== null) {
00936                 
00937                 if (!$this->insideQuote && self::isSpacerChar($this->char)) {
00938                     $this->getNextChar();
00939                     
00940                     if ($this->attrValue !== null && $this->attrValue !== '') {
00941                         // NOTE: "0" is accepted value
00942                         // <tag id=unquottedValue[space]
00943                         
00944                         $this->dumpAttribute();
00945                         
00946                         return self::INSIDE_TAG_STATE;
00947                     }
00948                     
00949                     // <tag id=[space*]
00950                     continue;
00951                     
00952                 } elseif (!$this->insideQuote && $this->char == '>') {
00953                     // <tag id=value>, <a href=catalog/>
00954                     
00955                     $this->dumpAttribute();
00956                     
00957                     $this->makeTag();
00958                     
00959                     $this->getNextChar();
00960                     
00961                     return self::INITIAL_STATE;
00962                     
00963                 } else {
00964                     if (
00965                         $this->char == '"' || $this->char == "'"
00966                         || $this->char == $this->insideQuote // may be '>'
00967                     ) {
00968                         if (!$this->insideQuote) {
00969                             
00970                             $this->insideQuote = $this->char;
00971                             
00972                             $this->getNextChar();
00973                             
00974                             // a place to rollback if second quote will not be
00975                             // found.
00976                             $this->mark();
00977                             
00978                             continue;
00979                             
00980                         } elseif ($this->char == $this->insideQuote) {
00981                             // attr = "value", attr='value', attr='value>([^']*)
00982                             
00983                             $this->dumpAttribute();
00984                             
00985                             $this->getNextChar();
00986                             
00987                             if ($this->insideQuote == '>') {
00988                                 $this->insideQuote = null;
00989                                 
00990                                 $this->makeTag();
00991                                 
00992                                 return self::INITIAL_STATE;
00993                                 
00994                             } else {
00995                                 $this->insideQuote = null;
00996                                 
00997                                 return self::INSIDE_TAG_STATE;
00998                             }
00999                         }
01000                     }
01001                     
01002                     $this->attrValue .= $this->char;
01003                     
01004                     if ($this->insideQuote && $this->char == '\\')
01005                         $this->attrValue .= $this->getNextChar();
01006                     
01007                     $this->getNextChar();
01008                 }
01009             }
01010             
01011             if ($this->insideQuote) {
01012                 // <tag id="...[eof]
01013                 //
01014                 // NOTE: firefox rolls back to the first > after quote.
01015                 // Opera consideres incomplete tag as cdata.
01016                 // we act as ff does.
01017                 
01018                 $this->reset();
01019                 
01020                 $this->warning(
01021                     "unclosed quoted value for attr == '{$this->attrName}',"
01022                     ." rolling back and searching '>'"
01023                 );
01024                 
01025                 $this->attrValue = null;
01026                 $this->insideQuote = '>';
01027                 
01028                 // call?
01029                 // TODO: possible infinite loop?
01030                 return self::ATTR_VALUE_STATE;
01031             }
01032             
01033             // <tag id=[space*][eof], <tag id=val[eof]
01034             
01035             $this->dumpAttribute();
01036             
01037             $this->error('unexpected end of file, incomplete tag stored');
01038             
01039             $this->makeTag();
01040             
01041             return self::FINAL_STATE;
01042         }
01043         
01044         // INLINE_TAG_STATE:
01045         private function inlineTagState()
01046         {
01047             // <script ...>X<-- we are here
01048             
01049             Assert::isNull($this->buffer);
01050             
01051             Assert::isNull($this->tag);
01052             Assert::isNull($this->tagId);
01053             
01054             $startTag = $this->previousTag->getId();
01055             
01056             if ($this->char === null) {
01057                 $this->error('unexpected eof inside inline tag');
01058                 
01059                 return self::FINAL_STATE;
01060             }
01061             
01062             $this->buffer = null;
01063             
01064             if ($startTag == 'style' || $startTag == 'script') {
01071                 if ($this->skipString('<!--', true))
01072                     $this->buffer = '<!--'.$this->getComment().'-->';
01073             }
01074             
01075             $endTag = '</'.$startTag;
01076             
01077             while ($this->char !== null) {
01078                 $this->buffer .= $this->getContentToSubstring($endTag, true);
01079                 
01080                 if ($this->char === null) {
01081                     // </script not found, or found </script[eof]
01082                     
01083                     break;
01084                     
01085                 } elseif (
01086                     $this->char === '>' || self::isSpacerChar($this->char)
01087                 ) {
01088                     // </script>, </script[space]
01089                     
01090                     $this->dumpBuffer();
01091                     
01092                     $this->tagId = $startTag;
01093                     
01094                     return self::END_TAG_STATE;
01095                 }
01096                 
01097                 // </script[any-other-char]
01098                 
01099                 $this->buffer .= $endTag.$this->char;
01100                 
01101                 $this->getNextChar();
01102             }
01103             
01104             $this->dumpBuffer();
01105             
01106             $this->error(
01107                 "end-tag for inline tag == '{$startTag}' not found"
01108             );
01109             
01110             return self::FINAL_STATE;
01111         }
01112         
01113         // CDATA_STATE
01114         private function cdataState()
01115         {
01116             Assert::isNull($this->tag);
01117             Assert::isNull($this->tagId);
01118             
01119             $content = $this->getContentToSubstring(']]>');
01120             
01121             $this->tag =
01122                 Cdata::create()->
01123                 setData($content)->
01124                 setStrict(true);
01125             
01126             $this->makeTag();
01127             
01128             if (!$this->substringFound) {
01129                 
01130                 $this->error('unexpected end-of-file inside cdata tag');
01131                 
01132                 return self::FINAL_STATE;
01133             }
01134             
01135             return self::INITIAL_STATE;
01136         }
01137         
01138         private function getComment()
01139         {
01140             $this->mark();
01141             
01142             $result = $this->getContentToSubstring('-->');
01143             
01144             if (!$this->substringFound) {
01145                 $this->reset();
01146                 
01147                 $this->error(
01148                     'unexpected end-of-file inside comment tag,'
01149                     ." trying to find '>'"
01150                 );
01151                 
01152                 $result = $this->getContentToSubstring('>');
01153                 
01154                 if (!$this->substringFound)
01155                     $this->error(
01156                         "end-tag '>' not found,"
01157                         .' treating all remaining content as cdata'
01158                     );
01159             }
01160             
01161             return $result;
01162         }
01163         
01164         // COMMENT_STATE
01165         private function commentState()
01166         {
01167             Assert::isNull($this->tag);
01168             Assert::isNull($this->tagId);
01169             
01170             $content = $this->getComment();
01171             
01172             $this->tag =
01173                 SgmlIgnoredTag::comment()->
01174                 setCdata(
01175                     Cdata::create()->setData($content)
01176                 );
01177             
01178             $this->makeTag();
01179             
01180             return self::INITIAL_STATE;
01181         }
01182         
01183         // EXTERNAL_TAG_STATE:
01184         private function externalTagState()
01185         {
01186             Assert::isTrue($this->tag instanceof SgmlIgnoredTag);
01187             
01188             $this->mark();
01189             
01190             $content = $this->getContentToSubstring('?>');
01191             
01192             if (!$this->substringFound) {
01193                 $this->reset();
01194                 
01195                 $this->error(
01196                     'unexpected end-of-file inside external tag,'
01197                     ." trying to find '>'"
01198                 );
01199                 
01200                 $content = $this->getContentToSubstring('>');
01201                 
01202                 if (!$this->substringFound)
01203                     $this->error(
01204                         "end-tag '>' not found,"
01205                         .' treating all remaining content as cdata'
01206                     );
01207             }
01208             
01209             $this->tag->setCdata(Cdata::create()->setData($content));
01210             
01211             $this->makeTag();
01212             
01213             return self::INITIAL_STATE;
01214         }
01215         
01216         // DOCTYPE_TAG_STATE:
01217         private function doctypeTagState()
01218         {
01219             // TODO: use DoctypeTag and parse it correctly as Opera does and
01220             // Firefox does not.
01221             Assert::isTrue($this->tag instanceof SgmlIgnoredTag);
01222             
01223             $content = $this->getContentToSubstring('>');
01224             
01225             if (!$this->substringFound)
01226                 $this->error('unexpected end-of-file inside doctype tag');
01227             
01228             $this->tag->setCdata(Cdata::create()->setData($content));
01229             
01230             $this->makeTag();
01231             
01232             return self::INITIAL_STATE;
01233         }
01234         
01240         private function getContentToSubstring($substring, $ignoreCase = false)
01241         {
01242             $this->substringFound = false;
01243             
01244             $substringLength = strlen($substring);
01245             
01246             $prefixTable = array(1 => 0);
01247             $buffer = $substring."\x00";
01248             $i = 0;
01249             
01250             while ($this->char !== null) {
01251                 
01252                 if ($i < $substringLength)
01253                     $char = $buffer[$i + 1];
01254                 else {
01255                     $char = $this->char;
01256                     $buffer .= $char;
01257                     $this->getNextChar();
01258                 }
01259                 
01260                 $maxLength = $prefixTable[$i + 1];
01261                 
01262                 while (
01263                     self::optionalLowercase($buffer[$maxLength], $ignoreCase)
01264                         !== self::optionalLowercase($char, $ignoreCase)
01265                     && $maxLength > 0
01266                 ) {
01267                     $maxLength = $prefixTable[$maxLength];
01268                 }
01269                 
01270                 ++$i;
01271                 
01272                 $prefixTable[$i + 1] =
01273                     (
01274                         self::optionalLowercase($buffer[$maxLength], $ignoreCase)
01275                             === self::optionalLowercase($char, $ignoreCase)
01276                     )
01277                     ? $maxLength + 1
01278                     : 0;
01279                 
01280                 if (
01281                     $i > $substringLength + 1
01282                     && $prefixTable[$i + 1] == $substringLength
01283                 ) {
01284                     $this->substringFound = true;
01285                     
01286                     break;
01287                 }
01288             }
01289             
01290             if (!$this->substringFound)
01291                 return substr(
01292                     $buffer, $substringLength + 1
01293                 );
01294             else
01295                 return substr(
01296                     $buffer, $substringLength + 1, $i - 2 * $substringLength
01297                 );
01298         }
01299         
01300         private function getTextualPosition()
01301         {
01302             return
01303                 "line {$this->line}, position {$this->linePosition}"
01304                 .(
01305                     $this->tag && $this->tag->getId()
01306                         ? ", in tag '{$this->tag->getId()}'"
01307                         : null
01308                 );
01309         }
01310         
01314         private function warning($message)
01315         {
01316             $this->errors[] =
01317                 "warning at {$this->getTextualPosition()}: $message";
01318             
01319             return $this;
01320         }
01321         
01325         private function error($message)
01326         {
01327             $this->errors[] =
01328                 "error at {$this->getTextualPosition()}: $message";
01329             
01330             return $this;
01331         }
01332     }
01333 ?>

Generated on Sun Dec 9 21:56:24 2007 for onPHP by  doxygen 1.5.4