OqlTokenizer.class.php

Go to the documentation of this file.
00001 <?php
00002 /****************************************************************************
00003  *   Copyright (C) 2008-2009 by Vladlen Y. Koshelev                         *
00004  *                                                                          *
00005  *   This program is free software; you can redistribute it and/or modify   *
00006  *   it under the terms of the GNU Lesser General Public License as         *
00007  *   published by the Free Software Foundation; either version 3 of the     *
00008  *   License, or (at your option) any later version.                        *
00009  *                                                                          *
00010  ****************************************************************************/
00011 
00015     final class OqlTokenizer
00016     {
00017         private $tokens         = array();
00018         private $tokensCount    = 0;
00019         private $token          = null;
00020         private $prevToken      = null;
00021         private $index          = -1;
00022         
00023         private static $masks = array(
00024             OqlToken::NEW_LINE =>
00025                 '\n',
00026             
00027             // "'`-quoted string constant
00028             OqlToken::STRING =>
00029                 '"[^"\\\]*(?:\\\.[^"\\\]*)*"|\'[^\'\\\]*(?:\\\.[^\'\\\]*)*\'|`[^`\\\]*(?:\\\.[^`\\\]*)*`',
00030             
00031             // unsigned numeric constant
00032             OqlToken::NUMBER =>
00033                 '(?:\b[\d]+)?\.?[\d]+(?:[eE][-+]?[\d]+)?\b',
00034             
00035             // boolean constant
00036             OqlToken::BOOLEAN =>
00037                 '\b(?:true|false)\b',
00038             
00039             OqlToken::NULL =>
00040                 '\bnull\b',
00041             
00042             // substitution
00043             OqlToken::SUBSTITUTION =>
00044                 '\$[\d]+',
00045             
00046             // reserved word
00047             OqlToken::KEYWORD =>
00048                 '\b(?:as|distinct|from|where|not|and|or|in|like|ilike|similar\s+to|between|is|group\s+by|order\s+by|asc|desc|having|limit|offset)\b',
00049             
00050             // aggregate function
00051             OqlToken::AGGREGATE_FUNCTION =>
00052                 '\b(?:sum|avg|min|max|count)\b',
00053             
00054             // property, class name
00055             OqlToken::IDENTIFIER =>
00056                 '\b[a-zA-Z_][a-zA-Z\d_]*(?:\.[a-zA-Z_][a-zA-Z\d_]+)*\b',
00057             
00058             // parentheses
00059             OqlToken::PARENTHESES =>
00060                 '[\(\)]',
00061             
00062             // comma
00063             OqlToken::PUNCTUATION =>
00064                 ',',
00065             
00066             // comparison operators
00067             OqlToken::COMPARISON_OPERATOR =>
00068                 '>\=|<\=|<>|>|<|\!\=|\=',
00069             
00070             // arithmetic operators
00071             OqlToken::ARITHMETIC_OPERATOR =>
00072                 '\+|\-|\/|\*'
00073         );
00074         
00075         public function __construct($string)
00076         {
00077             $this->tokenize($string);
00078         }
00079         
00080         public function getList()
00081         {
00082             return $this->tokens;
00083         }
00084         
00085         public function getLine()
00086         {
00087             $token = $this->token;
00088             if (!$token)
00089                 $token = $this->prevToken;
00090             
00091             return $token ? $token->getLine() : null;
00092         }
00093         
00094         public function getPosition()
00095         {
00096             $token = $this->token;
00097             if (!$token)
00098                 $token = $this->prevToken;
00099             
00100             return $token ? $token->getPosition() : null;
00101         }
00102         
00103         public function getIndex()
00104         {
00105             return $this->index;
00106         }
00107         
00111         public function setIndex($index)
00112         {
00113             if ($index > $this->tokensCount - 1) {
00114                 $index = $this->tokensCount - 1;
00115             
00116             } elseif ($index < -1) {
00117                 $index = -1;
00118             }
00119             
00120             $this->index = $index;
00121             $this->token = $this->getByIndex($this->index);
00122             $this->prevToken = $this->getByIndex($this->index - 1);
00123             
00124             return $this;
00125         }
00126         
00130         public function get()
00131         {
00132             return $this->token;
00133         }
00134         
00138         public function next()
00139         {
00140             $this->setIndex($this->index + 1);
00141             
00142             return $this->token;
00143         }
00144         
00148         public function back()
00149         {
00150             $this->setIndex($this->index - 1);
00151             
00152             return $this->token;
00153         }
00154         
00158         public function peek()
00159         {
00160             if ($this->token)
00161                 $this->prevToken = $this->token;
00162             
00163             return $this->token = $this->getByIndex($this->index + 1);
00164         }
00165         
00169         private function getByIndex($index)
00170         {
00171             return isset($this->tokens[$index]) ? $this->tokens[$index] : null;
00172         }
00173         
00177         private function tokenize($string)
00178         {
00179             Assert::isString($string);
00180             
00181             $maxMultibyteDelta = strlen($string) - mb_strlen($string);
00182             $isMultibyte = $maxMultibyteDelta > 0;
00183             
00184             $pattern = '/('.implode(')|(', self::$masks).')/is';
00185             if ($isMultibyte)
00186                 $pattern .= 'u';
00187             
00188             preg_match_all(
00189                 $pattern,
00190                 $string,
00191                 $matches,
00192                 PREG_SET_ORDER | PREG_OFFSET_CAPTURE
00193             );
00194             
00195             $line = 1;
00196             $lineStart = 0;
00197             $multibyteDelta = 0;
00198             
00199             foreach ($matches as $match) {
00200                 $type = count($match) - 1;
00201                 $offset = $match[0][1] - $multibyteDelta;
00202                 
00203                 if ($type == OqlToken::NEW_LINE) {
00204                     $line++;
00205                     $lineStart = $offset + 1;
00206                     continue;
00207                 }
00208                 
00209                 $value = $match[0][0];
00210                 $position = $offset - $lineStart;
00211                 
00212                 $this->tokens[] =
00213                     OqlToken::make(
00214                         $this->importTokenValue($value, $type),
00215                         $value,
00216                         $type,
00217                         $line,
00218                         $position
00219                     );
00220                 
00221                 if (
00222                     $type == OqlToken::KEYWORD
00223                     && ($pos = strpos($value, "\n")) !== false
00224                 ) {
00225                     $line++;
00226                     $lineStart = $offset + $pos + 1;
00227                 }
00228                 
00229                 if ($isMultibyte && $type == OqlToken::STRING) {
00230                     $multibyteDelta += (strlen($value) - mb_strlen($value));
00231                     
00232                     if ($multibyteDelta >= $maxMultibyteDelta)
00233                         $isMultibyte = false;
00234                 }
00235             }
00236             
00237             $this->tokensCount = count($this->tokens);
00238             
00239             return $this;
00240         }
00241         
00242         private static function importTokenValue($value, $type)
00243         {
00244             switch ($type) {
00245                 case OqlToken::STRING:
00246                     $quote = mb_substr($value, 0, 1);
00247                     
00248                     return mb_ereg_replace(
00249                         '\\\\'.$quote,
00250                         $quote,
00251                         mb_substr($value, 1, mb_strlen($value) - 2)
00252                     );
00253                 
00254                 case OqlToken::NUMBER:
00255                     return floatval($value);
00256                 
00257                 case OqlToken::BOOLEAN:
00258                     return strtolower($value) != 'false';
00259                 
00260                 case OqlToken::NULL:
00261                     return 'null';
00262                 
00263                 case OqlToken::AGGREGATE_FUNCTION:
00264                     return strtolower($value);
00265                 
00266                 case OqlToken::SUBSTITUTION:
00267                     return intval(substr($value, 1));
00268                 
00269                 case OqlToken::KEYWORD:
00270                     return strtolower(
00271                         preg_replace('/\s+/', ' ', $value)
00272                     );
00273                 
00274                 case OqlToken::COMPARISON_OPERATOR:
00275                     return $value == '<>' ? BinaryExpression::NOT_EQUALS : $value;
00276             }
00277             
00278             return $value;
00279         }
00280     }
00281 ?>