lexer.cpp
00001 // -*- c-basic-offset: 2 -*- 00002 /* 00003 * This file is part of the KDE libraries 00004 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) 00005 * 00006 * This library is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Library General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2 of the License, or (at your option) any later version. 00010 * 00011 * This library is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Library General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Library General Public License 00017 * along with this library; see the file COPYING.LIB. If not, write to 00018 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 * Boston, MA 02110-1301, USA. 00020 * 00021 */ 00022 00023 #ifdef HAVE_CONFIG_H 00024 #include <config.h> 00025 #endif 00026 00027 #include <ctype.h> 00028 #include <stdlib.h> 00029 #include <stdio.h> 00030 #include <string.h> 00031 #include <assert.h> 00032 00033 #include "value.h" 00034 #include "object.h" 00035 #include "types.h" 00036 #include "interpreter.h" 00037 #include "nodes.h" 00038 #include "lexer.h" 00039 #include "identifier.h" 00040 #include "lookup.h" 00041 #include "internal.h" 00042 #include "dtoa.h" 00043 00044 // we can't specify the namespace in yacc's C output, so do it here 00045 using namespace KJS; 00046 00047 static Lexer *currLexer = 0; 00048 00049 #ifndef KDE_USE_FINAL 00050 #include "grammar.h" 00051 #endif 00052 00053 #include "lexer.lut.h" 00054 00055 extern YYLTYPE yylloc; // global bison variable holding token info 00056 00057 // a bridge for yacc from the C world to C++ 00058 int kjsyylex() 00059 { 00060 return Lexer::curr()->lex(); 00061 } 00062 00063 Lexer::Lexer() 00064 : yylineno(1), 00065 size8(128), size16(128), restrKeyword(false), 00066 convertNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0), 00067 code(0), length(0), 00068 #ifndef KJS_PURE_ECMA 00069 bol(true), 00070 #endif 00071 current(0), next1(0), next2(0), next3(0), 00072 strings(0), numStrings(0), stringsCapacity(0), 00073 identifiers(0), numIdentifiers(0), identifiersCapacity(0) 00074 { 00075 // allocate space for read buffers 00076 buffer8 = new char[size8]; 00077 buffer16 = new UChar[size16]; 00078 currLexer = this; 00079 } 00080 00081 Lexer::~Lexer() 00082 { 00083 delete [] buffer8; 00084 delete [] buffer16; 00085 } 00086 00087 Lexer *Lexer::curr() 00088 { 00089 if (!currLexer) { 00090 // create singleton instance 00091 currLexer = new Lexer(); 00092 } 00093 return currLexer; 00094 } 00095 00096 #ifdef KJS_DEBUG_MEM 00097 void Lexer::globalClear() 00098 { 00099 delete currLexer; 00100 currLexer = 0L; 00101 } 00102 #endif 00103 00104 void Lexer::setCode(const UChar *c, unsigned int len) 00105 { 00106 yylineno = 1; 00107 restrKeyword = false; 00108 delimited = false; 00109 convertNextIdentifier = false; 00110 stackToken = -1; 00111 lastToken = -1; 00112 foundBad = false; 00113 pos = 0; 00114 code = c; 00115 length = len; 00116 skipLF = false; 00117 skipCR = false; 00118 #ifndef KJS_PURE_ECMA 00119 bol = true; 00120 #endif 00121 00122 // read first characters 00123 current = (length > 0) ? code[0].uc : -1; 00124 next1 = (length > 1) ? code[1].uc : -1; 00125 next2 = (length > 2) ? code[2].uc : -1; 00126 next3 = (length > 3) ? code[3].uc : -1; 00127 } 00128 00129 void Lexer::shift(unsigned int p) 00130 { 00131 while (p--) { 00132 pos++; 00133 current = next1; 00134 next1 = next2; 00135 next2 = next3; 00136 next3 = (pos + 3 < length) ? code[pos+3].uc : -1; 00137 } 00138 } 00139 00140 // called on each new line 00141 void Lexer::nextLine() 00142 { 00143 yylineno++; 00144 #ifndef KJS_PURE_ECMA 00145 bol = true; 00146 #endif 00147 } 00148 00149 void Lexer::setDone(State s) 00150 { 00151 state = s; 00152 done = true; 00153 } 00154 00155 int Lexer::lex() 00156 { 00157 int token = 0; 00158 state = Start; 00159 unsigned short stringType = 0; // either single or double quotes 00160 pos8 = pos16 = 0; 00161 done = false; 00162 terminator = false; 00163 skipLF = false; 00164 skipCR = false; 00165 00166 // did we push a token on the stack previously ? 00167 // (after an automatic semicolon insertion) 00168 if (stackToken >= 0) { 00169 setDone(Other); 00170 token = stackToken; 00171 stackToken = 0; 00172 } 00173 00174 while (!done) { 00175 if (skipLF && current != '\n') // found \r but not \n afterwards 00176 skipLF = false; 00177 if (skipCR && current != '\r') // found \n but not \r afterwards 00178 skipCR = false; 00179 if (skipLF || skipCR) // found \r\n or \n\r -> eat the second one 00180 { 00181 skipLF = false; 00182 skipCR = false; 00183 shift(1); 00184 } 00185 00186 bool cr = (current == '\r'); 00187 bool lf = (current == '\n'); 00188 if (cr) 00189 skipLF = true; 00190 else if (lf) 00191 skipCR = true; 00192 bool isLineTerminator = cr || lf; 00193 00194 switch (state) { 00195 case Start: 00196 if (isWhiteSpace(current)) { 00197 // do nothing 00198 } else if (current == '/' && next1 == '/') { 00199 shift(1); 00200 state = InSingleLineComment; 00201 } else if (current == '/' && next1 == '*') { 00202 shift(1); 00203 state = InMultiLineComment; 00204 } else if (current == -1) { 00205 if (!terminator && !delimited) { 00206 // automatic semicolon insertion if program incomplete 00207 token = ';'; 00208 stackToken = 0; 00209 setDone(Other); 00210 } else 00211 setDone(Eof); 00212 } else if (isLineTerminator) { 00213 nextLine(); 00214 terminator = true; 00215 if (restrKeyword) { 00216 token = ';'; 00217 setDone(Other); 00218 } 00219 } else if (current == '"' || current == '\'') { 00220 state = InString; 00221 stringType = current; 00222 } else if (isIdentLetter(current)) { 00223 record16(current); 00224 state = InIdentifierOrKeyword; 00225 } else if (current == '\\') { 00226 state = InIdentifierUnicodeEscapeStart; 00227 } else if (current == '0') { 00228 record8(current); 00229 state = InNum0; 00230 } else if (isDecimalDigit(current)) { 00231 record8(current); 00232 state = InNum; 00233 } else if (current == '.' && isDecimalDigit(next1)) { 00234 record8(current); 00235 state = InDecimal; 00236 #ifndef KJS_PURE_ECMA 00237 // <!-- marks the beginning of a line comment (for www usage) 00238 } else if (current == '<' && next1 == '!' && 00239 next2 == '-' && next3 == '-') { 00240 shift(3); 00241 state = InSingleLineComment; 00242 // same for --> 00243 } else if (bol && current == '-' && next1 == '-' && next2 == '>') { 00244 shift(2); 00245 state = InSingleLineComment; 00246 #endif 00247 } else { 00248 token = matchPunctuator(current, next1, next2, next3); 00249 if (token != -1) { 00250 setDone(Other); 00251 } else { 00252 // cerr << "encountered unknown character" << endl; 00253 setDone(Bad); 00254 } 00255 } 00256 break; 00257 case InString: 00258 if (current == stringType) { 00259 shift(1); 00260 setDone(String); 00261 } else if (current == -1 || isLineTerminator) { 00262 setDone(Bad); 00263 } else if (current == '\\') { 00264 state = InEscapeSequence; 00265 } else { 00266 record16(current); 00267 } 00268 break; 00269 // Escape Sequences inside of strings 00270 case InEscapeSequence: 00271 if (isOctalDigit(current)) { 00272 if (current >= '0' && current <= '3' && 00273 isOctalDigit(next1) && isOctalDigit(next2)) { 00274 record16(convertOctal(current, next1, next2)); 00275 shift(2); 00276 state = InString; 00277 } else if (isOctalDigit(current) && isOctalDigit(next1)) { 00278 record16(convertOctal('0', current, next1)); 00279 shift(1); 00280 state = InString; 00281 } else if (isOctalDigit(current)) { 00282 record16(convertOctal('0', '0', current)); 00283 state = InString; 00284 } else { 00285 setDone(Bad); 00286 } 00287 } else if (current == 'x') 00288 state = InHexEscape; 00289 else if (current == 'u') 00290 state = InUnicodeEscape; 00291 else { 00292 if (isLineTerminator) 00293 nextLine(); 00294 record16(singleEscape(current)); 00295 state = InString; 00296 } 00297 break; 00298 case InHexEscape: 00299 if (isHexDigit(current) && isHexDigit(next1)) { 00300 state = InString; 00301 record16(convertHex(current, next1)); 00302 shift(1); 00303 } else if (current == stringType) { 00304 record16('x'); 00305 shift(1); 00306 setDone(String); 00307 } else { 00308 record16('x'); 00309 record16(current); 00310 state = InString; 00311 } 00312 break; 00313 case InUnicodeEscape: 00314 if (isHexDigit(current) && isHexDigit(next1) && 00315 isHexDigit(next2) && isHexDigit(next3)) { 00316 record16(convertUnicode(current, next1, next2, next3)); 00317 shift(3); 00318 state = InString; 00319 } else if (current == stringType) { 00320 record16('u'); 00321 shift(1); 00322 setDone(String); 00323 } else { 00324 setDone(Bad); 00325 } 00326 break; 00327 case InSingleLineComment: 00328 if (isLineTerminator) { 00329 nextLine(); 00330 terminator = true; 00331 if (restrKeyword) { 00332 token = ';'; 00333 setDone(Other); 00334 } else 00335 state = Start; 00336 } else if (current == -1) { 00337 setDone(Eof); 00338 } 00339 break; 00340 case InMultiLineComment: 00341 if (current == -1) { 00342 setDone(Bad); 00343 } else if (isLineTerminator) { 00344 nextLine(); 00345 } else if (current == '*' && next1 == '/') { 00346 state = Start; 00347 shift(1); 00348 } 00349 break; 00350 case InIdentifierOrKeyword: 00351 case InIdentifier: 00352 if (isIdentLetter(current) || isDecimalDigit(current)) 00353 record16(current); 00354 else if (current == '\\') 00355 state = InIdentifierUnicodeEscapeStart; 00356 else 00357 setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier); 00358 break; 00359 case InNum0: 00360 if (current == 'x' || current == 'X') { 00361 record8(current); 00362 state = InHex; 00363 } else if (current == '.') { 00364 record8(current); 00365 state = InDecimal; 00366 } else if (current == 'e' || current == 'E') { 00367 record8(current); 00368 state = InExponentIndicator; 00369 } else if (isOctalDigit(current)) { 00370 record8(current); 00371 state = InOctal; 00372 } else if (isDecimalDigit(current)) { 00373 record8(current); 00374 state = InDecimal; 00375 } else { 00376 setDone(Number); 00377 } 00378 break; 00379 case InHex: 00380 if (isHexDigit(current)) { 00381 record8(current); 00382 } else { 00383 setDone(Hex); 00384 } 00385 break; 00386 case InOctal: 00387 if (isOctalDigit(current)) { 00388 record8(current); 00389 } 00390 else if (isDecimalDigit(current)) { 00391 record8(current); 00392 state = InDecimal; 00393 } else 00394 setDone(Octal); 00395 break; 00396 case InNum: 00397 if (isDecimalDigit(current)) { 00398 record8(current); 00399 } else if (current == '.') { 00400 record8(current); 00401 state = InDecimal; 00402 } else if (current == 'e' || current == 'E') { 00403 record8(current); 00404 state = InExponentIndicator; 00405 } else 00406 setDone(Number); 00407 break; 00408 case InDecimal: 00409 if (isDecimalDigit(current)) { 00410 record8(current); 00411 } else if (current == 'e' || current == 'E') { 00412 record8(current); 00413 state = InExponentIndicator; 00414 } else 00415 setDone(Number); 00416 break; 00417 case InExponentIndicator: 00418 if (current == '+' || current == '-') { 00419 record8(current); 00420 } else if (isDecimalDigit(current)) { 00421 record8(current); 00422 state = InExponent; 00423 } else 00424 setDone(Bad); 00425 break; 00426 case InExponent: 00427 if (isDecimalDigit(current)) { 00428 record8(current); 00429 } else 00430 setDone(Number); 00431 break; 00432 case InIdentifierUnicodeEscapeStart: 00433 if (current == 'u') 00434 state = InIdentifierUnicodeEscape; 00435 else 00436 setDone(Bad); 00437 break; 00438 case InIdentifierUnicodeEscape: 00439 if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) { 00440 record16(convertUnicode(current, next1, next2, next3)); 00441 shift(3); 00442 state = InIdentifier; 00443 } else { 00444 setDone(Bad); 00445 } 00446 break; 00447 default: 00448 assert(!"Unhandled state in switch statement"); 00449 } 00450 00451 // move on to the next character 00452 if (!done) 00453 shift(1); 00454 #ifndef KJS_PURE_ECMA 00455 if (state != Start && state != InSingleLineComment) 00456 bol = false; 00457 #endif 00458 } 00459 00460 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad 00461 if ((state == Number || state == Octal || state == Hex) 00462 && isIdentLetter(current)) 00463 state = Bad; 00464 00465 // terminate string 00466 buffer8[pos8] = '\0'; 00467 00468 #ifdef KJS_DEBUG_LEX 00469 fprintf(stderr, "line: %d ", lineNo()); 00470 fprintf(stderr, "yytext (%x): ", buffer8[0]); 00471 fprintf(stderr, "%s ", buffer8); 00472 #endif 00473 00474 long double dval = 0; 00475 if (state == Number) { 00476 dval = kjs_strtod(buffer8, 0L); 00477 } else if (state == Hex) { // scan hex numbers 00478 dval = 0; 00479 if (buffer8[0] == '0' && (buffer8[1] == 'x' || buffer8[1] == 'X')) { 00480 for (const char *p = buffer8+2; *p; p++) { 00481 if (!isHexDigit(*p)) { 00482 dval = 0; 00483 break; 00484 } 00485 dval = dval * 16 + convertHex(*p); 00486 } 00487 } 00488 state = Number; 00489 } else if (state == Octal) { // scan octal number 00490 dval = 0; 00491 if (buffer8[0] == '0') { 00492 for (const char *p = buffer8+1; *p; p++) { 00493 if (*p < '0' || *p > '7') { 00494 dval = 0; 00495 break; 00496 } 00497 dval = dval * 8 + *p - '0'; 00498 } 00499 } 00500 state = Number; 00501 } 00502 00503 #ifdef KJS_DEBUG_LEX 00504 switch (state) { 00505 case Eof: 00506 printf("(EOF)\n"); 00507 break; 00508 case Other: 00509 printf("(Other)\n"); 00510 break; 00511 case Identifier: 00512 case IdentifierOrKeyword: 00513 printf("(Identifier)/(Keyword)\n"); 00514 break; 00515 case String: 00516 printf("(String)\n"); 00517 break; 00518 case Number: 00519 printf("(Number)\n"); 00520 break; 00521 default: 00522 printf("(unknown)"); 00523 } 00524 #endif 00525 00526 if (state != Identifier && state != IdentifierOrKeyword && 00527 convertNextIdentifier) 00528 convertNextIdentifier = false; 00529 00530 restrKeyword = false; 00531 delimited = false; 00532 kjsyylloc.first_line = yylineno; // ??? 00533 kjsyylloc.last_line = yylineno; 00534 00535 switch (state) { 00536 case Eof: 00537 token = 0; 00538 break; 00539 case Other: 00540 if(token == '}' || token == ';') { 00541 delimited = true; 00542 } 00543 break; 00544 case IdentifierOrKeyword: 00545 if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) { 00546 case Identifier: 00547 // Lookup for keyword failed, means this is an identifier 00548 // Apply anonymous-function hack below (convert the identifier) 00549 if (convertNextIdentifier) { 00550 convertNextIdentifier = false; 00551 #ifdef KJS_VERBOSE 00552 UString debugstr(buffer16, pos16); fprintf(stderr,"Anonymous function hack: eating identifier %s\n",debugstr.ascii()); 00553 #endif 00554 token = FUNCEXPRIDENT; 00555 } else { 00556 token = IDENT; 00557 } 00558 /* TODO: close leak on parse error. same holds true for String */ 00559 kjsyylval.ident = makeIdentifier(buffer16, pos16); 00560 break; 00561 } 00562 00563 convertNextIdentifier = false; 00564 // Hack for "f = function somename() { ... }", too hard to get into the grammar 00565 // Same for building an array with function pointers ( 'name', func1, 'name2', func2 ) 00566 // There are lots of other uses, we really have to get this into the grammar 00567 if ( token == FUNCTION && 00568 ( lastToken == '=' || lastToken == ',' || lastToken == '(' || 00569 lastToken == ':' || lastToken == RETURN ) ) 00570 convertNextIdentifier = true; 00571 00572 if (token == CONTINUE || token == BREAK || 00573 token == RETURN || token == THROW) 00574 restrKeyword = true; 00575 break; 00576 case String: 00577 kjsyylval.ustr = makeUString(buffer16, pos16); 00578 token = STRING; 00579 break; 00580 case Number: 00581 kjsyylval.dval = dval; 00582 token = NUMBER; 00583 break; 00584 case Bad: 00585 foundBad = true; 00586 return -1; 00587 default: 00588 assert(!"unhandled numeration value in switch"); 00589 return -1; 00590 } 00591 lastToken = token; 00592 return token; 00593 } 00594 00595 bool Lexer::isWhiteSpace(unsigned short c) 00596 { 00597 return (c == ' ' || c == '\t' || 00598 c == 0x0b || c == 0x0c || c == 0xa0); 00599 } 00600 00601 bool Lexer::isIdentLetter(unsigned short c) 00602 { 00603 // Allow any character in the Unicode categories 00604 // Uppercase letter (Lu), Lowercase letter (Ll), 00605 // Titlecase letter (Lt)", Modifier letter (Lm), 00606 // Other letter (Lo), or Letter number (Nl). 00607 // Also see: http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */ 00608 return (c >= 'a' && c <= 'z' || 00609 c >= 'A' && c <= 'Z' || 00610 // A with grave - O with diaeresis 00611 c >= 0x00c0 && c <= 0x00d6 || 00612 // O with stroke - o with diaeresis 00613 c >= 0x00d8 && c <= 0x00f6 || 00614 // o with stroke - turned h with fishook and tail 00615 c >= 0x00f8 && c <= 0x02af || 00616 // Greek etc. TODO: not precise 00617 c >= 0x0388 && c <= 0x1ffc || 00618 c == '$' || c == '_'); 00619 /* TODO: use complete category table */ 00620 } 00621 00622 bool Lexer::isDecimalDigit(unsigned short c) 00623 { 00624 return (c >= '0' && c <= '9'); 00625 } 00626 00627 bool Lexer::isHexDigit(unsigned short c) 00628 { 00629 return (c >= '0' && c <= '9' || 00630 c >= 'a' && c <= 'f' || 00631 c >= 'A' && c <= 'F'); 00632 } 00633 00634 bool Lexer::isOctalDigit(unsigned short c) 00635 { 00636 return (c >= '0' && c <= '7'); 00637 } 00638 00639 int Lexer::matchPunctuator(unsigned short c1, unsigned short c2, 00640 unsigned short c3, unsigned short c4) 00641 { 00642 if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') { 00643 shift(4); 00644 return URSHIFTEQUAL; 00645 } else if (c1 == '=' && c2 == '=' && c3 == '=') { 00646 shift(3); 00647 return STREQ; 00648 } else if (c1 == '!' && c2 == '=' && c3 == '=') { 00649 shift(3); 00650 return STRNEQ; 00651 } else if (c1 == '>' && c2 == '>' && c3 == '>') { 00652 shift(3); 00653 return URSHIFT; 00654 } else if (c1 == '<' && c2 == '<' && c3 == '=') { 00655 shift(3); 00656 return LSHIFTEQUAL; 00657 } else if (c1 == '>' && c2 == '>' && c3 == '=') { 00658 shift(3); 00659 return RSHIFTEQUAL; 00660 } else if (c1 == '<' && c2 == '=') { 00661 shift(2); 00662 return LE; 00663 } else if (c1 == '>' && c2 == '=') { 00664 shift(2); 00665 return GE; 00666 } else if (c1 == '!' && c2 == '=') { 00667 shift(2); 00668 return NE; 00669 } else if (c1 == '+' && c2 == '+') { 00670 shift(2); 00671 if (terminator) 00672 return AUTOPLUSPLUS; 00673 else 00674 return PLUSPLUS; 00675 } else if (c1 == '-' && c2 == '-') { 00676 shift(2); 00677 if (terminator) 00678 return AUTOMINUSMINUS; 00679 else 00680 return MINUSMINUS; 00681 } else if (c1 == '=' && c2 == '=') { 00682 shift(2); 00683 return EQEQ; 00684 } else if (c1 == '+' && c2 == '=') { 00685 shift(2); 00686 return PLUSEQUAL; 00687 } else if (c1 == '-' && c2 == '=') { 00688 shift(2); 00689 return MINUSEQUAL; 00690 } else if (c1 == '*' && c2 == '=') { 00691 shift(2); 00692 return MULTEQUAL; 00693 } else if (c1 == '/' && c2 == '=') { 00694 shift(2); 00695 return DIVEQUAL; 00696 } else if (c1 == '&' && c2 == '=') { 00697 shift(2); 00698 return ANDEQUAL; 00699 } else if (c1 == '^' && c2 == '=') { 00700 shift(2); 00701 return XOREQUAL; 00702 } else if (c1 == '%' && c2 == '=') { 00703 shift(2); 00704 return MODEQUAL; 00705 } else if (c1 == '|' && c2 == '=') { 00706 shift(2); 00707 return OREQUAL; 00708 } else if (c1 == '<' && c2 == '<') { 00709 shift(2); 00710 return LSHIFT; 00711 } else if (c1 == '>' && c2 == '>') { 00712 shift(2); 00713 return RSHIFT; 00714 } else if (c1 == '&' && c2 == '&') { 00715 shift(2); 00716 return AND; 00717 } else if (c1 == '|' && c2 == '|') { 00718 shift(2); 00719 return OR; 00720 } 00721 00722 switch(c1) { 00723 case '=': 00724 case '>': 00725 case '<': 00726 case ',': 00727 case '!': 00728 case '~': 00729 case '?': 00730 case ':': 00731 case '.': 00732 case '+': 00733 case '-': 00734 case '*': 00735 case '/': 00736 case '&': 00737 case '|': 00738 case '^': 00739 case '%': 00740 case '(': 00741 case ')': 00742 case '{': 00743 case '}': 00744 case '[': 00745 case ']': 00746 case ';': 00747 shift(1); 00748 return static_cast<int>(c1); 00749 default: 00750 return -1; 00751 } 00752 } 00753 00754 unsigned short Lexer::singleEscape(unsigned short c) const 00755 { 00756 switch(c) { 00757 case 'b': 00758 return 0x08; 00759 case 't': 00760 return 0x09; 00761 case 'n': 00762 return 0x0A; 00763 case 'v': 00764 return 0x0B; 00765 case 'f': 00766 return 0x0C; 00767 case 'r': 00768 return 0x0D; 00769 case '"': 00770 return 0x22; 00771 case '\'': 00772 return 0x27; 00773 case '\\': 00774 return 0x5C; 00775 default: 00776 return c; 00777 } 00778 } 00779 00780 unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2, 00781 unsigned short c3) const 00782 { 00783 return ((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0'); 00784 } 00785 00786 unsigned char Lexer::convertHex(unsigned short c) 00787 { 00788 if (c >= '0' && c <= '9') 00789 return (c - '0'); 00790 else if (c >= 'a' && c <= 'f') 00791 return (c - 'a' + 10); 00792 else 00793 return (c - 'A' + 10); 00794 } 00795 00796 unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2) 00797 { 00798 return ((convertHex(c1) << 4) + convertHex(c2)); 00799 } 00800 00801 UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2, 00802 unsigned short c3, unsigned short c4) 00803 { 00804 return UChar((convertHex(c1) << 4) + convertHex(c2), 00805 (convertHex(c3) << 4) + convertHex(c4)); 00806 } 00807 00808 void Lexer::record8(unsigned short c) 00809 { 00810 assert(c <= 0xff); 00811 00812 // enlarge buffer if full 00813 if (pos8 >= size8 - 1) { 00814 char *tmp = new char[2 * size8]; 00815 memcpy(tmp, buffer8, size8 * sizeof(char)); 00816 delete [] buffer8; 00817 buffer8 = tmp; 00818 size8 *= 2; 00819 } 00820 00821 buffer8[pos8++] = (char) c; 00822 } 00823 00824 void Lexer::record16(int c) 00825 { 00826 assert(c >= 0); 00827 //assert(c <= USHRT_MAX); 00828 record16(UChar(static_cast<unsigned short>(c))); 00829 } 00830 00831 void Lexer::record16(UChar c) 00832 { 00833 // enlarge buffer if full 00834 if (pos16 >= size16 - 1) { 00835 UChar *tmp = new UChar[2 * size16]; 00836 memcpy(tmp, buffer16, size16 * sizeof(UChar)); 00837 delete [] buffer16; 00838 buffer16 = tmp; 00839 size16 *= 2; 00840 } 00841 00842 buffer16[pos16++] = c; 00843 } 00844 00845 bool Lexer::scanRegExp() 00846 { 00847 pos16 = 0; 00848 bool lastWasEscape = false; 00849 bool inBrackets = false; 00850 00851 while (1) { 00852 if (current == '\r' || current == '\n' || current == -1) 00853 return false; 00854 else if (current != '/' || lastWasEscape == true || inBrackets == true) 00855 { 00856 // keep track of '[' and ']' 00857 if ( !lastWasEscape ) { 00858 if ( current == '[' && !inBrackets ) 00859 inBrackets = true; 00860 if ( current == ']' && inBrackets ) 00861 inBrackets = false; 00862 } 00863 record16(current); 00864 lastWasEscape = 00865 !lastWasEscape && (current == '\\'); 00866 } 00867 else { // end of regexp 00868 pattern = UString(buffer16, pos16); 00869 pos16 = 0; 00870 shift(1); 00871 break; 00872 } 00873 shift(1); 00874 } 00875 00876 while (isIdentLetter(current)) { 00877 record16(current); 00878 shift(1); 00879 } 00880 flags = UString(buffer16, pos16); 00881 00882 return true; 00883 } 00884 00885 00886 void Lexer::doneParsing() 00887 { 00888 for (unsigned i = 0; i < numIdentifiers; i++) { 00889 delete identifiers[i]; 00890 } 00891 free(identifiers); 00892 identifiers = 0; 00893 numIdentifiers = 0; 00894 identifiersCapacity = 0; 00895 00896 for (unsigned i = 0; i < numStrings; i++) { 00897 delete strings[i]; 00898 } 00899 free(strings); 00900 strings = 0; 00901 numStrings = 0; 00902 stringsCapacity = 0; 00903 } 00904 00905 const int initialCapacity = 64; 00906 const int growthFactor = 2; 00907 00908 Identifier *Lexer::makeIdentifier(UChar *buffer, unsigned int pos) 00909 { 00910 if (numIdentifiers == identifiersCapacity) { 00911 identifiersCapacity = (identifiersCapacity == 0) ? initialCapacity : identifiersCapacity *growthFactor; 00912 identifiers = (KJS::Identifier **)realloc(identifiers, sizeof(KJS::Identifier *) * identifiersCapacity); 00913 } 00914 00915 KJS::Identifier *identifier = new KJS::Identifier(buffer, pos); 00916 identifiers[numIdentifiers++] = identifier; 00917 return identifier; 00918 } 00919 00920 UString *Lexer::makeUString(UChar *buffer, unsigned int pos) 00921 { 00922 if (numStrings == stringsCapacity) { 00923 stringsCapacity = (stringsCapacity == 0) ? initialCapacity : stringsCapacity *growthFactor; 00924 strings = (UString **)realloc(strings, sizeof(UString *) * stringsCapacity); 00925 } 00926 00927 UString *string = new UString(buffer, pos); 00928 strings[numStrings++] = string; 00929 return string; 00930 }