SpiderScript - Moar fixes, mostly speedups (caching values and lookups)
[tpg/acess2.git] / Usermode / Libraries / libspiderscript.so_src / lex.c
1 /*
2  * SpiderScript
3  * - Script Lexer
4  */
5 #include "tokens.h"
6 #include <stdlib.h>
7 #include <stdio.h>
8 #include <string.h>
9
// When non-zero, the scope character ('.') is lexed as a symbol of its own;
// when zero it is treated as just another identifier character.
#define USE_SCOPE_CHAR	0

// Non-zero enables the token trace printf()s in this file
#define DEBUG	0

// Element count of a true array (do not use on a pointer)
#define ARRAY_SIZE(x)	(sizeof(x) / sizeof(*(x)))
17
18 // === PROTOTYPES ===
19  int    is_ident(char ch);
20  int    isdigit(int ch);
21  int    isspace(int ch);
22  int    GetToken(tParser *File);
23
24 // === CONSTANTS ===
25 const struct {
26         const  int      Value;
27         const char      *Name;
28 } csaReservedWords[] = {
29         {TOK_RWD_FUNCTION, "function"},
30         
31         {TOK_RWD_RETURN, "return"},
32         {TOK_RWD_NEW, "new"},
33         
34         {TOK_RWD_IF, "if"},
35         {TOK_RWD_ELSE, "else"},
36         {TOK_RWD_DO, "do"},
37         {TOK_RWD_WHILE, "while"},
38         {TOK_RWD_FOR, "for"},
39         
40         {TOK_RWD_VOID, "void"},
41         {TOK_RWD_OBJECT, "Object"},
42         {TOK_RWD_OPAQUE, "Opaque"},
43         {TOK_RWD_INTEGER, "Integer"},
44         {TOK_RWD_REAL, "Real"},
45         {TOK_RWD_STRING, "String"}
46 };
47
48 // === CODE ===
49 /**
50  * \brief Read a token from a buffer
51  * \param File  Parser state
52  */
53 int GetToken(tParser *File)
54 {
55          int    ret;
56         
57         if( File->NextToken != -1 ) {
58                 // Save Last
59                 File->LastToken = File->Token;
60                 File->LastTokenStr = File->TokenStr;
61                 File->LastTokenLen = File->TokenLen;
62                 File->LastLine = File->CurLine;
63                 // Restore Next
64                 File->Token = File->NextToken;
65                 File->TokenStr = File->NextTokenStr;
66                 File->TokenLen = File->NextTokenLen;
67                 File->CurLine = File->NextLine;
68                 // Set State
69                 File->CurPos = File->TokenStr + File->TokenLen;
70                 File->NextToken = -1;
71                 {
72                         char    buf[ File->TokenLen + 1];
73                         memcpy(buf, File->TokenStr, File->TokenLen);
74                         buf[File->TokenLen] = 0;
75                         #if DEBUG
76                         printf(" GetToken: FAST Return %i (%i long) (%s)\n", File->Token, File->TokenLen, buf);
77                         #endif
78                 }
79                 return File->Token;
80         }
81         
82         //printf("  GetToken: File=%p, File->CurPos = %p\n", File, File->CurPos);
83         
84         // Clear whitespace (including comments)
85         for( ;; )
86         {
87                 // Whitespace
88                 while( isspace( *File->CurPos ) )
89                 {
90                         //printf("whitespace 0x%x, line = %i\n", *File->CurPos, File->CurLine);
91                         if( *File->CurPos == '\n' )
92                                 File->CurLine ++;
93                         File->CurPos ++;
94                 }
95                 
96                 // # Line Comments
97                 if( *File->CurPos == '#' ) {
98                         while( *File->CurPos && *File->CurPos != '\n' )
99                                 File->CurPos ++;
100                         continue ;
101                 }
102                 
103                 // C-Style Line Comments
104                 if( *File->CurPos == '/' && File->CurPos[1] == '/' ) {
105                         while( *File->CurPos && *File->CurPos != '\n' )
106                                 File->CurPos ++;
107                         continue ;
108                 }
109                 
110                 // C-Style Block Comments
111                 if( *File->CurPos == '/' && File->CurPos[1] == '*' ) {
112                         File->CurPos += 2;      // Eat the '/*'
113                         while( *File->CurPos && !(File->CurPos[-1] == '*' && *File->CurPos == '/') )
114                         {
115                                 if( *File->CurPos == '\n' )     File->CurLine ++;
116                                 File->CurPos ++;
117                         }
118                         File->CurPos ++;        // Eat the '/'
119                         continue ;
120                 }
121                 
122                 // No more "whitespace"
123                 break;
124         }
125         
126         // Save previous tokens (speeds up PutBack and LookAhead)
127         File->LastToken = File->Token;
128         File->LastTokenStr = File->TokenStr;
129         File->LastTokenLen = File->TokenLen;
130         File->LastLine = File->CurLine;
131         
132         // Read token
133         File->TokenStr = File->CurPos;
134         switch( *File->CurPos++ )
135         {
136         case '\0':      ret = TOK_EOF;  break;
137         
138         // Operations
139         case '^':
140                 if( *File->CurPos == '^' ) {
141                         File->CurPos ++;
142                         ret = TOK_LOGICXOR;
143                         break;
144                 }
145                 ret = TOK_XOR;
146                 break;
147         
148         case '|':
149                 if( *File->CurPos == '|' ) {
150                         File->CurPos ++;
151                         ret = TOK_LOGICOR;
152                         break;
153                 }
154                 ret = TOK_OR;
155                 break;
156         
157         case '&':
158                 if( *File->CurPos == '&' ) {
159                         File->CurPos ++;
160                         ret = TOK_LOGICAND;
161                         break;
162                 }
163                 ret = TOK_AND;
164                 break;
165         
166         case '/':
167                 if( *File->CurPos == '=' ) {
168                         File->CurPos ++;
169                         ret = TOK_ASSIGN_DIV;
170                         break;
171                 }
172                 ret = TOK_DIV;
173                 break;
174         case '*':
175                 if( *File->CurPos == '=' ) {
176                         File->CurPos ++;
177                         ret = TOK_ASSIGN_MUL;
178                         break;
179                 }
180                 ret = TOK_MUL;
181                 break;
182         case '+':
183                 if( *File->CurPos == '+' ) {
184                         File->CurPos ++;
185                         ret = TOK_INCREMENT;
186                         break;
187                 }
188                 if( *File->CurPos == '=' ) {
189                         File->CurPos ++;
190                         ret = TOK_ASSIGN_PLUS;
191                         break;
192                 }
193                 ret = TOK_PLUS;
194                 break;
195         case '-':
196                 if( *File->CurPos == '-' ) {
197                         File->CurPos ++;
198                         ret = TOK_DECREMENT;
199                         break;
200                 }
201                 if( *File->CurPos == '=' ) {
202                         File->CurPos ++;
203                         ret = TOK_ASSIGN_MINUS;
204                         break;
205                 }
206                 if( *File->CurPos == '>' ) {
207                         File->CurPos ++;
208                         ret = TOK_ELEMENT;
209                         break;
210                 }
211                 ret = TOK_MINUS;
212                 break;
213         
214         // Strings
215         case '"':
216                 while( *File->CurPos && !(*File->CurPos == '"' && *File->CurPos != '\\') )
217                         File->CurPos ++;
218                 if( *File->CurPos )
219                 {
220                         File->CurPos ++;
221                         ret = TOK_STR;
222                 }
223                 else
224                         ret = TOK_EOF;
225                 break;
226         
227         // Brackets
228         case '(':       ret = TOK_PAREN_OPEN;   break;
229         case ')':       ret = TOK_PAREN_CLOSE;  break;
230         case '{':       ret = TOK_BRACE_OPEN;   break;
231         case '}':       ret = TOK_BRACE_CLOSE;  break;
232         case '[':       ret = TOK_SQUARE_OPEN;  break;
233         case ']':       ret = TOK_SQUARE_CLOSE; break;
234         
235         // Core symbols
236         case ';':       ret = TOK_SEMICOLON;    break;
237         case ',':       ret = TOK_COMMA;        break;
238         #if USE_SCOPE_CHAR
239         case '.':       ret = TOK_SCOPE;        break;
240         #endif
241         
242         // Equals
243         case '=':
244                 // Comparison Equals
245                 if( *File->CurPos == '=' ) {
246                         File->CurPos ++;
247                         ret = TOK_EQUALS;
248                         break;
249                 }
250                 // Assignment Equals
251                 ret = TOK_ASSIGN;
252                 break;
253         
254         // Less-Than
255         case '<':
256                 // Less-Than or Equal
257                 if( *File->CurPos == '=' ) {
258                         File->CurPos ++;
259                         ret = TOK_LTE;
260                         break;
261                 }
262                 ret = TOK_LT;
263                 break;
264         
265         // Greater-Than
266         case '>':
267                 // Greater-Than or Equal
268                 if( *File->CurPos == '=' ) {
269                         File->CurPos ++;
270                         ret = TOK_GTE;
271                         break;
272                 }
273                 ret = TOK_GT;
274                 break;
275         
276         // Logical NOT
277         case '!':
278                 ret = TOK_LOGICNOT;
279                 break;
280         // Bitwise NOT
281         case '~':
282                 ret = TOK_BWNOT;
283                 break;
284         
285         // Variables
286         // \$[0-9]+ or \$[_a-zA-Z][_a-zA-Z0-9]*
287         case '$':
288                 // Numeric Variable
289                 if( isdigit( *File->CurPos ) ) {
290                         while( isdigit(*File->CurPos) )
291                                 File->CurPos ++;
292                 }
293                 // Ident Variable
294                 else {
295                         while( is_ident(*File->CurPos) || isdigit(*File->CurPos) )
296                                 File->CurPos ++;
297                 }
298                 ret = TOK_VARIABLE;
299                 break;
300         
301         // Default (Numbers and Identifiers)
302         default:
303                 File->CurPos --;
304                 
305                 // Numbers
306                 if( isdigit(*File->CurPos) )
307                 {
308                         ret = TOK_INTEGER;
309                         if( *File->CurPos == '0' && File->CurPos[1] == 'x' )
310                         {
311                                 File->CurPos += 2;
312                                 while(('0' <= *File->CurPos && *File->CurPos <= '9')
313                                    || ('A' <= *File->CurPos && *File->CurPos <= 'F')
314                                    || ('a' <= *File->CurPos && *File->CurPos <= 'f') )
315                                 {
316                                         File->CurPos ++;
317                                 }
318                         }
319                         else
320                         {
321                                 while( isdigit(*File->CurPos) )
322                                         File->CurPos ++;
323                                 
324 //                              printf("*File->CurPos = '%c'\n", *File->CurPos);
325                                 
326                                 // Decimal
327                                 if( *File->CurPos == '.' )
328                                 {
329                                         ret = TOK_REAL;
330                                         File->CurPos ++;
331                                         while( isdigit(*File->CurPos) )
332                                                 File->CurPos ++;
333                                 }
334                                 // Exponent
335                                 if( *File->CurPos == 'e' || *File->CurPos == 'E' )
336                                 {
337                                         ret = TOK_REAL;
338                                         File->CurPos ++;
339                                         if(*File->CurPos == '-' || *File->CurPos == '+')
340                                                 File->CurPos ++;
341                                         while( isdigit(*File->CurPos) )
342                                                 File->CurPos ++;
343                                 }
344                                 
345 //                              printf(" ret = %i\n", ret);
346                         }
347                         break;
348                 }
349         
350                 // Identifier
351                 if( is_ident(*File->CurPos) )
352                 {
353                         ret = TOK_IDENT;
354                         
355                         // Identifier
356                         while( is_ident(*File->CurPos) || isdigit(*File->CurPos) )
357                                 File->CurPos ++;
358                         
359                         // This is set later too, but we use it below
360                         File->TokenLen = File->CurPos - File->TokenStr;
361                         
362                         // Check if it's a reserved word
363                         {
364                                 char    buf[File->TokenLen + 1];
365                                  int    i;
366                                 memcpy(buf, File->TokenStr, File->TokenLen);
367                                 buf[File->TokenLen] = 0;
368                                 for( i = 0; i < ARRAY_SIZE(csaReservedWords); i ++ )
369                                 {
370                                         if(strcmp(csaReservedWords[i].Name, buf) == 0) {
371                                                 ret = csaReservedWords[i].Value;
372                                                 break ;
373                                         }
374                                 }
375                         }
376                         // If there's no match, just keep ret as TOK_IDENT
377                         
378                         break;
379                 }
380                 // Syntax Error
381                 ret = TOK_INVAL;
382                 
383                 fprintf(stderr, "Syntax Error: Unknown symbol '%c'\n", *File->CurPos);
384                 longjmp(File->JmpTarget, 1);
385                 
386                 break;
387         }
388         // Return
389         File->Token = ret;
390         File->TokenLen = File->CurPos - File->TokenStr;
391         
392         #if DEBUG
393         {
394                 char    buf[ File->TokenLen + 1];
395                 memcpy(buf, File->TokenStr, File->TokenLen);
396                 buf[File->TokenLen] = 0;
397                 //printf("  GetToken: File->CurPos = %p\n", File->CurPos);
398                 printf(" GetToken: Return %i (%i long) (%s)\n", ret, File->TokenLen, buf);
399         }
400         #endif
401         return ret;
402 }
403
404 void PutBack(tParser *File)
405 {
406         if( File->LastToken == -1 ) {
407                 // ERROR:
408                 fprintf(stderr, "INTERNAL ERROR: Putback when LastToken==-1\n");
409                 longjmp( File->JmpTarget, -1 );
410                 return ;
411         }
412         #if DEBUG
413         printf(" PutBack: Was on %i\n", File->Token);
414         #endif
415         // Save
416         File->NextLine = File->CurLine;
417         File->NextToken = File->Token;
418         File->NextTokenStr = File->TokenStr;
419         File->NextTokenLen = File->TokenLen;
420         // Restore
421         File->CurLine = File->LastLine;
422         File->Token = File->LastToken;
423         File->TokenStr = File->LastTokenStr;
424         File->TokenLen = File->LastTokenLen;
425         File->CurPos = File->NextTokenStr;
426         // Invalidate
427         File->LastToken = -1;
428 }
429
430 int LookAhead(tParser *File)
431 {
432         // TODO: Should I save the entire state here?
433          int    ret = GetToken(File);
434         PutBack(File);
435         return ret;
436 }
437
438 // --- Helpers ---
/**
 * \brief Check for ident characters
 * \param ch	Character to test
 * \return 1 if \a ch may appear in an identifier, 0 otherwise
 * \note Matches [a-zA-Z_], any byte with the top bit set, and (unless
 *       USE_SCOPE_CHAR is non-zero) the scope character '.'
 * \note Digits are not matched here; callers test isdigit() separately
 */
int is_ident(char ch)
{
	// Work on the unsigned value so the high-bit test below does not depend
	// on whether plain 'char' is signed on the target platform
	const unsigned char	uch = (unsigned char)ch;
	
	if('a' <= uch && uch <= 'z')	return 1;
	if('A' <= uch && uch <= 'Z')	return 1;
	if(uch == '_')	return 1;
	#if !USE_SCOPE_CHAR
	if(uch == '.')	return 1;
	#endif
	// Non-ASCII bytes are accepted (presumably so multi-byte encoded
	// characters can be used in identifiers)
	if(uch >= 0x80)	return 1;
	return 0;
}
454
/**
 * \brief Check for an ASCII decimal digit
 * \param ch	Character to test
 * \return 1 if \a ch is in the range '0' to '9', 0 otherwise
 */
int isdigit(int ch)
{
	return (ch >= '0' && ch <= '9') ? 1 : 0;
}
460
/**
 * \brief Check for a character the lexer treats as whitespace
 * \param ch	Character to test
 * \return 1 for space, tab, backspace, newline or carriage return;
 *         0 for anything else
 */
int isspace(int ch)
{
	switch( ch )
	{
	case ' ':
	case '\t':
	case '\b':
	case '\n':
	case '\r':
		return 1;
	default:
		return 0;
	}
}

UCC git Repository :: git.ucc.asn.au