"Fossies" - the Fresh Open Source Software Archive

Member "fityk-1.3.1/fityk/lexer.cpp" (18 Dec 2016, 13190 Bytes) of package /linux/misc/fityk-1.3.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "lexer.cpp" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.3.0_vs_1.3.1.

// This file is part of fityk program. Copyright 2001-2013 Marcin Wojdyr
// Licence: GNU General Public License ver. 2+

/// Lexical analyser. Takes C string and yields tokens.

#define BUILDING_LIBFITYK
#include "lexer.h"

#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <assert.h>

#include "fityk.h" // SyntaxError
#include "common.h" // S()

using namespace std;

namespace fityk {

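// Returns the token's text with its delimiters stripped: the surrounding
// quotes of a 'string', the leading '$' of a variable name, the leading
// '%' of a function name. Other tokens are returned as-is.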
string Lexer::get_string(const Token& token)
{
    switch (token.type) {
        case kTokenString:
            return string(token.str+1, token.length - 2);
        case kTokenVarname:
            return string(token.str+1, token.length - 1);
        case kTokenFuncname:
            return string(token.str+1, token.length - 1);
        default:
            //assert(!"Unexpected token in get_string()");
            return token.as_string();
    }
}

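// Maps a TokenType to a short human-readable description, used in
// error messages and debug output.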
const char* tokentype2str(TokenType tt)
{
    switch (tt) {
        case kTokenLname: return "lower_case_name";
        case kTokenCname: return "CamelCaseName";
        case kTokenUletter: return "Upper-case-letter";
        case kTokenString: return "'quoted-string'";
        case kTokenVarname: return "$variable_name";
        case kTokenFuncname: return "%func_name";
        case kTokenNumber: return "number";
        case kTokenDataset: return "@dataset";
        case kTokenWord: return "word";
        case kTokenExpr: return "expr";
        case kTokenEVar: return "var-expr";
        case kTokenRest: return "rest-of-line";

        case kTokenLE: return "<=";
        case kTokenGE: return ">=";
        case kTokenNE: return "!=";
        case kTokenEQ: return "==";
        case kTokenAppend: return ">>";
        case kTokenDots: return "..";
        case kTokenPlusMinus: return "+-";
        case kTokenAddAssign: return "+=";
        case kTokenSubAssign: return "-=";

        case kTokenOpen: return "(";
        case kTokenClose: return ")";
        case kTokenLSquare: return "[";
        case kTokenRSquare: return "]";
        case kTokenLCurly: return "{";
        case kTokenRCurly: return "}";
        case kTokenPlus: return "+";
        case kTokenMinus: return "-";
        case kTokenMult: return "*";
        case kTokenDiv: return "/";
        case kTokenPower: return "^";
        case kTokenLT: return "<";
        case kTokenGT: return ">";
        case kTokenAssign: return "=";
        case kTokenComma: return ",";
        case kTokenSemicolon: return ";";
        case kTokenDot: return ".";
        case kTokenColon: return ":";
        case kTokenTilde: return "~";
        case kTokenQMark: return "?";
        case kTokenBang: return "!";

        case kTokenNop: return "Nop";
    }
    return NULL; // avoid compiler warning
}

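// Formats a token for diagnostics: the type description plus, where the
// token carries a payload, its text or numeric value.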
string token2str(const Token& token)
{
    string s = tokentype2str(token.type);
    switch (token.type) {
        case kTokenString:
        case kTokenVarname:
        case kTokenFuncname:
        case kTokenLname:
        case kTokenCname:
        case kTokenUletter:
        case kTokenWord:
        case kTokenRest:
            return s + " \"" + token.as_string() + "\"";
        case kTokenExpr:
            return s + " \"" + token.as_string() + "\" ("+S(token.value.d)+")";
        case kTokenEVar:
            return s + " \"" + token.as_string() + "\"";
        case kTokenNumber:
            return s + " " + S(token.value.d);
        case kTokenDataset:
            if (token.value.i == Lexer::kAll)
                return s + " '*'";
            else if (token.value.i == Lexer::kNew)
                return s + " '+'";
            else
                return s + " " + S(token.value.i);
        default:
            return s;
    }
}

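// The core scanner: skips leading whitespace, dispatches on the first
// character to recognize one token, stores it in tok_ and advances cur_
// past it. '\0' and '#' (comment) both yield kTokenNop. When allow_glob
// is true, '*' is accepted inside $/% names (wildcards like "delete $p*").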
void Lexer::read_token(bool allow_glob)
{
    tok_.str = cur_;
    while (isspace(*tok_.str))
        ++tok_.str;
    const char* ptr = tok_.str;

    switch (*ptr) {
        case '\0':
        case '#':
            tok_.type = kTokenNop;
            break;
        case '\'': {
            tok_.type = kTokenString;
            const char* end = strchr(ptr + 1, '\'');
            if (end == NULL)
                throw SyntaxError("unfinished string");
            ptr = end + 1;
            break;
        }
        case '>':
            ++ptr;
            if (*ptr == '=') {
                tok_.type = kTokenGE;
                ++ptr;
            } else if (*ptr == '>') {
                tok_.type = kTokenAppend;
                ++ptr;
            } else
                tok_.type = kTokenGT;
            break;
        case '<':
            ++ptr;
            if (*ptr == '=') {
                tok_.type = kTokenLE;
                ++ptr;
            } else if (*ptr == '>') {
                tok_.type = kTokenNE;
                ++ptr;
            } else
                tok_.type = kTokenLT;
            break;
        case '=':
            ++ptr;
            if (*ptr == '=') {
                tok_.type = kTokenEQ;
                ++ptr;
            } else
                tok_.type = kTokenAssign;
            break;
        case '+':
            ++ptr;
            if (*ptr == '-') {
                tok_.type = kTokenPlusMinus;
                ++ptr;
            } else if (*ptr == '=') {
                tok_.type = kTokenAddAssign;
                ++ptr;
            } else
                tok_.type = kTokenPlus;
            break;
        case '-':
            ++ptr;
            if (*ptr == '=') {
                tok_.type = kTokenSubAssign;
                ++ptr;
            } else
                tok_.type = kTokenMinus;
            break;

        case '!':
            ++ptr;
            if (*ptr == '=') {
                tok_.type = kTokenNE;
                ++ptr;
            } else
                tok_.type = kTokenBang;
            break;

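        // '.' may start a number (".5"), a range operator (".." or "..."),
        // or stand alone as a dot token.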
        case '.':
            ++ptr;
            if (isdigit(*ptr)) {
                char* endptr;
                tok_.value.d = strtod(ptr-1, &endptr);
                ptr = endptr;
                tok_.type = kTokenNumber;
            } else if (*ptr == '.') {
                ++ptr;
                if (*ptr == '.') // 3rd dot
                    ++ptr;
                tok_.type = kTokenDots;
            } else
                tok_.type = kTokenDot;
            break;
        case '@':
            ++ptr;
            tok_.type = kTokenDataset;
            if (*ptr == '*') {
                tok_.value.i = kAll;
                ++ptr;
            } else if (*ptr == '+') {
                tok_.value.i = kNew;
                ++ptr;
            } else if (isdigit(*ptr)) {
                char *endptr;
                tok_.value.i = strtol(ptr, &endptr, 10);
                ptr = endptr;
            } else
                throw SyntaxError("unexpected character after '@'");
            break;
        case '$':
            ++ptr;
            // allow_glob decides whether the '*' is read ("delete $p*")
            // or not ("$c=$a*$b"). "$*" is always read (it's not ambiguous
            // and we don't want an error when peeking).
            if (! (isalpha(*ptr) || *ptr == '_' || *ptr == '*'))
                throw SyntaxError("unexpected character after '$'");
            ++ptr;
            tok_.type = kTokenVarname;
            while (isalnum(*ptr) || *ptr == '_' || (allow_glob && *ptr == '*'))
                ++ptr;
            break;
        case '%':
            ++ptr;
            // the same rules as in the case of '$'
            if (! (isalpha(*ptr) || *ptr == '_' || *ptr == '*'))
                throw SyntaxError("unexpected character after '%'");
            ++ptr;
            tok_.type = kTokenFuncname;
            while (isalnum(*ptr) || *ptr == '_' || (allow_glob && *ptr == '*'))
                ++ptr;
            break;

        case '(': tok_.type = kTokenOpen;      ++ptr; break;
        case ')': tok_.type = kTokenClose;     ++ptr; break;
        case '[': tok_.type = kTokenLSquare;   ++ptr; break;
        case ']': tok_.type = kTokenRSquare;   ++ptr; break;
        case '{': tok_.type = kTokenLCurly;    ++ptr; break;
        case '}': tok_.type = kTokenRCurly;    ++ptr; break;
        case '*': tok_.type = kTokenMult;      ++ptr; break;
        case '/': tok_.type = kTokenDiv;       ++ptr; break;
        case '^': tok_.type = kTokenPower;     ++ptr; break;
        case ',': tok_.type = kTokenComma;     ++ptr; break;
        case ';': tok_.type = kTokenSemicolon; ++ptr; break;
        case ':': tok_.type = kTokenColon;     ++ptr; break;
        case '~': tok_.type = kTokenTilde;     ++ptr; break;
        case '?': tok_.type = kTokenQMark;     ++ptr; break;

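        // anything else: a number, a CamelCase or single-upper-letter name,
        // a lower_case name, or an unexpected character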
        default:
            if (isdigit(*ptr)) {
                char* endptr;
                tok_.value.d = strtod(ptr, &endptr);
                ptr = endptr;
                tok_.type = kTokenNumber;
            } else if (isupper(*ptr)) {
                ++ptr;
                if (isalnum(*ptr)) {
                    while (isalnum(*ptr))
                        ++ptr;
                    tok_.type = kTokenCname;
                } else
                    tok_.type = kTokenUletter;
            } else if (isalpha(*ptr) || *ptr == '_') {
                while (isalnum(*ptr) || *ptr == '_')
                    ++ptr;
                tok_.type = kTokenLname;
            } else
                throw SyntaxError("unexpected character: " + string(ptr, 1));
    }
    tok_.length = ptr - tok_.str;
    cur_ = ptr;
}

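// get_token() consumes and returns the next token; peek_token() scans it
// but leaves it buffered, so the following get_token() returns the same
// token without re-scanning the input.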
Token Lexer::get_token()
{
    if (!peeked_)
        read_token();
    peeked_ = false;
    return tok_;
}

const Token& Lexer::peek_token()
{
    if (!peeked_)
        read_token();
    peeked_ = true;
    return tok_;
}

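// Rewinds the lexer so that `token` (and everything after it) will be
// read again.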
void Lexer::go_back(const Token& token)
{
    cur_ = token.str;
    peeked_ = false;
}

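// Like get_token(), but reads with allow_glob=true so '*' can appear
// inside $/% names; a previously peeked token is re-read in glob mode.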
Token Lexer::get_glob_token()
{
    if (peeked_) {
        // un-peek
        cur_ = tok_.str;
        peeked_ = false;
    }
    read_token(true);
    return tok_;
}

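// Reads one token and extends it to the next whitespace, ';' or '#',
// returning the whole span as kTokenWord (useful for unquoted arguments
// such as filenames). Quoted strings and end-of-line are returned as-is.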
Token Lexer::get_word_token()
{
    Token t = get_token();
    if (t.type == kTokenString || t.type == kTokenNop)
        return t;
    while (*cur_ != '\0' && !isspace(*cur_) && *cur_ != ';' && *cur_ != '#')
        ++cur_;
    t.type = kTokenWord;
    t.length = cur_ - t.str;
    return t;
}

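// Reads everything up to the end of the command: the next ';' (command
// separator), '#' (comment) or end of line.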
Token Lexer::get_rest_of_cmd()
{
    Token t = get_token();
    if (t.type == kTokenString || t.type == kTokenNop)
        return t;
    while (*cur_ != '\0' && *cur_ != ';' && *cur_ != '#')
        ++cur_;
    t.type = kTokenRest;
    t.length = cur_ - t.str;
    return t;
}

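// Reads everything up to the end of the line, including any ';' and '#'.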
Token Lexer::get_rest_of_line()
{
    // avoid calling read_token() here because it may throw an exception
    Token t;
    t.type = kTokenRest;
    while (isspace(*cur_))
        ++cur_;
    t.str = peeked_ ? tok_.str : cur_;
    peeked_ = false;
    while (*cur_ != '\0')
        ++cur_;
    t.length = cur_ - t.str;
    return t;
}

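// The get_expected_token() overloads below peek at the next token and
// throw a SyntaxError if it does not match the expected type(s) or raw
// text; otherwise they consume and return it.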
Token Lexer::get_expected_token(const string& raw)
{
    TokenType p = peek_token().type;
    string s = peek_token().as_string();
    if (s != raw) {
        string msg = "expected `" + raw + "'";
        throw_syntax_error(p == kTokenNop ? msg
                                          : msg + " instead of `" + s + "'");
    }
    return get_token();
}

Token Lexer::get_expected_token(TokenType tt)
{
    TokenType p = peek_token().type;
    if (p != tt) {
        string msg = S("expected ") + tokentype2str(tt);
        throw_syntax_error(p == kTokenNop ? msg
                                    : msg + " instead of " + tokentype2str(p));
    }
    return get_token();
}

Token Lexer::get_expected_token(TokenType tt1, TokenType tt2)
{
    TokenType p = peek_token().type;
    if (p != tt1 && p != tt2) {
        string msg = S("expected ") + tokentype2str(tt1)
                     + " or " + tokentype2str(tt2);
        throw_syntax_error(p == kTokenNop ? msg
                                    : msg + " instead of " + tokentype2str(p));
    }
    return get_token();
}

Token Lexer::get_expected_token(TokenType tt, const string& raw)
{
    TokenType p = peek_token().type;
    string s = peek_token().as_string();
    if (p != tt && s != raw) {
        string msg = S("expected ") + tokentype2str(tt) + " or `" + raw + "'";
        throw_syntax_error(p == kTokenNop ? msg
                                          : msg + " instead of `" + s + "'");
    }
    return get_token();
}

Token Lexer::get_expected_token(const string& raw1, const string& raw2)
{
    TokenType p = peek_token().type;
    string s = peek_token().as_string();
    if (s != raw1 && s != raw2) {
        string msg = "expected `" + raw1 + "' or `" + raw2 + "'";
        throw_syntax_error(p == kTokenNop ? msg
                                          : msg + " instead of `" + s + "'");
    }
    return get_token();
}

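// Consumes and returns the next token only if it has type tt; otherwise
// leaves the input untouched and returns a zero-length kTokenNop token.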
Token Lexer::get_token_if(TokenType tt)
{
    if (peek_token().type == tt)
        return get_token();
    else {
        Token token;
        token.type = kTokenNop;
        token.str = cur_;
        token.length = 0;
        return token;
    }
}

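// Throws SyntaxError reporting the offset from the start of the input and,
// when at least 10 characters in, the 10 characters preceding the current
// position for context.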
void Lexer::throw_syntax_error(const string& msg)
{
    int pos = cur_ - input_;
    string s = S(pos);
    if (pos >= 10)
        s += ", near `" + string(cur_ - 10, cur_) + "'";
    throw SyntaxError("at " + s + ": " + msg);
}

} // namespace fityk
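
// Usage sketch (illustrative, not from the original sources): assuming that
// Lexer is constructed from a NUL-terminated C string, as the file comment
// above suggests, a caller tokenizes a command line roughly like this:
//
//     fityk::Lexer lex("%gauss = Gaussian(height=~1.2, center=~3)");
//     for (fityk::Token t = lex.get_token(); t.type != fityk::kTokenNop;
//             t = lex.get_token())
//         printf("%s\n", fityk::token2str(t).c_str());
//
// Each iteration prints the token's type and text; the loop stops at the
// kTokenNop produced for the end of the input.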