lexer/lexical_analyzer.c

   1 /* Lexical analyzer */
   2
   3 #include <stdlib.h>
   4 #include <stdbool.h>
   5
   6 #include "global_vars.h"
   7 #include "print_helper.h"
   8 #include "lexical_analyzer.h"
   9
  /* Lookahead character: scanner() refills it from `source` with fgetwc() and
   * the classification helpers below inspect it. */
  wint_t c;

  /*
   * Tells whether the current character `c` may be part of a word token.
   *
   * Despite the name, the accepted set is wider than letters and digits: it is
   * the ASCII letters, the decimal digits, the punctuation marks . ? ! , ; : - ' #
   * and a fixed list of French accented letters in both cases.
   *
   * Returns true when `c` is in that set, false otherwise.
   */
  static bool isAlphaNum() {
      /* The set is spelled out character by character, rather than expressed as
       * ranges, so it can be read and adjusted directly while debugging. */
      static const wchar_t wordChars[] =
          L"abcdefghijklmnopqrstuvwxyz"
          L"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
          L".?!,;:-'#"
          L"0123456789"
          /* FIXME: accented (multibyte) character support is still buggy */
          L"àâçèéîôùû"
          L"ÀÂÇÈÉÎÔÙÛ";

      /* Stop at the terminator so that a null `c` is never reported as a match. */
      for (const wchar_t *candidate = wordChars; *candidate != L'\0'; candidate++) {
          if (c == *candidate) {
              return true;
          }
      }
      return false;
  }
  35
  36 static bool isSeparator() {
  37     if (c == L'\t' || c == L' ' || c == L'\n') {
  38         return true;
  39     }
  40     return false;
  41 }
  42
  43 static bool isEOF() {
  44     if (c == WEOF) {
  45         return true;
  46     }
  47     return false;
  48 }
  49
  50 int scanner(void) {
  51     tokenValue[0] = 0;
  52     unsigned int i = 0;
  53     wchar_t m[6];
  54
  55 init:
  56     if (c == L' ' || c == L'\t') {
  57         c = fgetwc(source);
  58         goto init;
  59     }
  60     if (c == L'\n') {
  61         c = fgetwc(source);
  62         goto initLV1;
  63     }
  64     if (c == L'>') {
  65         c = fgetwc(source);
  66         goto MC1;
  67     }
  68     if (c == L'=') {
  69         c = fgetwc(source);
  70         goto S1SS1;
  71     }
  72     if (isAlphaNum()) {
  73         tokenValue[i] = c;
  74         i++;
  75         c = fgetwc(source);
  76         goto M1;
  77     }
  78     if (isEOF()) {
  79         goto FIN;
  80     }
  81     goto error;
  82
  83 MC1:
  84     if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
  85         wcscpy((wchar_t*)tokenValue, L">Auteur");
  86         c = fgetwc(source);
  87         goto MC2;
  88     }
  89     if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
  90         wcscpy((wchar_t*)tokenValue, L">Titre");
  91         c = fgetwc(source);
  92         goto MC2;
  93     }
  94     goto error;
  95
  96 S1SS1:
  97     if (c == L'=') {
  98         c = fgetwc(source);
  99         goto SS2;
 100     }
 101     if (isSeparator() || isEOF()) {
 102         goto SECTION;
 103     }
 104     goto error;
 105
 106 SS2:
 107     if (isSeparator() || isEOF()) {
 108         goto SSECTION;
 109     }
 110     goto error;
 111
 112 SECTION:
 113     tokenType = SECTION;
 114     return EXIT_SUCCESS;
 115
 116 SSECTION:
 117     tokenType = SSECTION;
 118     return EXIT_SUCCESS;
 119
 120 M1:
 121     if (isAlphaNum()) {
 122         tokenValue[i] = c;
 123         i++;
 124         c = fgetwc(source);
 125         goto M1;
 126     }
 127     if (isSeparator() || isEOF()) {
 128         goto MOT;
 129     }
 130     goto error;
 131
 132 initLV1:
 133     if (c == L' ' || c == L'\t') {
 134         c = fgetwc(source);
 135         goto initLV1;
 136     }
 137     if (c == L'\n') {
 138         c = fgetwc(source);
 139         goto initLV1LV2;
 140     }
 141     if (isAlphaNum()) {
 142         tokenValue[i] = c;
 143         i++;
 144         c = fgetwc(source);
 145         goto M1;
 146     }
 147     if (c == L'=') {
 148         c = fgetwc(source);
 149         goto S1SS1;
 150     }
 151     if (c == L'>') {
 152         c = fgetwc(source);
 153         goto MC1;
 154     }
 155     if (isEOF()) {
 156         goto FIN;
 157     }
 158     goto error;
 159
 160 initLV1LV2:
 161     if (isSeparator()) {
 162         c = fgetwc(source);
 163         goto initLV1LV2;
 164     }
 165     if (isAlphaNum()) {
 166         goto NPARA;
 167     }
 168     if (c == L'>') {
 169         c = fgetwc(source);
 170         goto MC1;
 171     }
 172     if (c == L'=') {
 173         c = fgetwc(source);
 174         goto S1SS1;
 175     }
 176     if (isEOF()) {
 177         goto FIN;
 178     }
 179     goto error;
 180
 181 NPARA:
 182     tokenType = NPARA;
 183     return EXIT_SUCCESS;
 184
 185 MOT:
 186     tokenType = MOT;
 187     tokenValue[i] = 0;
 188     wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
 189     return EXIT_SUCCESS;
 190
 191 MC2:
 192     if (isSeparator() || isEOF()) {
 193         goto MOTCLE;
 194     }
 195     goto error;
 196
 197 MOTCLE:
 198     tokenType = MOTCLE;
 199     wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
 200     return EXIT_SUCCESS;
 201
 202 FIN:
 203     tokenType = FIN;
 204     return EXIT_SUCCESS;
 205
 206 error:
 207     if (tokenType == MOT || tokenType == MOTCLE) {
 208         fwprintf(stderr, L"%s error with token type: %s and value: %ls\n",
 209                  __func__,
 210                  tokenTypestr[tokenType],
 211                  tokenValue);
 212     } else {
 213         fwprintf(stderr, L"%s error with token type: %s\n",
 214                  __func__,
 215                  tokenTypestr[tokenType]);
 216     }
 217     fflush(stderr);
 218     tokenType = FIN;
 219     exit(EXIT_FAILURE);
 220 }