lexer/lexical_analyzer.c

   1 /* Lexical analyzer */
   2
   3 #include <stdlib.h>
   4 #include <stdbool.h>
   5
   6 #include "global_vars.h"
   7 #include "print_helper.h"
   8
   9 wint_t c; /* Lookahead character: assigned from fgetwc(source) by scanner() and tested by the is*() helpers below. */
  10
  11 /* It looks silly to check for each characters but for debugging, it's just the way to go */
  12 static bool isAlphaNum() {
  13     if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
  14         c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
  15         c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
  16         c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
  17         c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
  18         c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
  19         c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
  20         c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
  21         c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
  22         c == L'\''|| c == L'#' || \
  23         c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
  24         c == L'7' || c == L'8' || c == L'9' || \
  25         //FIXME: Accentued characters (aka multibytes characters) support is still buggy
  26         c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
  27         c == L'ù' || c == L'û' || \
  28         c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
  29         c == L'Ù' || c == L'Û') {
  30             return true;
  31         }
  32         return false;
  33 }
  34
  35 static bool isSeparator() {
  36     if (c == L'\t' || c == L' ' || c == L'\n') {
  37         return true;
  38     }
  39     return false;
  40 }
  41
  42 static bool isEOF() {
  43     if (c == WEOF) {
  44         return true;
  45     }
  46     return false;
  47 }
  48
  49 int scanner() {
  50     tokenValue[0] = 0;
  51     unsigned int i = 0;
  52     wchar_t m[6];
  53
  54 init:
  55     if (c == L' ' || c == L'\t') {
  56         c = fgetwc(source);
  57         goto init;
  58     }
  59     if (c == L'\n') {
  60         c = fgetwc(source);
  61         goto initLV1;
  62     }
  63     if (c == L'>') {
  64         c = fgetwc(source);
  65         goto MC1;
  66     }
  67     if (c == L'=') {
  68         c = fgetwc(source);
  69         goto S1SS1;
  70     }
  71     if (isAlphaNum()) {
  72         tokenValue[i] = c;
  73         i++;
  74         c = fgetwc(source);
  75         goto M1;
  76     }
  77     if (isEOF()) {
  78         goto FIN;
  79     }
  80     goto error;
  81
  82 MC1:
  83     if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
  84         wcscpy((wchar_t*)tokenValue, L">Auteur");
  85         c = fgetwc(source);
  86         goto MC2;
  87     }
  88     if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
  89         wcscpy((wchar_t*)tokenValue, L">Titre");
  90         c = fgetwc(source);
  91         goto MC2;
  92     }
  93     goto error;
  94
  95 S1SS1:
  96     if (c == L'=') {
  97         c = fgetwc(source);
  98         goto SS2;
  99     }
 100     if (isSeparator() || isEOF()) {
 101         goto SECTION;
 102     }
 103     goto error;
 104
 105 SS2:
 106     if (isSeparator() || isEOF()) {
 107         goto SSECTION;
 108     }
 109     goto error;
 110
 111 SECTION:
 112     tokenType = SECTION;
 113     return EXIT_SUCCESS;
 114
 115 SSECTION:
 116     tokenType = SSECTION;
 117     return EXIT_SUCCESS;
 118
 119 M1:
 120     if (isAlphaNum()) {
 121         tokenValue[i] = c;
 122         i++;
 123         c = fgetwc(source);
 124         goto M1;
 125     }
 126     if (isSeparator() || isEOF()) {
 127         goto MOT;
 128     }
 129     goto error;
 130
 131 initLV1:
 132     if (c == L' ' || c == L'\t') {
 133         c = fgetwc(source);
 134         goto initLV1;
 135     }
 136     if (c == L'\n') {
 137         c = fgetwc(source);
 138         goto initLV1LV2;
 139     }
 140     if (isAlphaNum()) {
 141         tokenValue[i] = c;
 142         i++;
 143         c = fgetwc(source);
 144         goto M1;
 145     }
 146     if (c == L'=') {
 147         c = fgetwc(source);
 148         goto S1SS1;
 149     }
 150     if (c == L'>') {
 151         c = fgetwc(source);
 152         goto MC1;
 153     }
 154     if (isEOF()) {
 155         goto FIN;
 156     }
 157     goto error;
 158
 159 initLV1LV2:
 160     if (isSeparator()) {
 161         c = fgetwc(source);
 162         goto initLV1LV2;
 163     }
 164     if (isAlphaNum()) {
 165         goto NPARA;
 166     }
 167     if (c == L'>') {
 168         c = fgetwc(source);
 169         goto MC1;
 170     }
 171     if (c == L'=') {
 172         c = fgetwc(source);
 173         goto S1SS1;
 174     }
 175     if (isEOF()) {
 176         goto FIN;
 177     }
 178     goto error;
 179
 180 NPARA:
 181     tokenType = NPARA;
 182     return EXIT_SUCCESS;
 183
 184 MOT:
 185     tokenType = MOT;
 186     tokenValue[i] = 0;
 187     wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
 188     return EXIT_SUCCESS;
 189
 190 MC2:
 191     if (isSeparator() || isEOF()) {
 192         goto MOTCLE;
 193     }
 194     goto error;
 195
 196 MOTCLE:
 197     tokenType = MOTCLE;
 198     wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
 199     return EXIT_SUCCESS;
 200
 201 FIN:
 202     tokenType = FIN;
 203     return EXIT_SUCCESS;
 204
 205 error:
 206     if (tokenType == MOT || tokenType == MOTCLE) {
 207         fwprintf(stderr, L"%s error with token type: %s and value: %ls\n",
 208                  __func__,
 209                  tokenTypestr[tokenType],
 210                  tokenValue);
 211     } else {
 212         fwprintf(stderr, L"%s error with token type: %s\n",
 213                  __func__,
 214                  tokenTypestr[tokenType]);
 215     }
 216     fflush(stderr);
 217     tokenType = FIN;
 218     exit(EXIT_FAILURE);
 219 }