lexer/main.c

   1 #include <stdlib.h>
   2 #include <stdio.h>
   3 #include <string.h>
   4 #include <stdbool.h>
   5 #include <wchar.h>
   6
   7 #define TOKEN_MAX 500
   8
   9 struct token_s {
  10     const char* type;
  11     wint_t value[50];
  12 };
  13
  14 struct token_s token[TOKEN_MAX] = {NULL, 0};
  15
  16 FILE *source = NULL, *target = NULL;
  17 wint_t c;
  18 unsigned int tokenFound = 0;
  19 enum TokenType {
  20     MOTCLE,
  21     SECTION,
  22     SSECTION,
  23     NPARA,
  24     MOT,
  25     FIN
  26 } tokenType;
  27 const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" };
  28 unsigned int i = 0;
  29
  30 /* It looks silly to check for each characters but for debugging, it's just the way to go */
  31 bool istAlpha() {
  32     if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
  33         c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
  34         c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
  35         c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
  36         c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
  37         c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
  38         c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
  39         c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
  40         c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
  41         c == L'\''|| c == L'#' || \
  42         c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
  43         c == L'7' || c == L'8' || c == L'9' || \
  44         c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
  45         c == L'ù' || c == L'û' || \
  46         c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
  47         c == L'Ù' || c == L'Û') {
  48             return true;
  49         }
  50         return false;
  51 }
  52
  53 bool isSeparator() {
  54     if (c == L'\t' || c == L' ' || c == L'\n') {
  55         return true;
  56     }
  57     return false;
  58 }
  59
  60 int scanner() {
  61     const wchar_t* Titre = L"Titre";
  62     const wchar_t* Auteur = L"Auteur";
  63     unsigned int j = 0;
  64
  65 // The main loop get the next character
  66 init:
  67     if (c == L' ' || c == L'\t') {
  68         c = fgetwc(source);
  69         token[tokenFound].value[i] = c;
  70         i++;
  71         goto init;
  72     }
  73     if (c == L'\n') {
  74         c = fgetwc(source);
  75         token[tokenFound].value[i] = c;
  76         i++;
  77         goto initLV1;
  78     }
  79     if (c == L'>') {
  80         c = fgetwc(source);
  81         token[tokenFound].value[i] = c;
  82         i++;
  83         goto MC1;
  84     }
  85     if (c == L'=') {
  86         c = fgetwc(source);
  87         token[tokenFound].value[i] = c;
  88         i++;
  89         goto S1SS1;
  90     }
  91     if (istAlpha()) {
  92         c = fgetwc(source);
  93         token[tokenFound].value[i] = c;
  94         i++;
  95         goto M1;
  96     }
  97     if (c == WEOF) {
  98         goto FIN;
  99     } else {
 100         goto error;
 101     }
 102
 103 MC1:
 104     // FIXME: Partial match need a rewind in the characters extraction from the file
 105     if (c == (wint_t)Titre[j] && j < wcslen(Titre) - 1) {
 106         c = fgetwc(source);
 107         token[tokenFound].value[i] = c;
 108         i++;
 109         j++;
 110         goto MC1;
 111     }
 112     if (c == (wint_t)Auteur[j] && j < wcslen(Auteur) - 1) {
 113         c = fgetwc(source);
 114         token[tokenFound].value[i] = c;
 115         i++;
 116         j++;
 117         goto MC1;
 118     } else {
 119         c = fgetwc(source);
 120         token[tokenFound].value[i] = c;
 121         i++;
 122         goto MC2;
 123     }
 124
 125 S1SS1:
 126     if (c == L'=') {
 127         c = fgetwc(source);
 128         token[tokenFound].value[i] = c;
 129         i++;
 130         goto SS2;
 131     }
 132     if (isSeparator() || c == WEOF) {
 133         goto SECTION;
 134     }
 135
 136 SS2:
 137     if (isSeparator() || c == WEOF) {
 138         goto SSECTION;
 139     }
 140
 141 SECTION:
 142     tokenType = SECTION;
 143     return 1;
 144
 145 SSECTION:
 146     tokenType = SSECTION;
 147     return 1;
 148
 149 M1:
 150     if (istAlpha()) {
 151         c = fgetwc(source);
 152         token[tokenFound].value[i] = c;
 153         i++;
 154         goto M1;
 155     }
 156     if (isSeparator() || c == WEOF) {
 157         goto MOT;
 158     }
 159
 160 initLV1:
 161     if (c == L' ' || c == L'\t') {
 162         c = fgetwc(source);
 163         token[tokenFound].value[i] = c;
 164         i++;
 165         goto initLV1;
 166     }
 167     if (c == L'\n') {
 168         c = fgetwc(source);
 169         token[tokenFound].value[i] = c;
 170         i++;
 171         goto initLV1LV2;
 172     }
 173     if (istAlpha()) {
 174         c = fgetwc(source);
 175         token[tokenFound].value[i] = c;
 176         i++;
 177         goto M1;
 178     }
 179     if (c == L'=') {
 180         c = fgetwc(source);
 181         token[tokenFound].value[i] = c;
 182         i++;
 183         goto S1SS1;
 184     }
 185     if (c == L'>') {
 186         c = fgetwc(source);
 187         token[tokenFound].value[i] = c;
 188         i++;
 189         goto MC1;
 190     }
 191     if (c == WEOF) {
 192         goto FIN;
 193     }
 194
 195 initLV1LV2:
 196     if (isSeparator()) {
 197         c = fgetwc(source);
 198         token[tokenFound].value[i] = c;
 199         i++;
 200         goto initLV1LV2;
 201     }
 202     if (istAlpha()) {
 203         goto NPARA;
 204     }
 205     if (c == L'>') {
 206         c = fgetwc(source);
 207         token[tokenFound].value[i] = c;
 208         i++;
 209         goto MC1;
 210     }
 211     if (c == L'=') {
 212         c = fgetwc(source);
 213         token[tokenFound].value[i] = c;
 214         i++;
 215         goto S1SS1;
 216     }
 217     if (c == WEOF) {
 218         goto FIN;
 219     }
 220
 221 NPARA:
 222     tokenType = NPARA;
 223     return 1;
 224
 225 MOT:
 226     tokenType = MOT;
 227     return 1;
 228
 229 MC2:
 230     if (isSeparator() || c == WEOF) {
 231         goto MOTCLE;
 232     }
 233
 234 MOTCLE:
 235     tokenType = MOTCLE;
 236     return 1;
 237
 238 FIN:
 239     tokenType = FIN;
 240     return 1;
 241
 242 error:
 243     tokenType = FIN;
 244     return -1;
 245 }
 246
 247 int main() {
 248
 249     // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) :
 250     source = fopen("test.txt", "r");
 251     // Cree et ouvre un fichier target.html en lecture/ecriture
 252     // avec suppression du contenu au prealable :
 253     target = fopen("target.html", "w+");
 254
 255     if (source == NULL) {
 256         printf("Impossible d'ouvrir le fichier source\n");
 257         return -1;
 258     }
 259
 260     if (target == NULL) {
 261         printf("Impossible d'ouvrir le fichier target\n");
 262         return -1;
 263     }
 264
 265     do {
 266         c = fgetwc(source); // lecture du caractere suivant du fichier source
 267         token[tokenFound].value[i] = c;
 268         i++;
 269         int scanrt = scanner();
 270         if (scanrt == -1) {
 271             wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value);
 272             exit(EXIT_FAILURE);
 273         }
 274         if (c != WEOF) {
 275             wprintf(L"Token type found: %s with value: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
 276         } else {
 277             wprintf(L"Token type found: %s\n", tokenTypestr[tokenType]);
 278         }
 279         token[tokenFound].type = tokenTypestr[tokenType];
 280         tokenFound++;
 281         // reinit token.value array counter
 282         i = 0;
 283     //} while (c != WEOF); // tant que la fin du fichier n'est pas atteinte
 284     } while (!feof(source)); // tant que la fin du fichier n'est pas atteinte
 285
 286     if (source != NULL) fclose(source); // fermeture du fichier source
 287     if (target != NULL) fclose(target); // fermeture du fichier target
 288
 289     return 0;
 290 }