lexer/main.c

   #include <locale.h>
   #include <stdbool.h>
   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>
   #include <wchar.h>
   6
   /* Maximum number of tokens that can be recorded for one input file. */
   #define TOKEN_MAX 500

   /*
    * One lexical token.
    *
    * type  - name of the token category; main() points it at an entry of
    *         tokenTypestr, so it is never owned or freed by the token.
    * value - NUL-terminated wide-character text of the token (at most 49
    *         characters plus the terminator). It is declared as wchar_t, not
    *         wint_t, because it is filled with wcscpy() and printed with "%ls",
    *         both of which require a wchar_t string.
    */
   struct token_s {
       const char *type;
       wchar_t value[50];
   };

   /* Token table; every slot starts out zeroed (NULL type, empty value). */
   struct token_s token[TOKEN_MAX] = { { .type = NULL, .value = { 0 } } };
  15
  /* Stream the lexer reads from, and the output file opened by main(). */
  FILE *source = NULL;
  FILE *target = NULL;

  /* Look-ahead character: main() primes it and scanner() advances it with fgetwc(). */
  wint_t c;

  /* Index of the slot of `token` that is currently being filled. */
  unsigned int tokenFound = 0;

  /*
   * Token categories; the numeric values are the indices into tokenTypestr.
   *   MOTCLE   - keyword introduced by '>' (">Auteur" or ">Titre")
   *   SECTION  - a single '='
   *   SSECTION - "=="
   *   NPARA    - a word character reached after at least two newlines
   *   MOT      - a word
   *   FIN      - end of input (also reported when the scanner fails)
   */
  enum TokenType {
      MOTCLE,
      SECTION,
      SSECTION,
      NPARA,
      MOT,
      FIN
  };

  /* Category of the token most recently produced by scanner(). */
  enum TokenType tokenType;

  /* Printable name of each category, indexed by enum TokenType. */
  const char* tokenTypestr[] = {
      [MOTCLE]   = "MOTCLE",
      [SECTION]  = "SECTION",
      [SSECTION] = "SSECTION",
      [NPARA]    = "NPARA",
      [MOT]      = "MOT",
      [FIN]      = "FIN"
  };
  28
  29 /* It looks silly to check for each characters but for debugging, it's just the way to go */
  30 bool istAlpha() {
  31     if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
  32         c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
  33         c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
  34         c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
  35         c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
  36         c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
  37         c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
  38         c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
  39         c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
  40         c == L'\''|| c == L'#' || \
  41         c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
  42         c == L'7' || c == L'8' || c == L'9' || \
  43         // FIXME: Accentued characters (aka multibytes characters) support is still buggy
  44         c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
  45         c == L'ù' || c == L'û' || \
  46         c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
  47         c == L'Ù' || c == L'Û') {
  48             return true;
  49         }
  50         return false;
  51 }
  52
  53 bool isSeparator() {
  54     if (c == L'\t' || c == L' ' || c == L'\n') {
  55         return true;
  56     }
  57     return false;
  58 }
  59
  60 int scanner() {
  61     unsigned int i = 0;
  62     wchar_t m[6];
  63
  64 init:
  65     if (c == L' ' || c == L'\t') {
  66         c = fgetwc(source);
  67         goto init;
  68     }
  69     if (c == L'\n') {
  70         c = fgetwc(source);
  71         goto initLV1;
  72     }
  73     if (c == L'>') {
  74         c = fgetwc(source);
  75         goto MC1;
  76     }
  77     if (c == L'=') {
  78         c = fgetwc(source);
  79         goto S1SS1;
  80     }
  81     if (istAlpha()) {
  82         token[tokenFound].value[i] = c;
  83         i++;
  84         c = fgetwc(source);
  85         goto M1;
  86     }
  87     if (c == WEOF) {
  88         goto FIN;
  89     }
  90     goto error;
  91
  92 MC1:
  93     if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
  94         wcscpy((wchar_t*)token[tokenFound].value, L">Auteur");
  95         c = fgetwc(source);
  96         goto MC2;
  97     }
  98     if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
  99         wcscpy((wchar_t*)token[tokenFound].value, L">Titre");
 100         c = fgetwc(source);
 101         goto MC2;
 102     }
 103     goto error;
 104
 105 S1SS1:
 106     if (c == L'=') {
 107         c = fgetwc(source);
 108         goto SS2;
 109     }
 110     if (isSeparator() || c == WEOF) {
 111         goto SECTION;
 112     }
 113     goto error;
 114
 115 SS2:
 116     if (isSeparator() || c == WEOF) {
 117         goto SSECTION;
 118     }
 119     goto error;
 120
 121 SECTION:
 122     tokenType = SECTION;
 123     return EXIT_SUCCESS;
 124
 125 SSECTION:
 126     tokenType = SSECTION;
 127     return EXIT_SUCCESS;
 128
 129 M1:
 130     if (istAlpha()) {
 131         token[tokenFound].value[i] = c;
 132         i++;
 133         c = fgetwc(source);
 134         goto M1;
 135     }
 136     if (isSeparator() || c == WEOF) {
 137         goto MOT;
 138     }
 139     goto error;
 140
 141 initLV1:
 142     if (c == L' ' || c == L'\t') {
 143         c = fgetwc(source);
 144         goto initLV1;
 145     }
 146     if (c == L'\n') {
 147         c = fgetwc(source);
 148         goto initLV1LV2;
 149     }
 150     if (istAlpha()) {
 151         token[tokenFound].value[i] = c;
 152         i++;
 153         c = fgetwc(source);
 154         goto M1;
 155     }
 156     if (c == L'=') {
 157         c = fgetwc(source);
 158         goto S1SS1;
 159     }
 160     if (c == L'>') {
 161         c = fgetwc(source);
 162         goto MC1;
 163     }
 164     if (c == WEOF) {
 165         goto FIN;
 166     }
 167     goto error;
 168
 169 initLV1LV2:
 170     if (isSeparator()) {
 171         c = fgetwc(source);
 172         goto initLV1LV2;
 173     }
 174     if (istAlpha()) {
 175         goto NPARA;
 176     }
 177     if (c == L'>') {
 178         c = fgetwc(source);
 179         goto MC1;
 180     }
 181     if (c == L'=') {
 182         c = fgetwc(source);
 183         goto S1SS1;
 184     }
 185     if (c == WEOF) {
 186         goto FIN;
 187     }
 188     goto error;
 189
 190 NPARA:
 191     tokenType = NPARA;
 192     return EXIT_SUCCESS;
 193
 194 MOT:
 195     tokenType = MOT;
 196     return EXIT_SUCCESS;
 197
 198 MC2:
 199     if (isSeparator() || c == WEOF) {
 200         goto MOTCLE;
 201     }
 202     goto error;
 203
 204 MOTCLE:
 205     tokenType = MOTCLE;
 206     return EXIT_SUCCESS;
 207
 208 FIN:
 209     tokenType = FIN;
 210     return EXIT_SUCCESS;
 211
 212 error:
 213     tokenType = FIN;
 214     return EXIT_FAILURE;
 215 }
 216
 217 int main() {
 218     // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) :
 219     source = fopen("test.txt", "r+");
 220     // Cree et ouvre un fichier target.html en lecture/ecriture
 221     // avec suppression du contenu au prealable :
 222     target = fopen("target.html", "w+");
 223
 224     if (source == NULL) {
 225         printf("Impossible d'ouvrir le fichier source\n");
 226         return -1;
 227     }
 228
 229     if (target == NULL) {
 230         printf("Impossible d'ouvrir le fichier target\n");
 231         return -1;
 232     }
 233
 234     c = fgetwc(source); // lecture du premier caractere
 235     do {
 236         int scanrt = scanner();
 237         if (scanrt == EXIT_FAILURE) {
 238             wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value);
 239             exit(EXIT_FAILURE);
 240         }
 241         if (tokenType == MOT || tokenType == MOTCLE) {
 242             wprintf(L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
 243         } else {
 244             wprintf(L"%20s\n", tokenTypestr[tokenType]);
 245         }
 246         token[tokenFound].type = tokenTypestr[tokenType];
 247         tokenFound++;
 248     } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte
 249
 250     if (source != NULL) fclose(source); // fermeture du fichier source
 251     if (target != NULL) fclose(target); // fermeture du fichier target
 252
 253     return EXIT_SUCCESS;
 254 }