lexer/main.c

   #include <locale.h>
   #include <stdbool.h>
   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>
   #include <wchar.h>
   6
   /* Number of slots in the global token table. */
   #define TOKEN_MAX 500

   /*
    * One lexical token.
    *
    * type  - name of the token's type; main() stores an entry of tokenTypestr
    *         here.  NULL while the slot is unused.
    * value - text of the token as a NUL-terminated wide string.  The element
    *         type is wchar_t (not wint_t) because the buffer is filled with
    *         wcscpy() and printed with wprintf("%ls"), both of which operate
    *         on wchar_t strings.
    */
   struct token_s {
       const char *type;
       wchar_t value[50];
   };

   /*
    * Token table.  Only the first element is initialised explicitly; the
    * remaining slots are zero-initialised as for any object with static
    * storage, so every value starts out as an empty string.
    */
   struct token_s token[TOKEN_MAX] = {{NULL, {0}}};
  15
  /* Input stream read by the scanner; main() opens it on "test.txt". */
  FILE *source = NULL;
  /* Output stream; main() opens it on "target.html". */
  FILE *target = NULL;

  /*
   * Lookahead: the most recent character read from `source`.  It is shared by
   * scanner() and the character-class helpers, and holds WEOF once fgetwc()
   * reports end of input.
   */
  wint_t c;

  /* Index of the token slot currently being filled (= tokens recorded so far). */
  unsigned int tokenFound = 0;

  /*
   * Kinds of token produced by scanner().
   *
   * MOTCLE   - keyword introduced by '>' (">Auteur" or ">Titre")
   * SECTION  - a single '=' followed by a separator or end of input
   * SSECTION - "==" followed by a separator or end of input
   * NPARA    - two newlines (optionally with blanks in between) followed by
   *            a word character
   * MOT      - a word
   * FIN      - end of input; scanner() also sets it when it reports an error
   */
  enum TokenType {
      MOTCLE = 0,
      SECTION = 1,
      SSECTION = 2,
      NPARA = 3,
      MOT = 4,
      FIN = 5
  };

  /* Type of the token most recently returned by scanner(). */
  enum TokenType tokenType;

  /* Printable name of each TokenType, indexed by the enumerator value. */
  const char* tokenTypestr[] = {
      [MOTCLE]   = "MOTCLE",
      [SECTION]  = "SECTION",
      [SSECTION] = "SSECTION",
      [NPARA]    = "NPARA",
      [MOT]      = "MOT",
      [FIN]      = "FIN"
  };
  28
  29 /* It looks silly to check for each characters but for debugging, it's just the way to go */
  30 bool istAlpha() {
  31     if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
  32         c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
  33         c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
  34         c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
  35         c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
  36         c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
  37         c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
  38         c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
  39         c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
  40         c == L'\''|| c == L'#' || \
  41         c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
  42         c == L'7' || c == L'8' || c == L'9' || \
  43         c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
  44         c == L'ù' || c == L'û' || \
  45         c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
  46         c == L'Ù' || c == L'Û') {
  47             return true;
  48         }
  49         return false;
  50 }
  51
  52 bool isSeparator() {
  53     if (c == L'\t' || c == L' ' || c == L'\n') {
  54         return true;
  55     }
  56     return false;
  57 }
  58
  59 int scanner() {
  60     unsigned int i = 0;
  61     wchar_t m[6];
  62
  63 init:
  64     if (c == L' ' || c == L'\t') {
  65         c = fgetwc(source);
  66         goto init;
  67     }
  68     if (c == L'\n') {
  69         c = fgetwc(source);
  70         goto initLV1;
  71     }
  72     if (c == L'>') {
  73         c = fgetwc(source);
  74         goto MC1;
  75     }
  76     if (c == L'=') {
  77         c = fgetwc(source);
  78         goto S1SS1;
  79     }
  80     if (istAlpha()) {
  81         token[tokenFound].value[i] = c;
  82         i++;
  83         c = fgetwc(source);
  84         goto M1;
  85     }
  86     if (c == WEOF) {
  87         goto FIN;
  88     }
  89     goto error;
  90
  91 MC1:
  92     if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
  93         wcscpy((wchar_t*)token[tokenFound].value, L">Auteur");
  94         c = fgetwc(source);
  95         goto MC2;
  96     }
  97     if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
  98         wcscpy((wchar_t*)token[tokenFound].value, L">Titre");
  99         c = fgetwc(source);
 100         goto MC2;
 101     }
 102     goto error;
 103
 104 S1SS1:
 105     if (c == L'=') {
 106         c = fgetwc(source);
 107         goto SS2;
 108     }
 109     if (isSeparator() || c == WEOF) {
 110         goto SECTION;
 111     }
 112     goto error;
 113
 114 SS2:
 115     if (isSeparator() || c == WEOF) {
 116         goto SSECTION;
 117     }
 118     goto error;
 119
 120 SECTION:
 121     tokenType = SECTION;
 122     return EXIT_SUCCESS;
 123
 124 SSECTION:
 125     tokenType = SSECTION;
 126     return EXIT_SUCCESS;
 127
 128 M1:
 129     if (istAlpha()) {
 130         token[tokenFound].value[i] = c;
 131         i++;
 132         c = fgetwc(source);
 133         goto M1;
 134     }
 135     if (isSeparator() || c == WEOF) {
 136         goto MOT;
 137     }
 138     goto error;
 139
 140 initLV1:
 141     if (c == L' ' || c == L'\t') {
 142         c = fgetwc(source);
 143         goto initLV1;
 144     }
 145     if (c == L'\n') {
 146         c = fgetwc(source);
 147         goto initLV1LV2;
 148     }
 149     if (istAlpha()) {
 150         token[tokenFound].value[i] = c;
 151         i++;
 152         c = fgetwc(source);
 153         goto M1;
 154     }
 155     if (c == L'=') {
 156         c = fgetwc(source);
 157         goto S1SS1;
 158     }
 159     if (c == L'>') {
 160         c = fgetwc(source);
 161         goto MC1;
 162     }
 163     if (c == WEOF) {
 164         goto FIN;
 165     }
 166     goto error;
 167
 168 initLV1LV2:
 169     if (isSeparator()) {
 170         c = fgetwc(source);
 171         goto initLV1LV2;
 172     }
 173     if (istAlpha()) {
 174         goto NPARA;
 175     }
 176     if (c == L'>') {
 177         c = fgetwc(source);
 178         goto MC1;
 179     }
 180     if (c == L'=') {
 181         c = fgetwc(source);
 182         goto S1SS1;
 183     }
 184     if (c == WEOF) {
 185         goto FIN;
 186     }
 187     goto error;
 188
 189 NPARA:
 190     tokenType = NPARA;
 191     return EXIT_SUCCESS;
 192
 193 MOT:
 194     tokenType = MOT;
 195     return EXIT_SUCCESS;
 196
 197 MC2:
 198     if (isSeparator() || c == WEOF) {
 199         goto MOTCLE;
 200     }
 201     goto error;
 202
 203 MOTCLE:
 204     tokenType = MOTCLE;
 205     return EXIT_SUCCESS;
 206
 207 FIN:
 208     tokenType = FIN;
 209     return EXIT_SUCCESS;
 210
 211 error:
 212     tokenType = FIN;
 213     return EXIT_FAILURE;
 214 }
 215
 216 int main() {
 217     // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) :
 218     source = fopen("test.txt", "r+");
 219     // Cree et ouvre un fichier target.html en lecture/ecriture
 220     // avec suppression du contenu au prealable :
 221     target = fopen("target.html", "w+");
 222
 223     if (source == NULL) {
 224         printf("Impossible d'ouvrir le fichier source\n");
 225         return -1;
 226     }
 227
 228     if (target == NULL) {
 229         printf("Impossible d'ouvrir le fichier target\n");
 230         return -1;
 231     }
 232
 233     c = fgetwc(source); // lecture du premier caractere
 234     do {
 235         int scanrt = scanner();
 236         if (scanrt == EXIT_FAILURE) {
 237             wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value);
 238             exit(EXIT_FAILURE);
 239         }
 240         if (tokenType == MOT || tokenType == MOTCLE) {
 241             wprintf(L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
 242         } else {
 243             wprintf(L"%20s\n", tokenTypestr[tokenType]);
 244         }
 245         token[tokenFound].type = tokenTypestr[tokenType];
 246         tokenFound++;
 247     } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte
 248
 249     if (source != NULL) fclose(source); // fermeture du fichier source
 250     if (target != NULL) fclose(target); // fermeture du fichier target
 251
 252     return EXIT_SUCCESS;
 253 }