Add a FIXME comment about multibyte character support.

[TP_AL_C.git] / lexer / main.c
diff --git a/lexer/main.c b/lexer/main.c

index c8b897978c6c27e9945b1e5de0a2565132718bce..40ff04d540f95c4b7e6fa16b984d8e188997af6d 100644 (file)
--- a/lexer/main.c
+++ b/lexer/main.c
@@ -25,7 +25,6 @@ enum TokenType {
      FIN
  } tokenType;
  const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" };
-unsigned int i = 0;
  
  /* It looks silly to check each character individually, but for debugging it's just the way to go */
  bool istAlpha() {
@@ -41,6 +40,7 @@ bool istAlpha() {
          c == L'\''|| c == L'#' || \
          c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
          c == L'7' || c == L'8' || c == L'9' || \
+        // FIXME: Accented characters (aka multibyte characters) support is still buggy
          c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
          c == L'ù' || c == L'û' || \
          c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
@@ -58,145 +58,117 @@ bool isSeparator() {
  }
  
  int scanner() {
-    const wchar_t* Titre = L"Titre";
-    const wchar_t* Auteur = L"Auteur";
-    unsigned int j = 0;
+    unsigned int i = 0;
+    wchar_t m[6];
  
-// The main loop get the next character
  init:
      if (c == L' ' || c == L'\t') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto init;
      }
      if (c == L'\n') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto initLV1;
      }
      if (c == L'>') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto MC1;
      }
      if (c == L'=') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto S1SS1;
      }
      if (istAlpha()) {
-        c = fgetwc(source);
          token[tokenFound].value[i] = c;
          i++;
+        c = fgetwc(source);
          goto M1;
      }
      if (c == WEOF) {
          goto FIN;
-    } else {
-        goto error;
      }
+    goto error;
  
  MC1:
-    // FIXME: Partial match need a rewind in the characters extraction from the file
-    if (c == (wint_t)Titre[j] && j < wcslen(Titre) - 1) {
+    if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
+        wcscpy((wchar_t*)token[tokenFound].value, L">Auteur");
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
-        j++;
-        goto MC1;
+        goto MC2;
      }
-    if (c == (wint_t)Auteur[j] && j < wcslen(Auteur) - 1) {
+    if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
+        wcscpy((wchar_t*)token[tokenFound].value, L">Titre");
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
-        j++;
-        goto MC1;
-    } else {
-        c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto MC2;
      }
+    goto error;
  
  S1SS1:
      if (c == L'=') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto SS2;
      }
      if (isSeparator() || c == WEOF) {
          goto SECTION;
      }
+    goto error;
  
  SS2:
      if (isSeparator() || c == WEOF) {
          goto SSECTION;
      }
+    goto error;
  
  SECTION:
      tokenType = SECTION;
-    return 1;
+    return EXIT_SUCCESS;
  
  SSECTION:
      tokenType = SSECTION;
-    return 1;
+    return EXIT_SUCCESS;
  
  M1:
      if (istAlpha()) {
-        c = fgetwc(source);
          token[tokenFound].value[i] = c;
          i++;
+        c = fgetwc(source);
          goto M1;
      }
      if (isSeparator() || c == WEOF) {
          goto MOT;
      }
+    goto error;
  
  initLV1:
      if (c == L' ' || c == L'\t') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto initLV1;
      }
      if (c == L'\n') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto initLV1LV2;
      }
      if (istAlpha()) {
-        c = fgetwc(source);
          token[tokenFound].value[i] = c;
          i++;
+        c = fgetwc(source);
          goto M1;
      }
      if (c == L'=') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto S1SS1;
      }
      if (c == L'>') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto MC1;
      }
      if (c == WEOF) {
          goto FIN;
      }
+    goto error;
  
  initLV1LV2:
      if (isSeparator()) {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto initLV1LV2;
      }
      if (istAlpha()) {
@@ -204,50 +176,47 @@ initLV1LV2:
      }
      if (c == L'>') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto MC1;
      }
      if (c == L'=') {
          c = fgetwc(source);
-        token[tokenFound].value[i] = c;
-        i++;
          goto S1SS1;
      }
      if (c == WEOF) {
          goto FIN;
      }
+    goto error;
  
  NPARA:
      tokenType = NPARA;
-    return 1;
+    return EXIT_SUCCESS;
  
  MOT:
      tokenType = MOT;
-    return 1;
+    return EXIT_SUCCESS;
  
  MC2:
      if (isSeparator() || c == WEOF) {
          goto MOTCLE;
      }
+    goto error;
  
  MOTCLE:
      tokenType = MOTCLE;
-    return 1;
+    return EXIT_SUCCESS;
  
  FIN:
      tokenType = FIN;
-    return 1;
+    return EXIT_SUCCESS;
  
  error:
      tokenType = FIN;
-    return -1;
+    return EXIT_FAILURE;
  }
  
  int main() {
-
      // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) :
-    source = fopen("test.txt", "r");
+    source = fopen("test.txt", "r+");
      // Cree et ouvre un fichier target.html en lecture/ecriture
      // avec suppression du contenu au prealable :
      target = fopen("target.html", "w+");
@@ -262,29 +231,24 @@ int main() {
          return -1;
      }
  
+    c = fgetwc(source); // lecture du premier caractere
      do {
-        c = fgetwc(source); // lecture du caractere suivant du fichier source
-        token[tokenFound].value[i] = c;
-        i++;
          int scanrt = scanner();
-        if (scanrt == -1) {
+        if (scanrt == EXIT_FAILURE) {
              wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value);
              exit(EXIT_FAILURE);
          }
-        if (c != WEOF) {
-            wprintf(L"Token type found: %s with value: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
+        if (tokenType == MOT || tokenType == MOTCLE) {
+            wprintf(L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
          } else {
-            wprintf(L"Token type found: %s\n", tokenTypestr[tokenType]);
+            wprintf(L"%20s\n", tokenTypestr[tokenType]);
          }
          token[tokenFound].type = tokenTypestr[tokenType];
          tokenFound++;
-        // reinit token.value array counter
-        i = 0;
-    //} while (c != WEOF); // tant que la fin du fichier n'est pas atteinte
-    } while (!feof(source)); // tant que la fin du fichier n'est pas atteinte
+    } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte
  
      if (source != NULL) fclose(source); // fermeture du fichier source
      if (target != NULL) fclose(target); // fermeture du fichier target
  
-    return 0;
+    return EXIT_SUCCESS;
  }