Push the code of a basic lexical analyser for the markdown syntax.

author Jérôme Benoit <jerome.benoit@piment-noir.org>

Sat, 21 Oct 2017 16:34:24 +0000 (18:34 +0200)

committer Jérôme Benoit <jerome.benoit@piment-noir.org>

Sat, 21 Oct 2017 16:34:24 +0000 (18:34 +0200)
author Jérôme Benoit <jerome.benoit@piment-noir.org>
Sat, 21 Oct 2017 16:34:24 +0000 (18:34 +0200)
committer Jérôme Benoit <jerome.benoit@piment-noir.org>
Sat, 21 Oct 2017 16:34:24 +0000 (18:34 +0200)
diff --git a/lexer/main.c b/lexer/main.c

index bb3c6aea0588433425d39213380b64571191fd5b..671d88f94286139bee7cd87159b149312c0e92c2 100644 (file)
--- a/lexer/main.c
+++ b/lexer/main.c
@@ -1,9 +1,228 @@
  #include <stdlib.h>
  #include <stdio.h>
  #include <string.h>
+#include <stdbool.h>
  
  FILE *source, *target = NULL; // input stream read by scanner() and output stream written by main; only target has an explicit initializer
  char c; // current input character, shared by istAlpha(), isSeparator() and scanner(). NOTE(review): fgetc() returns int; keeping it in a char makes the EOF comparisons implementation-dependent
+unsigned int i = 0; // next free index in tokenValue; reset to 0 by main after each token
+char tokenValue[50]; // characters read while scanning the current token
+enum TokenType { // kinds of token that scanner() reports through tokenType
+    MOTCLE, // assigned at the MOTCLE label, reached after a '>'
+    SECTION, // assigned at the SECTION label, reached after a '='
+    SSECTION, // NOTE(review): never assigned by scanner() as shown in this diff
+    NPARA, // assigned at the NPARA label
+    MOT, // assigned at the MOT label, reached from the M1 word state
+    FIN // assigned at end of input and on the error path
+} tokenType; // kind of the most recently scanned token
+const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" }; // printable names, listed in the same order as enum TokenType
+
+bool istAlpha() { // Reports whether the global `c` is an ASCII letter, a digit, or one of . ? ! , ; : - ' #
+    if (c == 'a' || c == 'b' || c == 'c' || c == 'd' || c == 'e' || c == 'f' || c == 'g' || \
+        c == 'h' || c == 'i' || c == 'j' || c == 'k' || c == 'l' || c == 'm' || c == 'n' || \
+        c == 'o' || c == 'p' || c == 'q' || c == 'r' || c == 's' || c == 't' || c == 'u' || \
+        c == 'v' || c == 'w' || c == 'x' || c == 'y' || c == 'z' || \
+        c == 'A' || c == 'B' || c == 'C' || c == 'D' || c == 'E' || c == 'F' || c == 'G' || \
+        c == 'H' || c == 'I' || c == 'J' || c == 'K' || c == 'L' || c == 'M' || c == 'N' || \
+        c == 'O' || c == 'P' || c == 'Q' || c == 'R' || c == 'S' || c == 'T' || c == 'U' || \
+        c == 'V' || c == 'W' || c == 'X' || c == 'Y' || c == 'Z' || \
+        c == '.' || c == '?' || c == '!' || c == ',' || c == ';' || c == ':' || c == '-' || \
+        c == '\''|| c == '#' || \
+        c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || \
+        c == '7' || c == '8' || c == '9') { // despite the name, punctuation and digits are accepted too
+            return true;
+        }
+        return false; // any other character, including whitespace, '>' and '='
+}
+
+bool isSeparator() { // Reports whether the global `c` is a tab, a space or a newline
+    if (c == '\t' || c == ' ' || c == '\n') {
+        return true;
+    }
+    return false; // every other character
+}
+
+int scanner() { // Goto-driven state machine: starts from the global `c`, reads further characters from `source` into tokenValue, sets tokenType; returns 1, or -1 through the error label
+const char* Titre = "Titre"; // first keyword compared in state MC1
+const char* Auteur = "Auteur"; // second keyword compared in state MC1
+unsigned int j = 0; // position inside Titre/Auteur while in state MC1
+
+// Initial state: skip spaces and tabs, then dispatch on the current character
+init:
+    if (c == ' ' || c == '\t') { // blanks keep the machine in the initial state
+        c = fgetc(source);
+        tokenValue[i] = c; // stores the character just read, not the one that was tested. NOTE(review): i is never checked against the 50-byte size of tokenValue, here or in the other states
+        i++;
+        goto init;
+    }
+    if (c == '>') { // '>' leads to the keyword state
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto MC1;
+    }
+    if (c == '=') { // '=' leads to the section marker state
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto S1SS1;
+    }
+    if (istAlpha()) { // a word character leads to the word state
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto M1;
+    }
+    if (c == '\n') { // a newline leads to the "one newline seen" state
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto initLV1;
+    }
+    if (c == EOF) {
+        goto FIN;
+    } else { // anything not handled above is reported as an error
+        goto error;
+    }
+
+MC1: // keyword state: consume characters while they match Titre or Auteur at index j
+    if (c == Titre[j] && j < strlen(Titre) - 1) { // matches Titre at j, and j is before its last index
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        j++;
+        goto MC1;
+    }
+    if (c == Auteur[j] && j < strlen(Auteur) - 1) { // same test against Auteur; note that j is shared by both comparisons
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        j++;
+        goto MC1;
+    } else { // no further match: read one more character and move on to MC2
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto MC2;
+    }
+
+S1SS1: // state entered after a first '='
+    if (isSeparator() || c == EOF) {
+        goto SECTION;
+    }
+    if (c == '=') { // a second '=' moves to SS2
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto SS2;
+    } // NOTE(review): any other character falls through into SS2 below
+
+SS2: // state entered after a second '='
+    if (isSeparator() || c == EOF) {
+        goto SECTION;
+    } // NOTE(review): every other character also falls through into SECTION; SSECTION is never assigned
+
+SECTION: // label shares its name with the enum constant; labels live in their own namespace
+    tokenType = SECTION;
+    return 1;
+
+M1: // word state: keep consuming while istAlpha() accepts the character
+    if (istAlpha()) {
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto M1;
+    }
+    if (isSeparator() || c == EOF) { // a separator or end of input ends the word
+        goto MOT;
+    } // NOTE(review): any other character falls through into initLV1 below
+
+initLV1: // state entered after a newline
+    if (c == '\n' || c == '\t') { // further newlines and tabs stay in this state
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto initLV1;
+    }
+    if (istAlpha()) {
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto M1;
+    }
+    if (c == '=') {
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto S1SS1;
+    }
+    if (c == '>') {
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto MC1;
+    }
+    if (c == '\n') { // NOTE(review): cannot be true here, the first test of this state already consumes '\n', so this jump to initLV1LV2 is never taken
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto initLV1LV2;
+    }
+    if (c == EOF) {
+        goto FIN;
+    } // NOTE(review): any other character (a space, for instance) falls through into initLV1LV2 below
+
+initLV1LV2: // state that can end in a new-paragraph token
+    if (isSeparator()) { // skip tabs, spaces and newlines
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto initLV1LV2;
+    }
+    if (c == '>') {
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto MC1;
+    }
+    if (c == '=') {
+        c = fgetc(source);
+        tokenValue[i] = c;
+        i++;
+        goto S1SS1;
+    }
+    if (c == EOF) {
+        goto FIN;
+    }
+    if (istAlpha()) { // a word character here yields NPARA without being consumed
+        goto NPARA;
+    } // NOTE(review): any remaining character also falls through into NPARA below
+
+NPARA:
+    tokenType = NPARA;
+    return 1;
+
+MOT:
+    tokenType = MOT;
+    return 1;
+
+MC2: // state reached once MC1 stops matching
+    if (isSeparator() || c == EOF) {
+        goto MOTCLE;
+    } // NOTE(review): every other character falls through into MOTCLE as well, so the test does not change the outcome
+
+MOTCLE:
+    tokenType = MOTCLE;
+    return 1;
+
+FIN: // end of input
+    tokenType = FIN;
+    return 1;
+
+error: // unexpected character in the initial state
+    tokenType = FIN; // the error path reports FIN too; only the return value differs
+    return -1;
+}
  
  int main (int argc, char const *argv[]) {
  
@@ -23,11 +242,25 @@ int main (int argc, char const *argv[]) {
          return -1;
      }
  
-    c = fgetc(source);     // lecture du caractere suivant du fichier source
-    while(c != EOF) {        // tant que la fin du fichier n'est pas atteinte
-        fputc(c, target);   // ecrire c dans le fichier target
+    do {
          c = fgetc(source); // lecture du caractere suivant du fichier source
-    }
+        //fputc(c, target);  // ecrire c dans le fichier target
+        tokenValue[i] = c;
+        i++;
+        int scanrt = scanner();
+        if (scanrt == -1) {
+            printf ("Scanner error with token value: %s\n", tokenValue);
+            exit(EXIT_FAILURE);
+        }
+        if (c != EOF) {
+            printf ("Token type found: %s with value: %s\n", tokenTypestr[tokenType], tokenValue);
+        } else {
+            printf ("Token type found: %s\n", tokenTypestr[tokenType]);
+        }
+        // reinit
+        i = 0;
+        memset(tokenValue, 0, sizeof(tokenValue));
+    } while (c != EOF);    // tant que la fin du fichier n'est pas atteinte
  
      if (source != NULL) fclose(source); // fermeture du fichier source
      if (target != NULL) fclose(target); // fermeture du fichier target
diff --git a/lexer/test.txt b/lexer/test.txt

index f2c8453dbdc27b374657e47e802c93233e991f5e..f07f97970d8f42e1659acd05816c405328b397fe 100644 (file)
--- a/lexer/test.txt
+++ b/lexer/test.txt
@@ -3,9 +3,9 @@
  
  Ce fichier vous montre a quoi ressemble un fichier texte lu en entree.
  Pour eviter les problemes de format, les accents ne sont pas autorises.
-Un tel fichier commence toujours par le mot cle Titre, precede d’un
-chevron, et suivi du titre du texte; puis du mot cle Auteur precede d’un
-chevron suivi du nom de l’auteur.
+Un tel fichier commence toujours par le mot cle Titre, precede d'un
+chevron, et suivi du titre du texte; puis du mot cle Auteur precede d'un
+chevron suivi du nom de l'auteur.
  Vient ensuite un ou plusieurs paragraphes optionnels comme celui que vous
  lisez.
author	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Sat, 21 Oct 2017 16:34:24 +0000 (18:34 +0200)
committer	Jérôme Benoit <jerome.benoit@piment-noir.org>
	Sat, 21 Oct 2017 16:34:24 +0000 (18:34 +0200)
lexer/main.c		patch \| blob \| blame \| history
lexer/test.txt		patch \| blob \| blame \| history