--- /dev/null
+/* Lexical analyzer */
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "global_vars.h"
+#include "print_helper.h"
+
+wint_t c;
+
+/* It looks silly to check for each characters but for debugging, it's just the way to go */
+static bool isAlphaNum() {
+ if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
+ c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
+ c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
+ c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
+ c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
+ c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
+ c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
+ c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
+ c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
+ c == L'\''|| c == L'#' || \
+ c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
+ c == L'7' || c == L'8' || c == L'9' || \
+ // FIXME: Accentued characters (aka multibytes characters) support is still buggy
+ c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
+ c == L'ù' || c == L'û' || \
+ c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
+ c == L'Ù' || c == L'Û') {
+ return true;
+ }
+ return false;
+}
+
+static bool isSeparator() {
+ if (c == L'\t' || c == L' ' || c == L'\n') {
+ return true;
+ }
+ return false;
+}
+
+static bool isEOF() {
+ if (c == WEOF) {
+ return true;
+ }
+ return false;
+}
+
+int scanner() {
+ unsigned int i = 0;
+ wchar_t m[6];
+
+init:
+ if (c == L' ' || c == L'\t') {
+ c = fgetwc(source);
+ goto init;
+ }
+ if (c == L'\n') {
+ c = fgetwc(source);
+ goto initLV1;
+ }
+ if (c == L'>') {
+ c = fgetwc(source);
+ goto MC1;
+ }
+ if (c == L'=') {
+ c = fgetwc(source);
+ goto S1SS1;
+ }
+ if (isAlphaNum()) {
+ token[tokenFound].value[i] = c;
+ i++;
+ c = fgetwc(source);
+ goto M1;
+ }
+ if (isEOF()) {
+ goto FIN;
+ }
+ goto error;
+
+MC1:
+ if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
+ wcscpy((wchar_t*)token[tokenFound].value, L">Auteur");
+ c = fgetwc(source);
+ goto MC2;
+ }
+ if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
+ wcscpy((wchar_t*)token[tokenFound].value, L">Titre");
+ c = fgetwc(source);
+ goto MC2;
+ }
+ goto error;
+
+S1SS1:
+ if (c == L'=') {
+ c = fgetwc(source);
+ goto SS2;
+ }
+ if (isSeparator() || isEOF()) {
+ goto SECTION;
+ }
+ goto error;
+
+SS2:
+ if (isSeparator() || isEOF()) {
+ goto SSECTION;
+ }
+ goto error;
+
+SECTION:
+ tokenType = SECTION;
+ return EXIT_SUCCESS;
+
+SSECTION:
+ tokenType = SSECTION;
+ return EXIT_SUCCESS;
+
+M1:
+ if (isAlphaNum()) {
+ token[tokenFound].value[i] = c;
+ i++;
+ c = fgetwc(source);
+ goto M1;
+ }
+ if (isSeparator() || isEOF()) {
+ goto MOT;
+ }
+ goto error;
+
+initLV1:
+ if (c == L' ' || c == L'\t') {
+ c = fgetwc(source);
+ goto initLV1;
+ }
+ if (c == L'\n') {
+ c = fgetwc(source);
+ goto initLV1LV2;
+ }
+ if (isAlphaNum()) {
+ token[tokenFound].value[i] = c;
+ i++;
+ c = fgetwc(source);
+ goto M1;
+ }
+ if (c == L'=') {
+ c = fgetwc(source);
+ goto S1SS1;
+ }
+ if (c == L'>') {
+ c = fgetwc(source);
+ goto MC1;
+ }
+ if (isEOF()) {
+ goto FIN;
+ }
+ goto error;
+
+initLV1LV2:
+ if (isSeparator()) {
+ c = fgetwc(source);
+ goto initLV1LV2;
+ }
+ if (isAlphaNum()) {
+ goto NPARA;
+ }
+ if (c == L'>') {
+ c = fgetwc(source);
+ goto MC1;
+ }
+ if (c == L'=') {
+ c = fgetwc(source);
+ goto S1SS1;
+ }
+ if (isEOF()) {
+ goto FIN;
+ }
+ goto error;
+
+NPARA:
+ tokenType = NPARA;
+ return EXIT_SUCCESS;
+
+MOT:
+ tokenType = MOT;
+ return EXIT_SUCCESS;
+
+MC2:
+ if (isSeparator() || isEOF()) {
+ goto MOTCLE;
+ }
+ goto error;
+
+MOTCLE:
+ tokenType = MOTCLE;
+ return EXIT_SUCCESS;
+
+FIN:
+ tokenType = FIN;
+ return EXIT_SUCCESS;
+
+error:
+ if (tokenType == MOT || tokenType == MOTCLE) {
+ wpr_error(L"Scanner error with token type: %s and value: %ls\n",
+ tokenTypestr[tokenType],
+ token[tokenFound].value);
+ } else {
+ wpr_error(L"Scanner error with token type: %s\n",
+ tokenTypestr[tokenType]);
+ }
+ tokenType = FIN;
+ exit(EXIT_FAILURE);
+}
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
-#include <wchar.h>
+#include <getopt.h>
-#define TOKEN_MAX 500
+#include "global_vars.h"
+#include "lexical_analyzer.h"
+#include "syntactic_analyzer.h"
+#include "print_helper.h"
-struct token_s {
- const char* type;
- wint_t value[50];
-};
-
-struct token_s token[TOKEN_MAX] = {{NULL, {0}}};
-
-FILE *source = NULL, *target = NULL;
-wint_t c;
-unsigned int tokenFound = 0;
-enum TokenType {
- MOTCLE,
- SECTION,
- SSECTION,
- NPARA,
- MOT,
- FIN
-} tokenType;
-const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" };
-
-/* It looks silly to check for each characters but for debugging, it's just the way to go */
-bool istAlpha() {
- if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
- c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
- c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
- c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
- c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
- c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
- c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
- c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
- c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
- c == L'\''|| c == L'#' || \
- c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
- c == L'7' || c == L'8' || c == L'9' || \
- // FIXME: Accentued characters (aka multibytes characters) support is still buggy
- c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
- c == L'ù' || c == L'û' || \
- c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
- c == L'Ù' || c == L'Û') {
- return true;
+void do_lexical_analysis() {
+ c = fgetwc(source); // lecture du premier caractere
+ do {
+ scanner();
+ if (tokenType == MOT || tokenType == MOTCLE) {
+ fwprintf(target, L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
+ } else {
+ fwprintf(target, L"%20s\n", tokenTypestr[tokenType]);
}
- return false;
+ token[tokenFound].type = tokenTypestr[tokenType];
+ tokenFound++;
+ } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte
}
-bool isSeparator() {
- if (c == L'\t' || c == L' ' || c == L'\n') {
- return true;
- }
- return false;
+void do_syntactic_analysis() {
+ c = fgetwc(source); // lecture du premier caractere
+ do {
+ analyze_AXIOME();
+ } while (tokenType != FIN);
}
-int scanner() {
- unsigned int i = 0;
- wchar_t m[6];
-
-init:
- if (c == L' ' || c == L'\t') {
- c = fgetwc(source);
- goto init;
- }
- if (c == L'\n') {
- c = fgetwc(source);
- goto initLV1;
- }
- if (c == L'>') {
- c = fgetwc(source);
- goto MC1;
- }
- if (c == L'=') {
- c = fgetwc(source);
- goto S1SS1;
- }
- if (istAlpha()) {
- token[tokenFound].value[i] = c;
- i++;
- c = fgetwc(source);
- goto M1;
- }
- if (c == WEOF) {
- goto FIN;
- }
- goto error;
-
-MC1:
- if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
- wcscpy((wchar_t*)token[tokenFound].value, L">Auteur");
- c = fgetwc(source);
- goto MC2;
- }
- if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
- wcscpy((wchar_t*)token[tokenFound].value, L">Titre");
- c = fgetwc(source);
- goto MC2;
- }
- goto error;
-
-S1SS1:
- if (c == L'=') {
- c = fgetwc(source);
- goto SS2;
- }
- if (isSeparator() || c == WEOF) {
- goto SECTION;
- }
- goto error;
-
-SS2:
- if (isSeparator() || c == WEOF) {
- goto SSECTION;
- }
- goto error;
-
-SECTION:
- tokenType = SECTION;
- return EXIT_SUCCESS;
-
-SSECTION:
- tokenType = SSECTION;
- return EXIT_SUCCESS;
-
-M1:
- if (istAlpha()) {
- token[tokenFound].value[i] = c;
- i++;
- c = fgetwc(source);
- goto M1;
- }
- if (isSeparator() || c == WEOF) {
- goto MOT;
- }
- goto error;
-
-initLV1:
- if (c == L' ' || c == L'\t') {
- c = fgetwc(source);
- goto initLV1;
- }
- if (c == L'\n') {
- c = fgetwc(source);
- goto initLV1LV2;
- }
- if (istAlpha()) {
- token[tokenFound].value[i] = c;
- i++;
- c = fgetwc(source);
- goto M1;
- }
- if (c == L'=') {
- c = fgetwc(source);
- goto S1SS1;
- }
- if (c == L'>') {
- c = fgetwc(source);
- goto MC1;
- }
- if (c == WEOF) {
- goto FIN;
- }
- goto error;
+void print_usage(const char* name) {
+ fprintf(stdout,"Usage: %s [options]\n"
+ "Where [options] are:\n"
+ " -h, --help: display this help text\n"
+ " -l, --lexical-only: do only the lexical analysis\n"
+ " -i, --input<filename>: use <filename> as input file instead of standard input\n"
+ " -o, --output<filename>: use <filename> as output file instead of standard output\n",
+ name);
+}
-initLV1LV2:
- if (isSeparator()) {
- c = fgetwc(source);
- goto initLV1LV2;
- }
- if (istAlpha()) {
- goto NPARA;
- }
- if (c == L'>') {
- c = fgetwc(source);
- goto MC1;
- }
- if (c == L'=') {
- c = fgetwc(source);
- goto S1SS1;
- }
- if (c == WEOF) {
- goto FIN;
+int main(int argc, char **argv) {
+ /* In and out files name */
+ const char* in_file = NULL;
+ const char* out_file = NULL;
+ static int hflag = 0;
+ static int lflag = 0;
+
+ /* getopt_long stores the option index here. */
+ int option_index = 0;
+
+ static struct option long_options[] =
+ {
+ {"help", no_argument, &hflag, 1},
+ {"input", optional_argument, NULL, 'i'},
+ {"lexical-only", no_argument, &lflag, 1},
+ {"output", optional_argument, NULL, 'o'},
+ {0, 0, 0, 0}
+ };
+
+ int c_in;
+
+ while ((c_in = getopt_long(argc, argv, "hi::lo::", long_options,
+ &option_index)) != -1) {
+ switch (c_in) {
+ case 'h':
+ hflag = 1;
+ break;
+ case 'i':
+ if (optarg != NULL) {
+ in_file = optarg;
+ }
+ break;
+ case 'l':
+ lflag = 1;
+ break;
+ case 'o':
+ if (optarg != NULL) {
+ out_file = optarg;
+ }
+ break;
+ case 0:
+ /* getopt_long() set a variable, just keep going */
+ break;
+ case ':':
+ /* missing option argument */
+ pr_error("%s: option '-%c' requires an argument\n",
+ argv[0], optopt);
+ break;
+ case '?':
+ default:
+ /* invalid option */
+ pr_error("%s: option '-%c' is invalid: ignored\n",
+ argv[0], optopt);
+ break;
+ }
}
- goto error;
-
-NPARA:
- tokenType = NPARA;
- return EXIT_SUCCESS;
-MOT:
- tokenType = MOT;
- return EXIT_SUCCESS;
-
-MC2:
- if (isSeparator() || c == WEOF) {
- goto MOTCLE;
+ if (in_file != NULL) {
+ // Ouvre le fichier source en lecture seulement (le fichier doit exister) :
+ source = fopen(in_file, "r+");
+ if (source == NULL) {
+ pr_error("Impossible d'ouvrir le fichier %s\n", in_file);
+ return EXIT_FAILURE;
+ }
+ } else {
+ source = stdin;
}
- goto error;
-MOTCLE:
- tokenType = MOTCLE;
- return EXIT_SUCCESS;
-
-FIN:
- tokenType = FIN;
- return EXIT_SUCCESS;
-
-error:
- tokenType = FIN;
- return EXIT_FAILURE;
-}
-
-int main() {
- // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) :
- source = fopen("test.txt", "r+");
- // Cree et ouvre un fichier target.html en lecture/ecriture
- // avec suppression du contenu au prealable :
- target = fopen("target.html", "w+");
-
- if (source == NULL) {
- printf("Impossible d'ouvrir le fichier source\n");
- return -1;
+ if (out_file != NULL) {
+ // Cree et ouvre le fichier cible en lecture/ecriture
+ // avec suppression du contenu au prealable :
+ target = fopen(out_file, "w+");
+ if (target == NULL) {
+ pr_error("Impossible d'ouvrir le fichier %s\n", out_file);
+ return EXIT_FAILURE;
+ }
+ } else {
+ target = stdout;
}
- if (target == NULL) {
- printf("Impossible d'ouvrir le fichier target\n");
- return -1;
+ if (hflag) {
+ print_usage(argv[0]);
+ } else if (lflag){
+ do_lexical_analysis();
+ } else {
+ do_syntactic_analysis();
}
- c = fgetwc(source); // lecture du premier caractere
- do {
- int scanrt = scanner();
- if (scanrt == EXIT_FAILURE) {
- wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value);
- exit(EXIT_FAILURE);
- }
- if (tokenType == MOT || tokenType == MOTCLE) {
- wprintf(L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
- } else {
- wprintf(L"%20s\n", tokenTypestr[tokenType]);
- }
- token[tokenFound].type = tokenTypestr[tokenType];
- tokenFound++;
- } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte
-
if (source != NULL) fclose(source); // fermeture du fichier source
if (target != NULL) fclose(target); // fermeture du fichier target
--- /dev/null
+/* Syntactic analyzer */
+
+#include <stdbool.h>
+#include <stdarg.h>
+
+#include "global_vars.h"
+#include "lexical_analyzer.h"
+
+/* Syntactic analyzer functions implementation */
+
+enum TokenType tokenType;
+
+static bool analyze_TEXT() {
+ bool rtval = true;
+ if (tokenType == MOT) {
+ scanner();
+ rtval = analyze_TEXT();
+ } else if (tokenType != MOTCLE && tokenType != NPARA && tokenType != SECTION && \
+ tokenType != SSECTION && tokenType != FIN) {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_P() {
+ bool rtval = true;
+ if (tokenType == NPARA) {
+ scanner();
+ if (tokenType == MOT) {
+ scanner();
+ rtval = analyze_TEXT();
+ rtval = analyze_P();
+ }
+ } else if (tokenType != SECTION && tokenType != SSECTION && tokenType != FIN) {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_HEAD() {
+ bool rtval = true;
+ if (tokenType == MOTCLE) {
+ scanner();
+ rtval = analyze_TEXT();
+ if (tokenType == MOTCLE) {
+ scanner();
+ rtval = analyze_TEXT();
+ } else {
+ rtval = false;
+ }
+ } else {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_H1() {
+ bool rtval = true;
+ if (tokenType == SECTION) {
+ scanner();
+ rtval = analyze_TEXT();
+ } else {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_H2() {
+ bool rtval = true;
+ if (tokenType == SSECTION) {
+ scanner();
+ rtval = analyze_TEXT();
+ } else {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_S2() {
+ bool rtval = true;
+ if (analyze_H2()) {
+ rtval = analyze_P();
+ rtval = analyze_S2();
+ } else if (tokenType != SECTION && tokenType != FIN) {
+ rtval = false;
+ } else {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_S1() {
+ bool rtval = true;
+ if (analyze_H1()) {
+ rtval = analyze_P();
+ rtval = analyze_S2();
+ rtval = analyze_S1();
+ } else if (tokenType != FIN) {
+ rtval = false;
+ } else {
+ rtval = false;
+ }
+ return rtval;
+}
+
+static bool analyze_BODY() {
+ bool rtval = true;
+ rtval = analyze_P();
+ rtval = analyze_S1();
+ return rtval;
+}
+
+bool analyze_AXIOME() {
+ bool rtval = true;
+ scanner();
+ rtval = analyze_HEAD();
+ rtval = analyze_BODY();
+ if (tokenType != FIN) {
+ rtval = false;
+ }
+ return rtval;
+}