From 9ed84d89d23a6198fd4157bfe93424b7b7582332 Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Wed, 15 Nov 2017 20:57:55 +0100 Subject: [PATCH] Major code revamping: MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit * Separate lexical analysis and syntactic analysis code; * Add some print helper functions; * Allow to use stdin and stdout as input and output file; And probably some things I've forgot ... Signed-off-by: Jérôme Benoit --- lexer/global_vars.c | 6 + lexer/global_vars.h | 30 ++++ lexer/lexical_analyzer.c | 212 +++++++++++++++++++++++ lexer/lexical_analyzer.h | 6 + lexer/main.c | 339 ++++++++++++------------------------- lexer/print_helper.c | 35 ++++ lexer/print_helper.h | 10 ++ lexer/syntactic_analyzer.c | 122 +++++++++++++ lexer/syntactic_analyzer.h | 8 + lexer/test.txt | 1 - 10 files changed, 539 insertions(+), 230 deletions(-) create mode 100644 lexer/global_vars.c create mode 100644 lexer/global_vars.h create mode 100644 lexer/lexical_analyzer.c create mode 100644 lexer/lexical_analyzer.h create mode 100644 lexer/print_helper.c create mode 100644 lexer/print_helper.h create mode 100644 lexer/syntactic_analyzer.c create mode 100644 lexer/syntactic_analyzer.h diff --git a/lexer/global_vars.c b/lexer/global_vars.c new file mode 100644 index 0000000..bec8942 --- /dev/null +++ b/lexer/global_vars.c @@ -0,0 +1,6 @@ +#include "global_vars.h" + +FILE *source = NULL, *target = NULL; +struct token_s token[TOKEN_MAX] = {{NULL, {0}}}; +unsigned int tokenFound = 0; +const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" }; diff --git a/lexer/global_vars.h b/lexer/global_vars.h new file mode 100644 index 0000000..6e7ddea --- /dev/null +++ b/lexer/global_vars.h @@ -0,0 +1,30 @@ +#ifndef GLOBAL_VARS_H_ +#define GLOBAL_VARS_H_ + +#include +#include + +#define TOKEN_MAX 500 + +extern FILE *source, *target; + +struct token_s { + const char* type; + wint_t value[50]; +}; 
+extern struct token_s token[TOKEN_MAX]; + +extern wint_t c; + +extern unsigned int tokenFound; +extern enum TokenType { + MOTCLE, + SECTION, + SSECTION, + NPARA, + MOT, + FIN +} tokenType; +extern const char* tokenTypestr[]; + +#endif /* GLOBAL_VARS_H_ */ diff --git a/lexer/lexical_analyzer.c b/lexer/lexical_analyzer.c new file mode 100644 index 0000000..36acd5e --- /dev/null +++ b/lexer/lexical_analyzer.c @@ -0,0 +1,212 @@ +/* Lexical analyzer */ + +#include +#include + +#include "global_vars.h" +#include "print_helper.h" + +wint_t c; + +/* It looks silly to check for each characters but for debugging, it's just the way to go */ +static bool isAlphaNum() { + if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \ + c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \ + c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \ + c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \ + c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \ + c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \ + c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \ + c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \ + c == L'.' || c == L'?' || c == L'!' 
|| c == L',' || c == L';' || c == L':' || c == L'-' || \ + c == L'\''|| c == L'#' || \ + c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \ + c == L'7' || c == L'8' || c == L'9' || \ + // FIXME: Accentued characters (aka multibytes characters) support is still buggy + c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \ + c == L'ù' || c == L'û' || \ + c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \ + c == L'Ù' || c == L'Û') { + return true; + } + return false; +} + +static bool isSeparator() { + if (c == L'\t' || c == L' ' || c == L'\n') { + return true; + } + return false; +} + +static bool isEOF() { + if (c == WEOF) { + return true; + } + return false; +} + +int scanner() { + unsigned int i = 0; + wchar_t m[6]; + +init: + if (c == L' ' || c == L'\t') { + c = fgetwc(source); + goto init; + } + if (c == L'\n') { + c = fgetwc(source); + goto initLV1; + } + if (c == L'>') { + c = fgetwc(source); + goto MC1; + } + if (c == L'=') { + c = fgetwc(source); + goto S1SS1; + } + if (isAlphaNum()) { + token[tokenFound].value[i] = c; + i++; + c = fgetwc(source); + goto M1; + } + if (isEOF()) { + goto FIN; + } + goto error; + +MC1: /* fgetws() returns NULL on EOF/error, so test it before calling wcscmp() */ + if (c == L'A' && fgetws(m, 6, source) != NULL && !wcscmp(m, L"uteur")) { + wcscpy((wchar_t*)token[tokenFound].value, L">Auteur"); + c = fgetwc(source); + goto MC2; + } + if (c == L'T' && fgetws(m, 5, source) != NULL && !wcscmp(m, L"itre")) { + wcscpy((wchar_t*)token[tokenFound].value, L">Titre"); + c = fgetwc(source); + goto MC2; + } + goto error; + +S1SS1: + if (c == L'=') { + c = fgetwc(source); + goto SS2; + } + if (isSeparator() || isEOF()) { + goto SECTION; + } + goto error; + +SS2: + if (isSeparator() || isEOF()) { + goto SSECTION; + } + goto error; + +SECTION: + tokenType = SECTION; + return EXIT_SUCCESS; + +SSECTION: + tokenType = SSECTION; + return EXIT_SUCCESS; + +M1: + if (isAlphaNum()) { + token[tokenFound].value[i] = c; + i++; + c = 
fgetwc(source); + goto M1; + } + if (isSeparator() || isEOF()) { + goto MOT; + } + goto error; + +initLV1: + if (c == L' ' || c == L'\t') { + c = fgetwc(source); + goto initLV1; + } + if (c == L'\n') { + c = fgetwc(source); + goto initLV1LV2; + } + if (isAlphaNum()) { + token[tokenFound].value[i] = c; + i++; + c = fgetwc(source); + goto M1; + } + if (c == L'=') { + c = fgetwc(source); + goto S1SS1; + } + if (c == L'>') { + c = fgetwc(source); + goto MC1; + } + if (isEOF()) { + goto FIN; + } + goto error; + +initLV1LV2: + if (isSeparator()) { + c = fgetwc(source); + goto initLV1LV2; + } + if (isAlphaNum()) { + goto NPARA; + } + if (c == L'>') { + c = fgetwc(source); + goto MC1; + } + if (c == L'=') { + c = fgetwc(source); + goto S1SS1; + } + if (isEOF()) { + goto FIN; + } + goto error; + +NPARA: + tokenType = NPARA; + return EXIT_SUCCESS; + +MOT: + tokenType = MOT; + return EXIT_SUCCESS; + +MC2: + if (isSeparator() || isEOF()) { + goto MOTCLE; + } + goto error; + +MOTCLE: + tokenType = MOTCLE; + return EXIT_SUCCESS; + +FIN: + tokenType = FIN; + return EXIT_SUCCESS; + +error: + if (tokenType == MOT || tokenType == MOTCLE) { + wpr_error(L"Scanner error with token type: %s and value: %ls\n", + tokenTypestr[tokenType], + token[tokenFound].value); + } else { + wpr_error(L"Scanner error with token type: %s\n", + tokenTypestr[tokenType]); + } + tokenType = FIN; + exit(EXIT_FAILURE); +} diff --git a/lexer/lexical_analyzer.h b/lexer/lexical_analyzer.h new file mode 100644 index 0000000..ec9272e --- /dev/null +++ b/lexer/lexical_analyzer.h @@ -0,0 +1,6 @@ +#ifndef LEXICAL_ANALYZER_H_ +#define LEXICAL_ANALYZER_H_ + +int scanner(); + +#endif /* LEXICAL_ANALYZER_H_ */ diff --git a/lexer/main.c b/lexer/main.c index 3837e9b..c5a0457 100644 --- a/lexer/main.c +++ b/lexer/main.c @@ -2,251 +2,132 @@ #include #include #include -#include +#include -#define TOKEN_MAX 500 +#include "global_vars.h" +#include "lexical_analyzer.h" +#include "syntactic_analyzer.h" +#include "print_helper.h" 
-struct token_s { - const char* type; - wint_t value[50]; -}; - -struct token_s token[TOKEN_MAX] = {{NULL, {0}}}; - -FILE *source = NULL, *target = NULL; -wint_t c; -unsigned int tokenFound = 0; -enum TokenType { - MOTCLE, - SECTION, - SSECTION, - NPARA, - MOT, - FIN -} tokenType; -const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" }; - -/* It looks silly to check for each characters but for debugging, it's just the way to go */ -bool istAlpha() { - if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \ - c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \ - c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \ - c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \ - c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \ - c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \ - c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \ - c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \ - c == L'.' || c == L'?' || c == L'!' 
|| c == L',' || c == L';' || c == L':' || c == L'-' || \ - c == L'\''|| c == L'#' || \ - c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \ - c == L'7' || c == L'8' || c == L'9' || \ - // FIXME: Accentued characters (aka multibytes characters) support is still buggy - c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \ - c == L'ù' || c == L'û' || \ - c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \ - c == L'Ù' || c == L'Û') { - return true; +void do_lexical_analysis() { + c = fgetwc(source); // lecture du premier caractere + do { + scanner(); + if (tokenType == MOT || tokenType == MOTCLE) { + fwprintf(target, L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value); + } else { + fwprintf(target, L"%20s\n", tokenTypestr[tokenType]); } - return false; + token[tokenFound].type = tokenTypestr[tokenType]; + tokenFound++; + } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte } -bool isSeparator() { - if (c == L'\t' || c == L' ' || c == L'\n') { - return true; - } - return false; +void do_syntactic_analysis() { + c = fgetwc(source); // lecture du premier caractere + do { + analyze_AXIOME(); + } while (tokenType != FIN); } -int scanner() { - unsigned int i = 0; - wchar_t m[6]; - -init: - if (c == L' ' || c == L'\t') { - c = fgetwc(source); - goto init; - } - if (c == L'\n') { - c = fgetwc(source); - goto initLV1; - } - if (c == L'>') { - c = fgetwc(source); - goto MC1; - } - if (c == L'=') { - c = fgetwc(source); - goto S1SS1; - } - if (istAlpha()) { - token[tokenFound].value[i] = c; - i++; - c = fgetwc(source); - goto M1; - } - if (c == WEOF) { - goto FIN; - } - goto error; - -MC1: - if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) { - wcscpy((wchar_t*)token[tokenFound].value, L">Auteur"); - c = fgetwc(source); - goto MC2; - } - if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) { - 
wcscpy((wchar_t*)token[tokenFound].value, L">Titre"); - c = fgetwc(source); - goto MC2; - } - goto error; - -S1SS1: - if (c == L'=') { - c = fgetwc(source); - goto SS2; - } - if (isSeparator() || c == WEOF) { - goto SECTION; - } - goto error; - -SS2: - if (isSeparator() || c == WEOF) { - goto SSECTION; - } - goto error; - -SECTION: - tokenType = SECTION; - return EXIT_SUCCESS; - -SSECTION: - tokenType = SSECTION; - return EXIT_SUCCESS; - -M1: - if (istAlpha()) { - token[tokenFound].value[i] = c; - i++; - c = fgetwc(source); - goto M1; - } - if (isSeparator() || c == WEOF) { - goto MOT; - } - goto error; - -initLV1: - if (c == L' ' || c == L'\t') { - c = fgetwc(source); - goto initLV1; - } - if (c == L'\n') { - c = fgetwc(source); - goto initLV1LV2; - } - if (istAlpha()) { - token[tokenFound].value[i] = c; - i++; - c = fgetwc(source); - goto M1; - } - if (c == L'=') { - c = fgetwc(source); - goto S1SS1; - } - if (c == L'>') { - c = fgetwc(source); - goto MC1; - } - if (c == WEOF) { - goto FIN; - } - goto error; +void print_usage(const char* name) { + fprintf(stdout,"Usage: %s [options]\n" + "Where [options] are:\n" + " -h, --help: display this help text\n" + " -l, --lexical-only: do only the lexical analysis\n" + " -i, --input: use as input file instead of standard input\n" + " -o, --output: use as output file instead of standard output\n", + name); +} -initLV1LV2: - if (isSeparator()) { - c = fgetwc(source); - goto initLV1LV2; - } - if (istAlpha()) { - goto NPARA; - } - if (c == L'>') { - c = fgetwc(source); - goto MC1; - } - if (c == L'=') { - c = fgetwc(source); - goto S1SS1; - } - if (c == WEOF) { - goto FIN; +int main(int argc, char **argv) { + /* In and out files name */ + const char* in_file = NULL; + const char* out_file = NULL; + static int hflag = 0; + static int lflag = 0; + + /* getopt_long stores the option index here. 
*/ + int option_index = 0; + + static struct option long_options[] = + { + {"help", no_argument, &hflag, 1}, + {"input", optional_argument, NULL, 'i'}, + {"lexical-only", no_argument, &lflag, 1}, + {"output", optional_argument, NULL, 'o'}, + {0, 0, 0, 0} + }; + + int c_in; + + while ((c_in = getopt_long(argc, argv, "hi::lo::", long_options, + &option_index)) != -1) { + switch (c_in) { + case 'h': + hflag = 1; + break; + case 'i': + if (optarg != NULL) { + in_file = optarg; + } + break; + case 'l': + lflag = 1; + break; + case 'o': + if (optarg != NULL) { + out_file = optarg; + } + break; + case 0: + /* getopt_long() set a variable, just keep going */ + break; + case ':': + /* missing option argument */ + pr_error("%s: option '-%c' requires an argument\n", + argv[0], optopt); + break; + case '?': + default: + /* invalid option */ + pr_error("%s: option '-%c' is invalid: ignored\n", + argv[0], optopt); + break; + } } - goto error; - -NPARA: - tokenType = NPARA; - return EXIT_SUCCESS; -MOT: - tokenType = MOT; - return EXIT_SUCCESS; - -MC2: - if (isSeparator() || c == WEOF) { - goto MOTCLE; + if (in_file != NULL) { + // Open the source file read-only (the file must already exist): + source = fopen(in_file, "r"); + if (source == NULL) { + pr_error("Impossible d'ouvrir le fichier %s\n", in_file); + return EXIT_FAILURE; + } + } else { + source = stdin; } - goto error; -MOTCLE: - tokenType = MOTCLE; - return EXIT_SUCCESS; - -FIN: - tokenType = FIN; - return EXIT_SUCCESS; - -error: - tokenType = FIN; - return EXIT_FAILURE; -} - -int main() { - // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) : - source = fopen("test.txt", "r+"); - // Cree et ouvre un fichier target.html en lecture/ecriture - // avec suppression du contenu au prealable : - target = fopen("target.html", "w+"); - - if (source == NULL) { - printf("Impossible d'ouvrir le fichier source\n"); - return -1; + if (out_file != NULL) { + // Cree et ouvre le fichier cible en 
lecture/ecriture + // avec suppression du contenu au prealable : + target = fopen(out_file, "w+"); + if (target == NULL) { + pr_error("Impossible d'ouvrir le fichier %s\n", out_file); + return EXIT_FAILURE; + } + } else { + target = stdout; } - if (target == NULL) { - printf("Impossible d'ouvrir le fichier target\n"); - return -1; + if (hflag) { + print_usage(argv[0]); + } else if (lflag){ + do_lexical_analysis(); + } else { + do_syntactic_analysis(); } - c = fgetwc(source); // lecture du premier caractere - do { - int scanrt = scanner(); - if (scanrt == EXIT_FAILURE) { - wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value); - exit(EXIT_FAILURE); - } - if (tokenType == MOT || tokenType == MOTCLE) { - wprintf(L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value); - } else { - wprintf(L"%20s\n", tokenTypestr[tokenType]); - } - token[tokenFound].type = tokenTypestr[tokenType]; - tokenFound++; - } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte - if (source != NULL) fclose(source); // fermeture du fichier source if (target != NULL) fclose(target); // fermeture du fichier target diff --git a/lexer/print_helper.c b/lexer/print_helper.c new file mode 100644 index 0000000..d5ca8f0 --- /dev/null +++ b/lexer/print_helper.c @@ -0,0 +1,35 @@ +#include +#include +#include + +void pr_warning(const char *format, ...) { + va_list args; + + va_start(args, format); + vfprintf(stdout, format, args); /* a va_list must go through the v* variant */ + va_end(args); +} + +void pr_error(const char *format, ...) { + va_list args; + + va_start(args, format); + vfprintf(stderr, format, args); /* a va_list must go through the v* variant */ + va_end(args); +} + +void wpr_warning(const wchar_t *format, ...) { + va_list args; + + va_start(args, format); + vfwprintf(stdout, format, args); /* a va_list must go through the v* variant */ + va_end(args); +} + +void wpr_error(const wchar_t *format, ...) 
{ + va_list args; + + va_start(args, format); + vfwprintf(stderr, format, args); /* a va_list must go through the v* variant */ + va_end(args); +} diff --git a/lexer/print_helper.h b/lexer/print_helper.h new file mode 100644 index 0000000..1015199 --- /dev/null +++ b/lexer/print_helper.h @@ -0,0 +1,10 @@ +#ifndef PRINT_HELPER_H_ +#define PRINT_HELPER_H_ + +void pr_warning(const char *format, ...); +void pr_error(const char *format, ...); + +void wpr_warning(const wchar_t *format, ...); +void wpr_error(const wchar_t *format, ...); + +#endif diff --git a/lexer/syntactic_analyzer.c b/lexer/syntactic_analyzer.c new file mode 100644 index 0000000..e23c130 --- /dev/null +++ b/lexer/syntactic_analyzer.c @@ -0,0 +1,122 @@ +/* Syntactic analyzer */ + +#include +#include + +#include "global_vars.h" +#include "lexical_analyzer.h" + +/* Syntactic analyzer functions implementation */ + +enum TokenType tokenType; + +static bool analyze_TEXT() { + bool rtval = true; + if (tokenType == MOT) { + scanner(); + rtval = analyze_TEXT(); + } else if (tokenType != MOTCLE && tokenType != NPARA && tokenType != SECTION && \ + tokenType != SSECTION && tokenType != FIN) { + rtval = false; + } + return rtval; +} + +static bool analyze_P() { + bool rtval = true; + if (tokenType == NPARA) { + scanner(); + if (tokenType == MOT) { + scanner(); + rtval = analyze_TEXT(); + rtval = analyze_P(); + } + } else if (tokenType != SECTION && tokenType != SSECTION && tokenType != FIN) { + rtval = false; + } + return rtval; +} + +static bool analyze_HEAD() { + bool rtval = true; + if (tokenType == MOTCLE) { + scanner(); + rtval = analyze_TEXT(); + if (tokenType == MOTCLE) { + scanner(); + rtval = analyze_TEXT(); + } else { + rtval = false; + } + } else { + rtval = false; + } + return rtval; +} + +static bool analyze_H1() { + bool rtval = true; + if (tokenType == SECTION) { + scanner(); + rtval = analyze_TEXT(); + } else { + rtval = false; + } + return rtval; +} + +static bool analyze_H2() { + bool rtval = true; + if (tokenType == SSECTION) { + scanner(); + 
rtval = analyze_TEXT(); + } else { + rtval = false; + } + return rtval; +} + +static bool analyze_S2() { + bool rtval = true; + if (analyze_H2()) { + rtval = analyze_P(); + rtval = analyze_S2() && rtval; /* always parse S2, then keep any earlier failure */ + } else if (tokenType != SECTION && tokenType != FIN) { + rtval = false; + } else { + rtval = true; /* empty production: next token is SECTION or FIN */ + } + return rtval; +} + +static bool analyze_S1() { + bool rtval = true; + if (analyze_H1()) { + rtval = analyze_P(); + rtval = analyze_S2() && rtval; + rtval = analyze_S1() && rtval; /* always parse S1, then keep any earlier failure */ + } else if (tokenType != FIN) { + rtval = false; + } else { + rtval = true; /* empty production: next token is FIN */ + } + return rtval; +} + +static bool analyze_BODY() { + bool rtval = true; + rtval = analyze_P(); + rtval = analyze_S1() && rtval; + return rtval; +} + +bool analyze_AXIOME() { + bool rtval = true; + scanner(); + rtval = analyze_HEAD(); + rtval = analyze_BODY() && rtval; + if (tokenType != FIN) { + rtval = false; + } + return rtval; +} diff --git a/lexer/syntactic_analyzer.h b/lexer/syntactic_analyzer.h new file mode 100644 index 0000000..675873c --- /dev/null +++ b/lexer/syntactic_analyzer.h @@ -0,0 +1,8 @@ +#ifndef SYNTACTIC_ANALYZER_H_ +#define SYNTACTIC_ANALYZER_H_ + +/* Syntactic analyser functions declarations */ + +bool analyze_AXIOME(); + +#endif /* SYNTACTIC_ANALYZER_H_ */ diff --git a/lexer/test.txt b/lexer/test.txt index 1b7a4ee..f07f979 100644 --- a/lexer/test.txt +++ b/lexer/test.txt @@ -1,7 +1,6 @@ >Titre Exemple de fichier texte >Auteur Nicolas - Ce fichier vous montre a quoi ressemble un fichier texte lu en entree. Pour eviter les problemes de format, les accents ne sont pas autorises. Un tel fichier commence toujours par le mot cle Titre, precede d'un -- 2.34.1