From 6a19b8fe2e017d899437b43646b27fdcc26d5ed0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Sun, 12 Nov 2017 17:16:01 +0100 Subject: [PATCH] Introduction of wchar.h to support more characters. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Store the found token and its value in a struct array while at it. Signed-off-by: Jérôme Benoit --- lexer/main.c | 188 +++++++++++++++++++++++++++------------------------ 1 file changed, 98 insertions(+), 90 deletions(-) diff --git a/lexer/main.c b/lexer/main.c index 19fc34f..c8b8979 100644 --- a/lexer/main.c +++ b/lexer/main.c @@ -2,14 +2,20 @@ #include #include #include +#include -#define TOKEN_MAX_LENGTH 50 -#define TOKEN_LIST_MAX 500 +#define TOKEN_MAX 500 + +struct token_s { + const char* type; + wint_t value[50]; +}; + +struct token_s token[TOKEN_MAX] = {NULL, 0}; FILE *source = NULL, *target = NULL; -char c; -unsigned int i = 0; -char tokenValue[TOKEN_MAX_LENGTH]; +wint_t c; +unsigned int tokenFound = 0; enum TokenType { MOTCLE, SECTION, @@ -19,72 +25,76 @@ enum TokenType { FIN } tokenType; const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" }; -const char* tokenList[TOKEN_LIST_MAX]; +unsigned int i = 0; /* It looks silly to check for each characters but for debugging, it's just the way to go */ bool istAlpha() { - if (c == 'a' || c == 'b' || c == 'c' || c == 'd' || c == 'e' || c == 'f' || c == 'g' || \ - c == 'h' || c == 'i' || c == 'j' || c == 'k' || c == 'l' || c == 'm' || c == 'n' || \ - c == 'o' || c == 'p' || c == 'q' || c == 'r' || c == 's' || c == 't' || c == 'u' || \ - c == 'v' || c == 'w' || c == 'x' || c == 'y' || c == 'z' || \ - c == 'A' || c == 'B' || c == 'C' || c == 'D' || c == 'E' || c == 'F' || c == 'G' || \ - c == 'H' || c == 'I' || c == 'J' || c == 'K' || c == 'L' || c == 'M' || c == 'N' || \ - c == 'O' || c == 'P' || c == 'Q' || c == 'R' || c == 'S' || c == 'T' || c == 'U' || \ - c == 'V' || c == 'W' 
|| c == 'X' || c == 'Y' || c == 'Z' || \ - c == '.' || c == '?' || c == '!' || c == ',' || c == ';' || c == ':' || c == '-' || \ - c == '\''|| c == '#' || \ - c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || \ - c == '7' || c == '8' || c == '9') { + if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \ + c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \ + c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \ + c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \ + c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \ + c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \ + c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \ + c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \ + c == L'.' || c == L'?' || c == L'!' 
|| c == L',' || c == L';' || c == L':' || c == L'-' || \ + c == L'\''|| c == L'#' || \ + c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \ + c == L'7' || c == L'8' || c == L'9' || \ + c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \ + c == L'ù' || c == L'û' || \ + c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \ + c == L'Ù' || c == L'Û') { return true; } return false; } bool isSeparator() { - if (c == '\t' || c == ' ' || c == '\n') { + if (c == L'\t' || c == L' ' || c == L'\n') { return true; } return false; } int scanner() { - const char* Titre = "Titre"; - const char* Auteur = "Auteur"; + const wchar_t* Titre = L"Titre"; + const wchar_t* Auteur = L"Auteur"; unsigned int j = 0; // The main loop get the next character init: - if (c == ' ' || c == '\t') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L' ' || c == L'\t') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto init; } - if (c == '\n') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'\n') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto initLV1; } - if (c == '>') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'>') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto MC1; } - if (c == '=') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'=') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto S1SS1; } if (istAlpha()) { - c = fgetc(source); - tokenValue[i] = c; + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto M1; } - if (c == EOF) { + if (c == WEOF) { goto FIN; } else { goto error; @@ -92,39 +102,39 @@ init: MC1: // FIXME: Partial match need a rewind in the characters extraction from the file - if (c == Titre[j] && j < strlen(Titre) - 1) { - c = fgetc(source); - tokenValue[i] = c; + if (c == (wint_t)Titre[j] && j < wcslen(Titre) - 1) { + c = fgetwc(source); + 
token[tokenFound].value[i] = c; i++; j++; goto MC1; } - if (c == Auteur[j] && j < strlen(Auteur) - 1) { - c = fgetc(source); - tokenValue[i] = c; + if (c == (wint_t)Auteur[j] && j < wcslen(Auteur) - 1) { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; j++; goto MC1; } else { - c = fgetc(source); - tokenValue[i] = c; + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto MC2; } S1SS1: - if (c == '=') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'=') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto SS2; } - if (isSeparator() || c == EOF) { + if (isSeparator() || c == WEOF) { goto SECTION; } SS2: - if (isSeparator() || c == EOF) { + if (isSeparator() || c == WEOF) { goto SSECTION; } @@ -138,73 +148,73 @@ SSECTION: M1: if (istAlpha()) { - c = fgetc(source); - tokenValue[i] = c; + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto M1; } - if (isSeparator() || c == EOF) { + if (isSeparator() || c == WEOF) { goto MOT; } initLV1: - if (c == ' ' || c == '\t') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L' ' || c == L'\t') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto initLV1; } - if (c == '\n') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'\n') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto initLV1LV2; } if (istAlpha()) { - c = fgetc(source); - tokenValue[i] = c; + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto M1; } - if (c == '=') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'=') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto S1SS1; } - if (c == '>') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'>') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto MC1; } - if (c == EOF) { + if (c == WEOF) { goto FIN; } initLV1LV2: if (isSeparator()) { - c = fgetc(source); - tokenValue[i] = c; + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto initLV1LV2; } if 
(istAlpha()) { goto NPARA; } - if (c == '>') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'>') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto MC1; } - if (c == '=') { - c = fgetc(source); - tokenValue[i] = c; + if (c == L'=') { + c = fgetwc(source); + token[tokenFound].value[i] = c; i++; goto S1SS1; } - if (c == EOF) { + if (c == WEOF) { goto FIN; } @@ -217,7 +227,7 @@ MOT: return 1; MC2: - if (isSeparator() || c == EOF) { + if (isSeparator() || c == WEOF) { goto MOTCLE; } @@ -234,7 +244,7 @@ error: return -1; } -int main (int argc, char const *argv[]) { +int main() { // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) : source = fopen("test.txt", "r"); @@ -252,28 +262,26 @@ int main (int argc, char const *argv[]) { return -1; } - int tokenFound = 0; - do { - c = fgetc(source); // lecture du caractere suivant du fichier source - tokenValue[i] = c; + c = fgetwc(source); // lecture du caractere suivant du fichier source + token[tokenFound].value[i] = c; i++; int scanrt = scanner(); if (scanrt == -1) { - printf ("Scanner error with token value: %s\n", tokenValue); + wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value); exit(EXIT_FAILURE); } - if (c != EOF) { - printf ("Token type found: %s with value: %s\n", tokenTypestr[tokenType], tokenValue); + if (c != WEOF) { + wprintf(L"Token type found: %s with value: %ls\n", tokenTypestr[tokenType], token[tokenFound].value); } else { - printf ("Token type found: %s\n", tokenTypestr[tokenType]); + wprintf(L"Token type found: %s\n", tokenTypestr[tokenType]); } - tokenList[tokenFound] = tokenTypestr[tokenType]; + token[tokenFound].type = tokenTypestr[tokenType]; tokenFound++; - // reinit tokenValue + // reinit token.value array counter i = 0; - memset(tokenValue, 0, sizeof(tokenValue)); - } while (c != EOF); // tant que la fin du fichier n'est pas atteinte + //} while (c != WEOF); // tant que la fin du fichier n'est pas atteinte + } while 
(!feof(source)); // tant que la fin du fichier n'est pas atteinte if (source != NULL) fclose(source); // fermeture du fichier source if (target != NULL) fclose(target); // fermeture du fichier target -- 2.34.1