[TP_AL_C.git] / lexer / lexical_analyzer.c

/* Lexical analyzer */

#include <stdlib.h>
#include <stdbool.h>

#include "global_vars.h"
#include "print_helper.h"
#include "lexical_analyzer.h"

/* Lookahead character shared by the helpers below and by scanner(), which
 * refills it from `source` with fgetwc(). */
wint_t c;

/*
 * Tells whether the lookahead character `c` may be part of a word.
 *
 * Accepted characters: ASCII letters, ASCII digits, the punctuation
 * . ? ! , ; : - ' # and a fixed list of French accented letters.
 * The accepted set is spelled out character by character so that it is easy
 * to see (and to debug) exactly what is allowed.
 *
 * Returns true when `c` is in the set, false otherwise (including for 0
 * and WEOF, which are not in the list).
 */
//FIXME: Accented characters (aka multibyte characters) support is still buggy
static bool isAlphaNum() {
    static const wchar_t wordChars[] =
        L"abcdefghijklmnopqrstuvwxyz"
        L"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        L".?!,;:-'#"
        L"0123456789"
        L"àâçèéîôùû"
        L"ÀÂÇÈÉÎÔÙÛ";

    /* The scan stops on the terminator, so a null `c` never matches. */
    for (const wchar_t *p = wordChars; *p != L'\0'; p++) {
        if (c == *p) {
            return true;
        }
    }
    return false;
}

/* Tells whether the lookahead character `c` is a token separator:
 * a tab, a space or a newline. */
static bool isSeparator() {
    switch (c) {
    case L'\t':
    case L' ':
    case L'\n':
        return true;
    default:
        return false;
    }
}

/* Tells whether the lookahead character `c` is the wide end-of-file
 * marker WEOF. */
static bool isEOF() {
    return c == WEOF;
}

/*
 * Reads the next token from `source` and stores its kind in `tokenType`.
 *
 * The function is a goto-driven state machine; each label is one state.
 * It starts from the character already held in the lookahead `c` (it tests
 * `c` before reading anything) and leaves in `c` the first character that
 * was not consumed by the token.
 *
 * Tokens recognised:
 *   - MOT      a run of isAlphaNum() characters ended by a separator or EOF;
 *              the text is stored in tokenValue and copied into
 *              token[tokenFound].value.
 *   - MOTCLE   ">Auteur" or ">Titre" followed by a separator or EOF; the
 *              text is stored and copied the same way.
 *   - SECTION  "=" followed by a separator or EOF.
 *   - SSECTION "==" followed by a separator or EOF.
 *   - NPARA    two newlines (possibly with blanks in between) followed by a
 *              word character; that character is left in `c`.
 *   - FIN      end of input.
 *
 * Returns EXIT_SUCCESS whenever a token was recognised. On any unexpected
 * character a diagnostic is written to stderr and the process terminates
 * with exit(EXIT_FAILURE); the function does not return in that case.
 *
 * NOTE(review): `i` is never checked against the capacity of tokenValue,
 * which is declared outside this file -- confirm words cannot exceed it.
 */
int scanner(void) {
    size_t i = 0;   /* number of characters stored in tokenValue so far */
    wchar_t m[6];   /* tail of a keyword: room for "uteur" + terminator */

    tokenValue[0] = 0;

/* Start state: nothing significant seen yet on the current line. */
init:
    if (c == L' ' || c == L'\t') {
        c = fgetwc(source);
        goto init;
    }
    if (c == L'\n') {
        c = fgetwc(source);
        goto initLV1;
    }
    if (c == L'>') {
        c = fgetwc(source);
        goto MC1;
    }
    if (c == L'=') {
        c = fgetwc(source);
        goto S1SS1;
    }
    if (isAlphaNum()) {
        tokenValue[i] = c;
        i++;
        c = fgetwc(source);
        goto M1;
    }
    if (isEOF()) {
        goto FIN;
    }
    goto error;

/* A '>' was read: expect the keyword "Auteur" or "Titre". */
MC1:
    /* fgetws() returns NULL on end of file or read error; it must be checked
     * before the buffer is compared, otherwise wcscmp() receives NULL. */
    if (c == L'A' && fgetws(m, 6, source) != NULL && !wcscmp(m, L"uteur")) {
        wcscpy((wchar_t*)tokenValue, L">Auteur");
        c = fgetwc(source);
        goto MC2;
    }
    if (c == L'T' && fgetws(m, 5, source) != NULL && !wcscmp(m, L"itre")) {
        wcscpy((wchar_t*)tokenValue, L">Titre");
        c = fgetwc(source);
        goto MC2;
    }
    goto error;

/* One '=' was read: a second one makes it a sub-section marker. */
S1SS1:
    if (c == L'=') {
        c = fgetwc(source);
        goto SS2;
    }
    if (isSeparator() || isEOF()) {
        goto SECTION;
    }
    goto error;

/* "==" was read: it must be followed by a separator or EOF. */
SS2:
    if (isSeparator() || isEOF()) {
        goto SSECTION;
    }
    goto error;

SECTION:
    tokenType = SECTION;
    return EXIT_SUCCESS;

SSECTION:
    tokenType = SSECTION;
    return EXIT_SUCCESS;

/* Inside a word: keep accumulating word characters. */
M1:
    if (isAlphaNum()) {
        tokenValue[i] = c;
        i++;
        c = fgetwc(source);
        goto M1;
    }
    if (isSeparator() || isEOF()) {
        goto MOT;
    }
    /* Terminate the partial word so the diagnostic below does not show
     * leftovers of a previous, longer token. */
    tokenValue[i] = 0;
    goto error;

/* One newline was read since the last significant character. */
initLV1:
    if (c == L' ' || c == L'\t') {
        c = fgetwc(source);
        goto initLV1;
    }
    if (c == L'\n') {
        c = fgetwc(source);
        goto initLV1LV2;
    }
    if (isAlphaNum()) {
        tokenValue[i] = c;
        i++;
        c = fgetwc(source);
        goto M1;
    }
    if (c == L'=') {
        c = fgetwc(source);
        goto S1SS1;
    }
    if (c == L'>') {
        c = fgetwc(source);
        goto MC1;
    }
    if (isEOF()) {
        goto FIN;
    }
    goto error;

/* At least two newlines were read: a word starting here opens a new
 * paragraph. */
initLV1LV2:
    if (isSeparator()) {
        c = fgetwc(source);
        goto initLV1LV2;
    }
    if (isAlphaNum()) {
        /* The word character is deliberately not consumed: it stays in `c`
         * for the next call. */
        goto NPARA;
    }
    if (c == L'>') {
        c = fgetwc(source);
        goto MC1;
    }
    if (c == L'=') {
        c = fgetwc(source);
        goto S1SS1;
    }
    if (isEOF()) {
        goto FIN;
    }
    goto error;

NPARA:
    tokenType = NPARA;
    return EXIT_SUCCESS;

MOT:
    tokenType = MOT;
    tokenValue[i] = 0;
    wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
    return EXIT_SUCCESS;

/* A full keyword was read: it must be followed by a separator or EOF. */
MC2:
    if (isSeparator() || isEOF()) {
        goto MOTCLE;
    }
    goto error;

MOTCLE:
    tokenType = MOTCLE;
    wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
    return EXIT_SUCCESS;

FIN:
    tokenType = FIN;
    return EXIT_SUCCESS;

/* Unexpected character. tokenType has not been updated by this call yet, so
 * the message shows the type left by the previous one. */
error:
    if (tokenType == MOT || tokenType == MOTCLE) {
        fwprintf(stderr, L"%s error with token type: %s and value: %ls\n",
                 __func__,
                 tokenTypestr[tokenType],
                 (wchar_t*)tokenValue);
    } else {
        fwprintf(stderr, L"%s error with token type: %s\n",
                 __func__,
                 tokenTypestr[tokenType]);
    }
    fflush(stderr);
    tokenType = FIN;
    exit(EXIT_FAILURE);
}
Commit	Line	Data
	1	/* Lexical analyzer */
	2
	3	#include <stdlib.h>
	4	#include <stdbool.h>
	5
	6	#include "global_vars.h"
	7	#include "print_helper.h"
	8	#include "lexical_analyzer.h"
	9
	10	wint_t c;
	11
	12	/* It looks silly to check for each characters but for debugging, it's just the way to go */
	13	static bool isAlphaNum() {
	14	if (c == L'a' \|\| c == L'b' \|\| c == L'c' \|\| c == L'd' \|\| c == L'e' \|\| c == L'f' \|\| c == L'g' \|\| \
	15	c == L'h' \|\| c == L'i' \|\| c == L'j' \|\| c == L'k' \|\| c == L'l' \|\| c == L'm' \|\| c == L'n' \|\| \
	16	c == L'o' \|\| c == L'p' \|\| c == L'q' \|\| c == L'r' \|\| c == L's' \|\| c == L't' \|\| c == L'u' \|\| \
	17	c == L'v' \|\| c == L'w' \|\| c == L'x' \|\| c == L'y' \|\| c == L'z' \|\| \
	18	c == L'A' \|\| c == L'B' \|\| c == L'C' \|\| c == L'D' \|\| c == L'E' \|\| c == L'F' \|\| c == L'G' \|\| \
	19	c == L'H' \|\| c == L'I' \|\| c == L'J' \|\| c == L'K' \|\| c == L'L' \|\| c == L'M' \|\| c == L'N' \|\| \
	20	c == L'O' \|\| c == L'P' \|\| c == L'Q' \|\| c == L'R' \|\| c == L'S' \|\| c == L'T' \|\| c == L'U' \|\| \
	21	c == L'V' \|\| c == L'W' \|\| c == L'X' \|\| c == L'Y' \|\| c == L'Z' \|\| \
	22	c == L'.' \|\| c == L'?' \|\| c == L'!' \|\| c == L',' \|\| c == L';' \|\| c == L':' \|\| c == L'-' \|\| \
	23	c == L'\''\|\| c == L'#' \|\| \
	24	c == L'0' \|\| c == L'1' \|\| c == L'2' \|\| c == L'3' \|\| c == L'4' \|\| c == L'5' \|\| c == L'6' \|\| \
	25	c == L'7' \|\| c == L'8' \|\| c == L'9' \|\| \
	26	//FIXME: Accentued characters (aka multibytes characters) support is still buggy
	27	c == L'à' \|\| c == L'â' \|\| c == L'ç' \|\| c == L'è' \|\| c == L'é' \|\| c == L'î' \|\| c == L'ô' \|\| \
	28	c == L'ù' \|\| c == L'û' \|\| \
	29	c == L'À' \|\| c == L'Â' \|\| c == L'Ç' \|\| c == L'È' \|\| c == L'É' \|\| c == L'Î' \|\| c == L'Ô' \|\| \
	30	c == L'Ù' \|\| c == L'Û') {
	31	return true;
	32	}
	33	return false;
	34	}
	35
	36	static bool isSeparator() {
	37	if (c == L'\t' \|\| c == L' ' \|\| c == L'\n') {
	38	return true;
	39	}
	40	return false;
	41	}
	42
	43	static bool isEOF() {
	44	if (c == WEOF) {
	45	return true;
	46	}
	47	return false;
	48	}
	49
	50	int scanner(void) {
	51	tokenValue[0] = 0;
	52	unsigned int i = 0;
	53	wchar_t m[6];
	54
	55	init:
	56	if (c == L' ' \|\| c == L'\t') {
	57	c = fgetwc(source);
	58	goto init;
	59	}
	60	if (c == L'\n') {
	61	c = fgetwc(source);
	62	goto initLV1;
	63	}
	64	if (c == L'>') {
	65	c = fgetwc(source);
	66	goto MC1;
	67	}
	68	if (c == L'=') {
	69	c = fgetwc(source);
	70	goto S1SS1;
	71	}
	72	if (isAlphaNum()) {
	73	tokenValue[i] = c;
	74	i++;
	75	c = fgetwc(source);
	76	goto M1;
	77	}
	78	if (isEOF()) {
	79	goto FIN;
	80	}
	81	goto error;
	82
	83	MC1:
	84	if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
	85	wcscpy((wchar_t*)tokenValue, L">Auteur");
	86	c = fgetwc(source);
	87	goto MC2;
	88	}
	89	if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
	90	wcscpy((wchar_t*)tokenValue, L">Titre");
	91	c = fgetwc(source);
	92	goto MC2;
	93	}
	94	goto error;
	95
	96	S1SS1:
	97	if (c == L'=') {
	98	c = fgetwc(source);
	99	goto SS2;
	100	}
	101	if (isSeparator() \|\| isEOF()) {
	102	goto SECTION;
	103	}
	104	goto error;
	105
	106	SS2:
	107	if (isSeparator() \|\| isEOF()) {
	108	goto SSECTION;
	109	}
	110	goto error;
	111
	112	SECTION:
	113	tokenType = SECTION;
	114	return EXIT_SUCCESS;
	115
	116	SSECTION:
	117	tokenType = SSECTION;
	118	return EXIT_SUCCESS;
	119
	120	M1:
	121	if (isAlphaNum()) {
	122	tokenValue[i] = c;
	123	i++;
	124	c = fgetwc(source);
	125	goto M1;
	126	}
	127	if (isSeparator() \|\| isEOF()) {
	128	goto MOT;
	129	}
	130	goto error;
	131
	132	initLV1:
	133	if (c == L' ' \|\| c == L'\t') {
	134	c = fgetwc(source);
	135	goto initLV1;
	136	}
	137	if (c == L'\n') {
	138	c = fgetwc(source);
	139	goto initLV1LV2;
	140	}
	141	if (isAlphaNum()) {
	142	tokenValue[i] = c;
	143	i++;
	144	c = fgetwc(source);
	145	goto M1;
	146	}
	147	if (c == L'=') {
	148	c = fgetwc(source);
	149	goto S1SS1;
	150	}
	151	if (c == L'>') {
	152	c = fgetwc(source);
	153	goto MC1;
	154	}
	155	if (isEOF()) {
	156	goto FIN;
	157	}
	158	goto error;
	159
	160	initLV1LV2:
	161	if (isSeparator()) {
	162	c = fgetwc(source);
	163	goto initLV1LV2;
	164	}
	165	if (isAlphaNum()) {
	166	goto NPARA;
	167	}
	168	if (c == L'>') {
	169	c = fgetwc(source);
	170	goto MC1;
	171	}
	172	if (c == L'=') {
	173	c = fgetwc(source);
	174	goto S1SS1;
	175	}
	176	if (isEOF()) {
	177	goto FIN;
	178	}
	179	goto error;
	180
	181	NPARA:
	182	tokenType = NPARA;
	183	return EXIT_SUCCESS;
	184
	185	MOT:
	186	tokenType = MOT;
	187	tokenValue[i] = 0;
	188	wcscpy((wchar_t)token[tokenFound].value, (wchar_t)tokenValue);
	189	return EXIT_SUCCESS;
	190
	191	MC2:
	192	if (isSeparator() \|\| isEOF()) {
	193	goto MOTCLE;
	194	}
	195	goto error;
	196
	197	MOTCLE:
	198	tokenType = MOTCLE;
	199	wcscpy((wchar_t)token[tokenFound].value, (wchar_t)tokenValue);
	200	return EXIT_SUCCESS;
	201
	202	FIN:
	203	tokenType = FIN;
	204	return EXIT_SUCCESS;
	205
	206	error:
	207	if (tokenType == MOT \|\| tokenType == MOTCLE) {
	208	fwprintf(stderr, L"%s error with token type: %s and value: %ls\n",
	209	__func__,
	210	tokenTypestr[tokenType],
	211	tokenValue);
	212	} else {
	213	fwprintf(stderr, L"%s error with token type: %s\n",
	214	__func__,
	215	tokenTypestr[tokenType]);
	216	}
	217	fflush(stderr);
	218	tokenType = FIN;
	219	exit(EXIT_FAILURE);
	220	}