Avoid filling token values twice.
[TP_AL_C.git] / lexer / lexical_analyzer.c
1 /* Lexical analyzer */
2
3 #include <stdlib.h>
4 #include <stdbool.h>
5
6 #include "global_vars.h"
7 #include "print_helper.h"
8
/* Current look-ahead character; read by the classifiers below and by scanner(). */
wint_t c;

/*
 * Tell whether the look-ahead character `c` may appear inside a word.
 *
 * Despite the name, the accepted set is wider than letters and digits: it
 * also contains a few punctuation marks and some accented letters.
 * Every accepted character is listed explicitly rather than tested by range;
 * that looks verbose, but it makes debugging straightforward.
 *
 * FIXME: accented characters (aka multibyte characters) support is still buggy.
 */
static bool isAlphaNum(void) {
    switch (c) {
    /* Lowercase letters */
    case L'a': case L'b': case L'c': case L'd': case L'e': case L'f': case L'g':
    case L'h': case L'i': case L'j': case L'k': case L'l': case L'm': case L'n':
    case L'o': case L'p': case L'q': case L'r': case L's': case L't': case L'u':
    case L'v': case L'w': case L'x': case L'y': case L'z':
    /* Uppercase letters */
    case L'A': case L'B': case L'C': case L'D': case L'E': case L'F': case L'G':
    case L'H': case L'I': case L'J': case L'K': case L'L': case L'M': case L'N':
    case L'O': case L'P': case L'Q': case L'R': case L'S': case L'T': case L'U':
    case L'V': case L'W': case L'X': case L'Y': case L'Z':
    /* Punctuation accepted as part of a word */
    case L'.': case L'?': case L'!': case L',': case L';': case L':': case L'-':
    case L'\'': case L'#':
    /* Digits */
    case L'0': case L'1': case L'2': case L'3': case L'4': case L'5': case L'6':
    case L'7': case L'8': case L'9':
    /* Accented lowercase letters */
    case L'à': case L'â': case L'ç': case L'è': case L'é': case L'î': case L'ô':
    case L'ù': case L'û':
    /* Accented uppercase letters */
    case L'À': case L'Â': case L'Ç': case L'È': case L'É': case L'Î': case L'Ô':
    case L'Ù': case L'Û':
        return true;
    default:
        return false;
    }
}
34
35 static bool isSeparator() {
36 if (c == L'\t' || c == L' ' || c == L'\n') {
37 return true;
38 }
39 return false;
40 }
41
42 static bool isEOF() {
43 if (c == WEOF) {
44 return true;
45 }
46 return false;
47 }
48
49 int scanner() {
50 tokenValue[0] = 0;
51 unsigned int i = 0;
52 wchar_t m[6];
53
54 init:
55 if (c == L' ' || c == L'\t') {
56 c = fgetwc(source);
57 goto init;
58 }
59 if (c == L'\n') {
60 c = fgetwc(source);
61 goto initLV1;
62 }
63 if (c == L'>') {
64 c = fgetwc(source);
65 goto MC1;
66 }
67 if (c == L'=') {
68 c = fgetwc(source);
69 goto S1SS1;
70 }
71 if (isAlphaNum()) {
72 tokenValue[i] = c;
73 i++;
74 c = fgetwc(source);
75 goto M1;
76 }
77 if (isEOF()) {
78 goto FIN;
79 }
80 goto error;
81
82 MC1:
83 if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
84 wcscpy((wchar_t*)tokenValue, L">Auteur");
85 c = fgetwc(source);
86 goto MC2;
87 }
88 if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
89 wcscpy((wchar_t*)tokenValue, L">Titre");
90 c = fgetwc(source);
91 goto MC2;
92 }
93 goto error;
94
95 S1SS1:
96 if (c == L'=') {
97 c = fgetwc(source);
98 goto SS2;
99 }
100 if (isSeparator() || isEOF()) {
101 goto SECTION;
102 }
103 goto error;
104
105 SS2:
106 if (isSeparator() || isEOF()) {
107 goto SSECTION;
108 }
109 goto error;
110
111 SECTION:
112 tokenType = SECTION;
113 return EXIT_SUCCESS;
114
115 SSECTION:
116 tokenType = SSECTION;
117 return EXIT_SUCCESS;
118
119 M1:
120 if (isAlphaNum()) {
121 tokenValue[i] = c;
122 i++;
123 c = fgetwc(source);
124 goto M1;
125 }
126 if (isSeparator() || isEOF()) {
127 goto MOT;
128 }
129 goto error;
130
131 initLV1:
132 if (c == L' ' || c == L'\t') {
133 c = fgetwc(source);
134 goto initLV1;
135 }
136 if (c == L'\n') {
137 c = fgetwc(source);
138 goto initLV1LV2;
139 }
140 if (isAlphaNum()) {
141 tokenValue[i] = c;
142 i++;
143 c = fgetwc(source);
144 goto M1;
145 }
146 if (c == L'=') {
147 c = fgetwc(source);
148 goto S1SS1;
149 }
150 if (c == L'>') {
151 c = fgetwc(source);
152 goto MC1;
153 }
154 if (isEOF()) {
155 goto FIN;
156 }
157 goto error;
158
159 initLV1LV2:
160 if (isSeparator()) {
161 c = fgetwc(source);
162 goto initLV1LV2;
163 }
164 if (isAlphaNum()) {
165 goto NPARA;
166 }
167 if (c == L'>') {
168 c = fgetwc(source);
169 goto MC1;
170 }
171 if (c == L'=') {
172 c = fgetwc(source);
173 goto S1SS1;
174 }
175 if (isEOF()) {
176 goto FIN;
177 }
178 goto error;
179
180 NPARA:
181 tokenType = NPARA;
182 return EXIT_SUCCESS;
183
184 MOT:
185 tokenType = MOT;
186 tokenValue[i] = 0;
187 wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
188 return EXIT_SUCCESS;
189
190 MC2:
191 if (isSeparator() || isEOF()) {
192 goto MOTCLE;
193 }
194 goto error;
195
196 MOTCLE:
197 tokenType = MOTCLE;
198 wcscpy((wchar_t*)token[tokenFound].value, (wchar_t*)tokenValue);
199 return EXIT_SUCCESS;
200
201 FIN:
202 tokenType = FIN;
203 return EXIT_SUCCESS;
204
205 error:
206 if (tokenType == MOT || tokenType == MOTCLE) {
207 fwprintf(stderr, L"%s error with token type: %s and value: %ls\n",
208 __func__,
209 tokenTypestr[tokenType],
210 tokenValue);
211 } else {
212 fwprintf(stderr, L"%s error with token type: %s\n",
213 __func__,
214 tokenTypestr[tokenType]);
215 }
216 fflush(stderr);
217 tokenType = FIN;
218 exit(EXIT_FAILURE);
219 }