Add a FIXME comment about multibyte character support.
[TP_AL_C.git] / lexer / main.c
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
6
7 #define TOKEN_MAX 500
8
/*
 * One lexical token.
 *
 * type  - name of the token kind (points at a string literal, never owned).
 * value - NUL-terminated wide string holding the token's text.
 *
 * The text is stored as wchar_t (not wint_t): it is handed to wide-string
 * functions and to the "%ls" conversion, both of which require an array of
 * wchar_t. wint_t is only meant for single values returned by fgetwc()
 * (it must also be able to hold WEOF) and is not guaranteed to have the
 * same size or representation as wchar_t.
 */
struct token_s {
    const char* type;
    wchar_t value[50];
};
13
14 struct token_s token[TOKEN_MAX] = {NULL, 0};
15
16 FILE *source = NULL, *target = NULL;
17 wint_t c;
18 unsigned int tokenFound = 0;
19 enum TokenType {
20 MOTCLE,
21 SECTION,
22 SSECTION,
23 NPARA,
24 MOT,
25 FIN
26 } tokenType;
27 const char* tokenTypestr[] = { "MOTCLE", "SECTION", "SSECTION", "NPARA", "MOT", "FIN" };
28
29 /* It looks silly to check for each characters but for debugging, it's just the way to go */
30 bool istAlpha() {
31 if (c == L'a' || c == L'b' || c == L'c' || c == L'd' || c == L'e' || c == L'f' || c == L'g' || \
32 c == L'h' || c == L'i' || c == L'j' || c == L'k' || c == L'l' || c == L'm' || c == L'n' || \
33 c == L'o' || c == L'p' || c == L'q' || c == L'r' || c == L's' || c == L't' || c == L'u' || \
34 c == L'v' || c == L'w' || c == L'x' || c == L'y' || c == L'z' || \
35 c == L'A' || c == L'B' || c == L'C' || c == L'D' || c == L'E' || c == L'F' || c == L'G' || \
36 c == L'H' || c == L'I' || c == L'J' || c == L'K' || c == L'L' || c == L'M' || c == L'N' || \
37 c == L'O' || c == L'P' || c == L'Q' || c == L'R' || c == L'S' || c == L'T' || c == L'U' || \
38 c == L'V' || c == L'W' || c == L'X' || c == L'Y' || c == L'Z' || \
39 c == L'.' || c == L'?' || c == L'!' || c == L',' || c == L';' || c == L':' || c == L'-' || \
40 c == L'\''|| c == L'#' || \
41 c == L'0' || c == L'1' || c == L'2' || c == L'3' || c == L'4' || c == L'5' || c == L'6' || \
42 c == L'7' || c == L'8' || c == L'9' || \
43 // FIXME: Accentued characters (aka multibytes characters) support is still buggy
44 c == L'à' || c == L'â' || c == L'ç' || c == L'è' || c == L'é' || c == L'î' || c == L'ô' || \
45 c == L'ù' || c == L'û' || \
46 c == L'À' || c == L'Â' || c == L'Ç' || c == L'È' || c == L'É' || c == L'Î' || c == L'Ô' || \
47 c == L'Ù' || c == L'Û') {
48 return true;
49 }
50 return false;
51 }
52
53 bool isSeparator() {
54 if (c == L'\t' || c == L' ' || c == L'\n') {
55 return true;
56 }
57 return false;
58 }
59
60 int scanner() {
61 unsigned int i = 0;
62 wchar_t m[6];
63
64 init:
65 if (c == L' ' || c == L'\t') {
66 c = fgetwc(source);
67 goto init;
68 }
69 if (c == L'\n') {
70 c = fgetwc(source);
71 goto initLV1;
72 }
73 if (c == L'>') {
74 c = fgetwc(source);
75 goto MC1;
76 }
77 if (c == L'=') {
78 c = fgetwc(source);
79 goto S1SS1;
80 }
81 if (istAlpha()) {
82 token[tokenFound].value[i] = c;
83 i++;
84 c = fgetwc(source);
85 goto M1;
86 }
87 if (c == WEOF) {
88 goto FIN;
89 }
90 goto error;
91
92 MC1:
93 if (c == L'A' && !wcscmp(fgetws(m, 6, source), L"uteur")) {
94 wcscpy((wchar_t*)token[tokenFound].value, L">Auteur");
95 c = fgetwc(source);
96 goto MC2;
97 }
98 if (c == L'T' && !wcscmp(fgetws(m, 5, source), L"itre")) {
99 wcscpy((wchar_t*)token[tokenFound].value, L">Titre");
100 c = fgetwc(source);
101 goto MC2;
102 }
103 goto error;
104
105 S1SS1:
106 if (c == L'=') {
107 c = fgetwc(source);
108 goto SS2;
109 }
110 if (isSeparator() || c == WEOF) {
111 goto SECTION;
112 }
113 goto error;
114
115 SS2:
116 if (isSeparator() || c == WEOF) {
117 goto SSECTION;
118 }
119 goto error;
120
121 SECTION:
122 tokenType = SECTION;
123 return EXIT_SUCCESS;
124
125 SSECTION:
126 tokenType = SSECTION;
127 return EXIT_SUCCESS;
128
129 M1:
130 if (istAlpha()) {
131 token[tokenFound].value[i] = c;
132 i++;
133 c = fgetwc(source);
134 goto M1;
135 }
136 if (isSeparator() || c == WEOF) {
137 goto MOT;
138 }
139 goto error;
140
141 initLV1:
142 if (c == L' ' || c == L'\t') {
143 c = fgetwc(source);
144 goto initLV1;
145 }
146 if (c == L'\n') {
147 c = fgetwc(source);
148 goto initLV1LV2;
149 }
150 if (istAlpha()) {
151 token[tokenFound].value[i] = c;
152 i++;
153 c = fgetwc(source);
154 goto M1;
155 }
156 if (c == L'=') {
157 c = fgetwc(source);
158 goto S1SS1;
159 }
160 if (c == L'>') {
161 c = fgetwc(source);
162 goto MC1;
163 }
164 if (c == WEOF) {
165 goto FIN;
166 }
167 goto error;
168
169 initLV1LV2:
170 if (isSeparator()) {
171 c = fgetwc(source);
172 goto initLV1LV2;
173 }
174 if (istAlpha()) {
175 goto NPARA;
176 }
177 if (c == L'>') {
178 c = fgetwc(source);
179 goto MC1;
180 }
181 if (c == L'=') {
182 c = fgetwc(source);
183 goto S1SS1;
184 }
185 if (c == WEOF) {
186 goto FIN;
187 }
188 goto error;
189
190 NPARA:
191 tokenType = NPARA;
192 return EXIT_SUCCESS;
193
194 MOT:
195 tokenType = MOT;
196 return EXIT_SUCCESS;
197
198 MC2:
199 if (isSeparator() || c == WEOF) {
200 goto MOTCLE;
201 }
202 goto error;
203
204 MOTCLE:
205 tokenType = MOTCLE;
206 return EXIT_SUCCESS;
207
208 FIN:
209 tokenType = FIN;
210 return EXIT_SUCCESS;
211
212 error:
213 tokenType = FIN;
214 return EXIT_FAILURE;
215 }
216
217 int main() {
218 // Ouvre le fichier test.txt en lecture seulement (le fichier doit exister) :
219 source = fopen("test.txt", "r+");
220 // Cree et ouvre un fichier target.html en lecture/ecriture
221 // avec suppression du contenu au prealable :
222 target = fopen("target.html", "w+");
223
224 if (source == NULL) {
225 printf("Impossible d'ouvrir le fichier source\n");
226 return -1;
227 }
228
229 if (target == NULL) {
230 printf("Impossible d'ouvrir le fichier target\n");
231 return -1;
232 }
233
234 c = fgetwc(source); // lecture du premier caractere
235 do {
236 int scanrt = scanner();
237 if (scanrt == EXIT_FAILURE) {
238 wprintf(L"Scanner error with token value: %ls\n", token[tokenFound].value);
239 exit(EXIT_FAILURE);
240 }
241 if (tokenType == MOT || tokenType == MOTCLE) {
242 wprintf(L"%20s: %ls\n", tokenTypestr[tokenType], token[tokenFound].value);
243 } else {
244 wprintf(L"%20s\n", tokenTypestr[tokenType]);
245 }
246 token[tokenFound].type = tokenTypestr[tokenType];
247 tokenFound++;
248 } while (tokenType != FIN); // tant que la fin du fichier n'est pas atteinte
249
250 if (source != NULL) fclose(source); // fermeture du fichier source
251 if (target != NULL) fclose(target); // fermeture du fichier target
252
253 return EXIT_SUCCESS;
254 }