Update README.md
[sm64pc.git] / tools / textconv.c
blob595835aeba86dd70444b25ecf322ae56eb7791a9
1 #include <ctype.h>
2 #include <errno.h>
3 #include <stdarg.h>
4 #include <stdint.h>
5 #include <stdlib.h>
6 #include <stdio.h>
7 #include <string.h>
9 #include "hashtable.h"
10 #include "utf8.h"
// Number of elements in a true array (never pass a pointer). The argument is
// parenthesized before indexing so that any array-valued expression expands
// with the intended precedence.
#define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0]))
// One charmap rule: a short sequence of Unicode code points together with the
// output bytes that the sequence is converted to.
struct CharmapEntry
{
    // Code points making up the key; only the first `length` slots are meaningful.
    uint32_t unicode[3];
    // Number of code points stored in `unicode`. TODO: use dynamic memory allocation
    int length;
    // Number of output bytes stored in `bytes`.
    int bytesCount;
    // Bytes the code point sequence converts to (e.g. 'A' = 0x0A).
    uint8_t bytes[2];
};
// Table of CharmapEntry records keyed by their unicode sequence. It is created
// in main(), filled by read_charmap() and queried by convert_string().
static struct HashTable *charmap;
// Prints "error: " followed by the printf-style message and a newline to
// stderr, then terminates the process with exit status 1. Does not return.
static void fatal_error(const char *msgfmt, ...)
{
    va_list ap;

    va_start(ap, msgfmt);
    fputs("error: ", stderr);
    vfprintf(stderr, msgfmt, ap);
    fputs("\n", stderr);
    va_end(ap);

    exit(1);
}
// Reports a problem found while parsing `filename`: writes
// "<filename>: line <lineNum>: <message>" plus a newline to stderr, where the
// message is formatted printf-style, and then exits with status 1.
// Does not return.
static void parse_error(const char *filename, int lineNum, const char *msgfmt, ...)
{
    va_list ap;

    va_start(ap, msgfmt);
    fprintf(stderr, "%s: line %i: ", filename, lineNum);
    vfprintf(stderr, msgfmt, ap);
    fputs("\n", stderr);
    va_end(ap);

    exit(1);
}
// Reads the whole file and returns a null-terminated buffer with its contents.
//
// The buffer is allocated with malloc(); the caller owns it and releases it
// with free(). Every failure (open, seek, allocation, read) is reported through
// fatal_error(), which terminates the program, so NULL is never returned.
// An empty file is valid and yields a buffer holding only the terminator.
void *read_text_file(const char *filename)
{
    FILE *file = fopen(filename, "rb");
    uint8_t *buffer;
    long fileSize;
    size_t size;

    if (file == NULL)
        fatal_error("failed to open file '%s' for reading: %s", filename, strerror(errno));

    // get size
    if (fseek(file, 0, SEEK_END) != 0)
        fatal_error("failed to seek in file '%s': %s", filename, strerror(errno));
    fileSize = ftell(file);
    if (fileSize < 0)
        fatal_error("failed to determine size of file '%s': %s", filename, strerror(errno));
    size = (size_t)fileSize;

    // allocate buffer (one extra byte for the terminator)
    buffer = malloc(size + 1);
    if (buffer == NULL)
        fatal_error("failed to allocate %lu bytes for file '%s'", (unsigned long)(size + 1), filename);

    // read file
    if (fseek(file, 0, SEEK_SET) != 0)
        fatal_error("failed to seek in file '%s': %s", filename, strerror(errno));
    // Count individual bytes instead of one item of `size` bytes: with a zero
    // item size fread() always returns 0, which would reject empty files.
    if (fread(buffer, 1, size, file) != size)
        fatal_error("error reading from file '%s': %s", filename, strerror(errno));

    // null-terminate the buffer
    buffer[size] = 0;

    fclose(file);

    return buffer;
}
// Returns a pointer to the first character of str that is not whitespace
// (as classified by isspace). Stops at the terminating null at the latest.
static char *skip_whitespace(char *str)
{
    // The cast is required: passing a negative char (any UTF-8 byte >= 0x80
    // where char is signed) to isspace() is undefined behavior.
    while (isspace((unsigned char)*str))
        str++;
    return str;
}
// Cuts the first line off str by overwriting its '\n' with a null byte and
// returns a pointer to the character following it (the start of the next line).
// If str contains no '\n', nothing is modified and a pointer to the string's
// terminating null is returned.
static char *line_split(char *str)
{
    char *newline = strchr(str, '\n');

    if (newline == NULL)
        return str + strlen(str); // end of string

    *newline = 0; // terminate line
    return newline + 1;
}
// Parses an integer at the start of str. The base is auto-detected by strtol
// (decimal, 0x hexadecimal, leading-0 octal).
//
// On success, stores the value in *num and returns a pointer to the first
// character after the number. If no number could be parsed, stores 0 in *num
// and returns NULL.
//
// A value that does not fit in an unsigned int (negative, too large, or out of
// strtol's range) is stored as the maximum unsigned int rather than being
// truncated, so that a caller's upper-bound check reliably rejects it.
static char *parse_number(const char *str, unsigned int *num)
{
    const unsigned int maxValue = ~0u;
    char *endptr;
    long parsed;

    errno = 0;
    parsed = strtol(str, &endptr, 0);

    if (endptr == str)
    {
        *num = 0;
        return NULL;
    }

    if (errno == ERANGE || parsed < 0 || (unsigned long)parsed > maxValue)
        *num = maxValue;
    else
        *num = (unsigned int)parsed;

    return endptr;
}
// Returns nonzero if c may appear in a C identifier (alphanumeric or '_').
static int is_identifier_char(char c)
{
    // Cast first: isalnum() has undefined behavior for negative arguments,
    // which a signed char holding a non-ASCII byte would produce.
    return isalnum((unsigned char)c) || c == '_';
}
// Maps the character following a backslash to the character the escape
// sequence stands for (e.g. 'n' -> '\n'). Returns 0 for anything that is not
// one of the supported escapes: a b f n r t v \ ' "
static int get_escape_char(int c)
{
    switch (c)
    {
    case 'a':
        return '\a';
    case 'b':
        return '\b';
    case 'f':
        return '\f';
    case 'n':
        return '\n';
    case 'r':
        return '\r';
    case 't':
        return '\t';
    case 'v':
        return '\v';
    case '\\':
        return '\\';
    case '\'':
        return '\'';
    case '"':
        return '"';
    default:
        return 0;
    }
}
143 static void read_charmap(const char *filename)
145 char *filedata = read_text_file(filename);
146 char *line = filedata;
147 int lineNum = 1;
149 while (line[0] != 0)
151 char *nextLine = line_split(line);
153 struct CharmapEntry entry;
155 line = skip_whitespace(line);
156 if (line[0] != 0 && line[0] != '#') // ignore empty lines and comments
158 int len = 0;
159 /* Read Character */
161 // opening quote
162 if (*line != '\'')
163 parse_error(filename, lineNum, "expected '");
164 line++;
166 // perform analysis of charmap entry, we are in the quote
167 while(1)
169 if(*line == '\'')
171 line++;
172 break;
174 else if(len == ARRAY_COUNT(entry.unicode))
176 // TODO: Use dynamic memory allocation so this is unnecessary.
177 parse_error(filename, lineNum, "string limit exceeded");
179 else if (*line == '\\')
181 line++; // advance to get the character being escaped
182 if (*line == '\r')
183 line++;
184 if (*line == '\n')
186 // Backslash at end of line is ignored
187 continue;
189 entry.unicode[len] = get_escape_char(*line);
190 if (entry.unicode[len] == 0)
191 parse_error(filename, lineNum, "unknown escape sequence \\%c", *line);
192 line++; // increment again to get past the escape sequence.
194 else
196 line = utf8_decode(line, &entry.unicode[len]);
197 if (line == NULL)
198 parse_error(filename, lineNum, "invalid UTF8");
200 len++;
202 entry.length = len;
204 // equals sign
205 line = skip_whitespace(line);
206 if (*line != '=')
207 parse_error(filename, lineNum, "expected = after character \\%c", *line);
208 line++;
210 entry.bytesCount = 0;
212 // value
213 while (1)
215 uint32_t value;
217 if (entry.bytesCount >= 2)
218 parse_error(filename, lineNum, "more than 2 values specified");
220 line = skip_whitespace(line);
222 line = parse_number(line, &value);
223 if (line == NULL)
224 parse_error(filename, lineNum, "expected number after =");
225 if (value > 0xFF)
226 parse_error(filename, lineNum, "0x%X is larger than 1 byte", value);
228 entry.bytes[entry.bytesCount] = value;
229 entry.bytesCount++;
231 line = skip_whitespace(line);
232 if (*line == 0)
233 break;
234 if (*line != ',')
235 parse_error(filename, lineNum, "junk at end of line");
236 line++;
239 if (hashtable_query(charmap, &entry) != NULL)
240 parse_error(filename, lineNum, "entry for character already exists");
241 hashtable_insert(charmap, &entry);
244 line = nextLine;
245 lineNum++;
248 free(filedata);
// Returns the 1-based line number of `pos` within the text beginning at
// `start`, i.e. one plus the number of '\n' characters in [start, pos).
static int count_line_num(const char *start, const char *pos)
{
    int newlines = 0;
    const char *cursor = start;

    while (cursor < pos)
    {
        if (*cursor == '\n')
            newlines++;
        cursor++;
    }

    return newlines + 1;
}
264 static char *convert_string(char *pos, FILE *fout, const char *inputFileName, char *start, int uncompressed)
266 int hasString = 0;
268 while (1)
270 pos = skip_whitespace(pos);
271 if (*pos == ')')
273 if (hasString)
274 break;
275 else
276 parse_error(inputFileName, count_line_num(start, pos), "expected quoted string after '_('");
278 else if (*pos != '"')
279 parse_error(inputFileName, count_line_num(start, pos), "unexpected character '%c'", *pos);
280 pos++;
282 hasString = 1;
284 // convert quoted string
285 while (*pos != '"')
287 struct CharmapEntry input;
288 struct CharmapEntry *last_valid_entry = NULL;
289 struct CharmapEntry *entry;
290 int i, c;
291 int length = 0;
292 char* last_valid_pos = NULL;
294 // safely erase the unicode area before use
295 memset(input.unicode, 0, sizeof (input.unicode));
296 input.length = 0;
298 // Find a charmap entry of longest length possible starting from this position
299 while (*pos != '"')
301 if ((uncompressed && length == 1) || length == ARRAY_COUNT(entry->unicode))
303 // Stop searching after length 3; we only support strings of lengths up
304 // to that right now. Unless uncompressed is set, in which we ignore multi
305 // texts by discarding entries longer than 1.
306 break;
309 if (*pos == 0)
310 parse_error(inputFileName, count_line_num(start, pos), "EOF in string literal");
311 if (*pos == '\\')
313 pos++;
314 c = get_escape_char(*pos);
315 if (c == 0)
316 parse_error(inputFileName, count_line_num(start, pos), "unknown escape sequence \\%c", *pos);
317 input.unicode[length] = c;
318 pos++;
320 else
322 pos = utf8_decode(pos, &input.unicode[length]);
323 if (pos == NULL)
324 parse_error(inputFileName, count_line_num(start, pos), "invalid unicode encountered in file");
326 length++;
327 input.length = length;
329 entry = hashtable_query(charmap, &input);
330 if (entry != NULL)
332 last_valid_entry = entry;
333 last_valid_pos = pos;
337 entry = last_valid_entry;
338 pos = last_valid_pos;
339 if (entry == NULL)
340 parse_error(inputFileName, count_line_num(start, pos), "no charmap entry for U+%X", input.unicode[0]);
341 for (i = 0; i < entry->bytesCount; i++)
342 fprintf(fout, "0x%02X,", entry->bytes[i]);
344 pos++; // skip over closing '"'
346 pos++; // skip over closing ')'
347 fputs("0xFF", fout);
348 return pos;
// Copies infilename to outfilename, replacing every `_( "..." )` and
// `__( "..." )` invocation with the byte list produced by convert_string().
// The extra underscore selects uncompressed conversion.
//
// `//` comments, `/* */` comments and ordinary string literals are copied
// verbatim and never scanned for `_(`. A failure to open the output file or to
// write to it is reported through fatal_error(), which terminates the program.
static void convert_file(const char *infilename, const char *outfilename)
{
    char *in = read_text_file(infilename);
    FILE *fout = fopen(outfilename, "wb");
    int writeFailed;

    if (fout == NULL)
        fatal_error("failed to open file '%s' for writing: %s", outfilename, strerror(errno));

    char *start = in; // beginning of the text not yet written out
    char *end = in;
    char *pos = in;

    while (1)
    {
        if (*pos == 0) // end of file
            goto eof;

        // check for comment
        if (*pos == '/')
        {
            pos++;
            // skip over // comment
            if (*pos == '/')
            {
                pos++;
                // skip over next newline
                while (*pos != '\n')
                {
                    if (*pos == 0)
                        goto eof;
                    pos++;
                }
                pos++;
            }
            // skip over /* */ comment
            else if (*pos == '*')
            {
                pos++;
                // Stop only at the two-character sequence "*/". pos[1] is read
                // only when pos[0] is '*', so it never reads past the terminator.
                while (!(pos[0] == '*' && pos[1] == '/'))
                {
                    if (*pos == 0)
                        goto eof;
                    pos++;
                }
                pos += 2;
            }
        }
        // skip over normal string literal
        else if (*pos == '"')
        {
            pos++;
            while (*pos != '"')
            {
                if (*pos == 0)
                    goto eof;
                if (*pos == '\\')
                {
                    pos++;
                    // a backslash directly before the end of input must not
                    // carry the scan past the terminator
                    if (*pos == 0)
                        goto eof;
                }
                pos++;
            }
            pos++;
        }
        // check for _( sequence
        else if ((*pos == '_') && (pos == in || !is_identifier_char(pos[-1])))
        {
            int uncompressed = 0;
            end = pos;
            pos++;
            if (*pos == '_') // an extra _ signifies uncompressed strings. Enable uncompressed flag
            {
                pos++;
                uncompressed = 1;
            }
            if (*pos == '(')
            {
                pos++;
                // flush the untouched text preceding the macro, then the conversion
                fwrite(start, 1, end - start, fout);
                pos = convert_string(pos, fout, infilename, in, uncompressed);
                start = pos;
            }
        }
        else
        {
            pos++;
        }
    }

eof:
    fwrite(start, 1, pos - start, fout);

    // Buffered output may only fail when flushed, so check both the stream's
    // error flag and the result of fclose().
    writeFailed = ferror(fout);
    if (fclose(fout) != 0)
        writeFailed = 1;
    free(in);

    if (writeFailed)
        fatal_error("error writing to file '%s'", outfilename);
}
443 static unsigned int charmap_hash(const void *value)
445 const struct CharmapEntry* entry = value;
446 unsigned int ret = 0;
447 for (int i = 0; i < entry->length; i++)
448 ret = ret * 17 + entry->unicode[i];
449 return ret;
452 static int charmap_cmp(const void *a, const void *b)
454 const struct CharmapEntry *ea = a;
455 const struct CharmapEntry *eb = b;
456 if (ea->length != eb->length)
457 return 0;
458 for(int i = 0; i < ea->length; i++)
459 if(ea->unicode[i] != eb->unicode[i])
460 return 0;
461 return 1;
// Writes the command-line synopsis to stderr; execName is shown as the
// program name.
static void usage(const char *execName)
{
    fprintf(stderr, "Usage: %s CHARMAP INPUT OUTPUT\n", execName);
}
469 int main(int argc, char **argv)
471 if (argc != 4)
473 usage(argv[0]);
474 return 1;
477 charmap = hashtable_new(charmap_hash, charmap_cmp, 256, sizeof(struct CharmapEntry));
479 read_charmap(argv[1]);
480 convert_file(argv[2], argv[3]);
482 hashtable_free(charmap);
484 return 0;