(DISTFILES): Comment out a few missing files.
[mono-project.git] / mcs / class / I18N / tools / ucm2cp.c
blobfa43452b0789c1783227d80aad26fff9d2947e48
1 /*
2 * ucm2cp.c - Convert IBM ".ucm" files into code page handling classes.
4 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
27 Usage: ucm2cp [options] file
29 --region name I18N region name
30 --page num Code page number
31 --wpage num Windows code page number (optional)
32 --name str Human-readable encoding name
33 --webname str Web name of the encoding
34 --headername str Header name of the encoding (optional)
35 --bodyname str Body name of the encoding (optional)
36 --no-browser-display Set browser display value to false (optional)
37 --no-browser-save Set browser save value to false (optional)
38 --no-mailnews-display Set mail/news display value to false (optional)
39 --no-mailnews-save Set mail/news save value to false (optional)
43 #include <stdio.h>
44 #include <string.h>
45 #include <stdlib.h>
/*
 * Option values.  Each variable holds the value of one command-line
 * option; the initializers are the defaults that apply when the
 * option is not given.
 */
static char *region = NULL;				/* --region */
static int codePage = 0;				/* --page */
static int windowsCodePage = 0;			/* --wpage */
static char *name = NULL;				/* --name */
static char *webName = NULL;			/* --webname */
static char *headerName = NULL;			/* --headername */
static char *bodyName = NULL;			/* --bodyname */
static int isBrowserDisplay = 1;		/* cleared by --no-browser-display */
static int isBrowserSave = 1;			/* cleared by --no-browser-save */
static int isMailNewsDisplay = 1;		/* cleared by --no-mailnews-display */
static int isMailNewsSave = 1;			/* cleared by --no-mailnews-save */
static const char *filename = NULL;		/* name of the input file */

/*
 * Forward declarations.
 */
static void usage(char *progname);
static void loadCharMaps(FILE *file);
static void printHeader(void);
static void printFooter(void);
static void printByteToChar(void);
static void printCharToByte(void);
73 int main(int argc, char *argv[])
75 char *progname = argv[0];
76 FILE *file;
77 int len;
79 /* Process the command-line options */
80 while(argc > 1 && argv[1][0] == '-')
82 if(!strcmp(argv[1], "--page") && argc > 2)
84 codePage = atoi(argv[2]);
85 ++argv;
86 --argc;
88 else if(!strcmp(argv[1], "--wpage") && argc > 2)
90 windowsCodePage = atoi(argv[2]);
91 ++argv;
92 --argc;
94 else if(!strcmp(argv[1], "--region") && argc > 2)
96 region = argv[2];
97 ++argv;
98 --argc;
100 else if(!strcmp(argv[1], "--name") && argc > 2)
102 name = argv[2];
103 ++argv;
104 --argc;
106 else if(!strcmp(argv[1], "--webname") && argc > 2)
108 webName = argv[2];
109 ++argv;
110 --argc;
112 else if(!strcmp(argv[1], "--headername") && argc > 2)
114 headerName = argv[2];
115 ++argv;
116 --argc;
118 else if(!strcmp(argv[1], "--bodyname") && argc > 2)
120 bodyName = argv[2];
121 ++argv;
122 --argc;
124 else if(!strcmp(argv[1], "--no-browser-display"))
126 isBrowserDisplay = 0;
128 else if(!strcmp(argv[1], "--no-browser-save"))
130 isBrowserSave = 0;
132 else if(!strcmp(argv[1], "--no-mailnews-display"))
134 isMailNewsDisplay = 0;
136 else if(!strcmp(argv[1], "--no-mailnews-save"))
138 isMailNewsSave = 0;
140 ++argv;
141 --argc;
144 /* Make sure that we have sufficient options */
145 if(!region || !codePage || !name || !webName || argc != 2)
147 usage(progname);
148 return 1;
151 /* Set defaults for unspecified options */
152 if(!headerName)
154 headerName = webName;
156 if(!bodyName)
158 bodyName = webName;
160 if(!windowsCodePage)
162 windowsCodePage = codePage;
165 /* Open the UCM file */
166 file = fopen(argv[1], "r");
167 if(!file)
169 perror(argv[1]);
170 return 1;
172 filename = argv[1];
173 len = strlen(filename);
174 while(len > 0 && filename[len - 1] != '/' && filename[len - 1] != '\\')
176 --len;
178 filename += len;
180 /* Load the character maps from the input file */
181 loadCharMaps(file);
183 /* Print the output header */
184 printHeader();
186 /* Print the byte->char conversion table */
187 printByteToChar();
189 /* Output the char->byte conversion methods */
190 printCharToByte();
192 /* Print the output footer */
193 printFooter();
195 /* Clean up and exit */
196 fclose(file);
197 return 0;
/*
 * Write the command-line help text to standard error.  "progname"
 * is inserted into the first line as the name of the program.
 */
static void usage(char *progname)
{
	static const char *const optionHelp[] = {
		" --region name I18N region name\n",
		" --page num Code page number\n",
		" --wpage num Windows code page number (optional)\n",
		" --name str Human-readable encoding name\n",
		" --webname str Web name of the encoding\n",
		" --headername str Header name of the encoding (optional)\n",
		" --bodyname str Body name of the encoding (optional)\n",
		" --no-browser-display Set browser display value to false (optional)\n",
		" --no-browser-save Set browser save value to false (optional)\n",
		" --no-mailnews-display Set mail/news display value to false (optional)\n",
		" --no-mailnews-save Set mail/news save value to false (optional)\n"
	};
	size_t index;

	fprintf(stderr, "Usage: %s [options] file\n\n", progname);
	for(index = 0; index < sizeof(optionHelp) / sizeof(optionHelp[0]); ++index)
	{
		fputs(optionHelp[index], stderr);
	}
}
/*
 * Sizes of the conversion tables: one slot per byte value and one
 * slot per 16-bit character value.
 */
enum
{
	BYTE_TABLE_SIZE = 256,
	CHAR_TABLE_SIZE = 65536
};

/*
 * Byte -> character table, indexed by byte value.  The parallel
 * "level" table records the level of the mapping currently stored
 * for each byte; it is used to decide which character mapping is
 * the most likely one when a byte has more than one.
 */
static unsigned byteToChar[BYTE_TABLE_SIZE];
static int byteToCharLevel[BYTE_TABLE_SIZE];

/*
 * Character -> byte table, indexed by character value.
 */
static int charToByte[CHAR_TABLE_SIZE];
/*
 * Parse a hexadecimal value from the start of "buf".
 *
 * Digits '0'-'9', 'A'-'F' and 'a'-'f' are consumed until the first
 * character that is not a hexadecimal digit (or the end of the string).
 * The parsed value is stored in "*value"; it is 0 when no digits
 * were found.  A value too large for an "unsigned long" is stored
 * as the largest "unsigned long" rather than being allowed to wrap
 * around, so that an over-long number can never look like a small
 * one to a caller's range check.
 *
 * Returns the number of characters that were consumed.
 */
static int parseHex(const char *buf, unsigned long *value)
{
	/* Largest accumulator value that can still take another digit */
	const unsigned long shiftLimit = (~0UL) >> 4;
	unsigned long result = 0;
	int overflowed = 0;
	int len = 0;

	for(;;)
	{
		char ch = buf[len];
		unsigned long digit;

		if(ch >= '0' && ch <= '9')
		{
			digit = (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			digit = (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			digit = (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			break;
		}

		if(result > shiftLimit)
		{
			/* Keep consuming digits so the returned length is right,
			   but stop accumulating */
			overflowed = 1;
		}
		else
		{
			result = (result << 4) | digit;
		}
		++len;
	}

	*value = (overflowed ? ~0UL : result);
	return len;
}
261 * Load the character mapping information from a UCM file.
263 static void loadCharMaps(FILE *file)
265 unsigned long posn;
266 unsigned long byteValue;
267 int level;
268 char buffer[BUFSIZ];
269 const char *buf;
271 /* Initialize the mapping tables */
272 for(posn = 0; posn < 256; ++posn)
274 byteToChar[posn] = (unsigned)'?';
275 byteToCharLevel[posn] = 100;
277 for(posn = 0; posn < 65536; ++posn)
279 charToByte[posn] = -1;
282 /* Read the contents of the file */
283 while(fgets(buffer, BUFSIZ, file))
285 /* Lines of interest begin with "<U" */
286 if(buffer[0] != '<' || buffer[1] != 'U')
288 continue;
291 /* Parse the fields on the line */
292 buf = buffer + 2;
293 buf += parseHex(buf, &posn);
294 if(posn >= 65536)
296 continue;
298 while(*buf != '\0' && *buf != '\\')
300 ++buf;
302 if(*buf != '\\' || buf[1] != 'x')
304 continue;
306 buf += 2;
307 buf += parseHex(buf, &byteValue);
308 if(byteValue >= 256)
310 continue;
312 while(*buf != '\0' && *buf != '|')
314 ++buf;
316 if(*buf != '|')
318 continue;
320 level = (int)(buf[1] - '0');
322 /* Update the byte->char mapping table */
323 if(level < byteToCharLevel[byteValue])
325 byteToCharLevel[byteValue] = level;
326 byteToChar[byteValue] = (unsigned)posn;
329 /* Update the char->byte mapping table */
330 charToByte[posn] = (int)byteValue;
/*
 * Body of the copyright comment that is written into each generated
 * file.  The text starts part-way through a comment (the opening
 * line is printed separately by "printHeader") and ends by closing
 * the comment and emitting a blank line.
 */
#define COPYRIGHT_MSG \
" *\n" \
" * Copyright (c) 2002 Southern Storm Software, Pty Ltd\n" \
" *\n" \
" * Permission is hereby granted, free of charge, to any person obtaining\n" \
" * a copy of this software and associated documentation files (the \"Software\"),\n" \
" * to deal in the Software without restriction, including without limitation\n" \
" * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n" \
" * and/or sell copies of the Software, and to permit persons to whom the\n" \
" * Software is furnished to do so, subject to the following conditions:\n" \
" *\n" \
" * The above copyright notice and this permission notice shall be included\n" \
" * in all copies or substantial portions of the Software.\n" \
" *\n" \
" * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n" \
" * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" \
" * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n" \
" * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n" \
" * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n" \
" * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" \
" * OTHER DEALINGS IN THE SOFTWARE.\n" \
" */\n\n"
358 * Print the header for the current code page definition.
360 static void printHeader(void)
362 printf("/*\n * CP%d.cs - %s code page.\n", codePage, name);
363 fputs(COPYRIGHT_MSG, stdout);
364 printf("// Generated from \"%s\".\n\n", filename);
365 printf("namespace I18N.%s\n{\n\n", region);
366 printf("using System;\n");
367 printf("using I18N.Common;\n\n");
368 printf("public class CP%d : ByteEncoding\n{\n", codePage);
369 printf("\tpublic CP%d()\n", codePage);
370 printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage, name);
371 printf("\t\t \"%s\", \"%s\", \"%s\",\n",
372 bodyName, headerName, webName);
373 printf("\t\t %s, %s, %s, %s, %d)\n",
374 (isBrowserDisplay ? "true" : "false"),
375 (isBrowserSave ? "true" : "false"),
376 (isMailNewsDisplay ? "true" : "false"),
377 (isMailNewsSave ? "true" : "false"),
378 windowsCodePage);
379 printf("\t{}\n\n");
/*
 * Print an encoding name, adjusted to look like a type name:
 * upper case ASCII letters are written in lower case, '-' is
 * written as '_', and every other character is written unchanged.
 */
static void printEncodingName(const char *name)
{
	const char *cursor;

	for(cursor = name; *cursor != '\0'; ++cursor)
	{
		int out = *cursor;

		if(out >= 'A' && out <= 'Z')
		{
			out = out - 'A' + 'a';
		}
		else if(out == '-')
		{
			out = '_';
		}
		putc(out, stdout);
	}
}
406 * Print the footer for the current code page definition.
408 static void printFooter(void)
410 printf("}; // class CP%d\n\n", codePage);
411 printf("public class ENC");
412 printEncodingName(webName);
413 printf(" : CP%d\n{\n", codePage);
414 printf("\tpublic ENC");
415 printEncodingName(webName);
416 printf("() : base() {}\n\n");
417 printf("}; // class ENC");
418 printEncodingName(webName);
419 printf("\n\n}; // namespace I18N.%s\n", region);
423 * Print the byte->char conversion table.
425 static void printByteToChar(void)
427 int posn;
428 printf("\tprivate static readonly char[] ToChars = {");
429 for(posn = 0; posn < 256; ++posn)
431 if((posn % 6) == 0)
433 printf("\n\t\t");
435 printf("'\\u%04X', ", byteToChar[posn]);
437 printf("\n\t};\n\n");
441 * Print a "switch" statement that converts "ch" from
442 * a character value into a byte value.
444 static void printConvertSwitch(void)
446 unsigned long directLimit;
447 unsigned long posn;
448 unsigned long posn2;
449 unsigned long rangeSize;
450 int haveDirect;
451 int haveFullWidth;
453 /* Find the limit of direct byte mappings */
454 directLimit = 0;
455 while(directLimit < 256 && charToByte[directLimit] == (int)directLimit)
457 ++directLimit;
460 /* Determine if we have the full-width Latin1 mappings, which
461 we can optimise in the default case of the switch */
462 haveFullWidth = 1;
463 for(posn = 0xFF01; posn <= 0xFF5E; ++posn)
465 if((charToByte[posn] - 0x21) != (int)(posn - 0xFF01))
467 haveFullWidth = 0;
471 /* Print the switch header. The "if" is an optimisation
472 to ignore the common case of direct ASCII mappings */
473 printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit);
474 printf("\t\t\t{\n");
476 /* Handle all direct byte mappings above the direct limit */
477 haveDirect = 0;
478 for(posn = directLimit; posn < 256; ++posn)
480 if(charToByte[posn] == (int)posn)
482 haveDirect = 1;
483 printf("\t\t\t\tcase 0x%04lX:\n", posn);
486 if(haveDirect)
488 printf("\t\t\t\t\tbreak;\n");
491 /* Handle the indirect mappings */
492 for(posn = 0; posn < 65536; ++posn)
494 if(haveFullWidth && posn >= 0xFF01 && posn <= 0xFF5E)
496 /* Handle full-width Latin1 conversions later */
497 continue;
499 if(charToByte[posn] != (int)posn &&
500 charToByte[posn] != -1)
502 /* See if we have a run of 4 or more characters that
503 can be mapped algorithmically to some other range */
504 rangeSize = 1;
505 for(posn2 = posn + 1; posn2 < 65536; ++posn2)
507 if(charToByte[posn2] == (int)posn2 ||
508 charToByte[posn2] == -1)
510 break;
512 if((charToByte[posn2] - charToByte[posn]) !=
513 (int)(posn2 - posn))
515 break;
517 ++rangeSize;
519 if(rangeSize >= 4)
521 /* Output a range mapping for the characters */
522 for(posn2 = posn; posn2 < (posn + rangeSize); ++posn2)
524 printf("\t\t\t\tcase 0x%04lX:\n", posn2);
526 posn += rangeSize - 1;
527 if(((long)posn) >= (long)(charToByte[posn]))
529 printf("\t\t\t\t\tch -= 0x%04lX;\n",
530 (long)(posn - charToByte[posn]));
532 else
534 printf("\t\t\t\t\tch += 0x%04lX;\n",
535 (long)(charToByte[posn] - posn));
537 printf("\t\t\t\t\tbreak;\n");
539 else
541 /* Use a simple non-algorithmic mapping */
542 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
543 posn, (unsigned)(charToByte[posn]));
548 /* Print the switch footer */
549 if(!haveFullWidth)
551 printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
553 else
555 printf("\t\t\t\tdefault:\n");
556 printf("\t\t\t\t{\n");
557 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
558 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
559 printf("\t\t\t\t\telse\n");
560 printf("\t\t\t\t\t\tch = 0x3F;\n");
561 printf("\t\t\t\t}\n");
562 printf("\t\t\t\tbreak;\n");
564 printf("\t\t\t}\n");
/*
 * Print one "ToBytes" method.  "signatureLine" is the first line of
 * the method signature and "fetchLine" is the statement that reads
 * the next character into "ch"; everything else is common to both
 * overloads.
 */
static void printToBytesMethod(const char *signatureLine,
							   const char *fetchLine)
{
	fputs(signatureLine, stdout);
	fputs("\t byte[] bytes, int byteIndex)\n", stdout);
	fputs("\t{\n", stdout);
	fputs("\t\tint ch;\n", stdout);
	fputs("\t\twhile(charCount > 0)\n", stdout);
	fputs("\t\t{\n", stdout);
	fputs(fetchLine, stdout);
	printConvertSwitch();
	fputs("\t\t\tbytes[byteIndex++] = (byte)ch;\n", stdout);
	fputs("\t\t\t--charCount;\n", stdout);
	fputs("\t\t}\n", stdout);
	fputs("\t}\n\n", stdout);
}

/*
 * Print the char->byte conversion methods: one overload that reads
 * from a character buffer, followed by one that reads from a string.
 */
static void printCharToByte(void)
{
	/* Print the conversion method for character buffers */
	printToBytesMethod(
		"\tprotected override void ToBytes(char[] chars, int charIndex, int charCount,\n",
		"\t\t\tch = (int)(chars[charIndex++]);\n");

	/* Print the conversion method for string buffers */
	printToBytesMethod(
		"\tprotected override void ToBytes(String s, int charIndex, int charCount,\n",
		"\t\t\tch = (int)(s[charIndex++]);\n");
}