/*
 * ucm2cp.c - Convert IBM ".ucm" files into code page handling classes.
 *
 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/*

Usage: ucm2cp [options] file

	--region name            I18N region name
	--page num               Code page number
	--wpage num              Windows code page number (optional)
	--name str               Human-readable encoding name
	--webname str            Web name of the encoding
	--headername str         Header name of the encoding (optional)
	--bodyname str           Body name of the encoding (optional)
	--no-browser-display     Set browser display value to false (optional)
	--no-browser-save        Set browser save value to false (optional)
	--no-mailnews-display    Set mail/news display value to false (optional)
	--no-mailnews-save       Set mail/news save value to false (optional)

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Option values.  These are filled in by "main" from the command-line
 * and then read by the output routines.  The four display/save flags
 * default to true and are cleared by the "--no-*" options.
 */
static char *region = NULL;
static int codePage = 0;
static int windowsCodePage = 0;
static char *name = NULL;
static char *webName = NULL;
static char *headerName = NULL;
static char *bodyName = NULL;
static int isBrowserDisplay = 1;
static int isBrowserSave = 1;
static int isMailNewsDisplay = 1;
static int isMailNewsSave = 1;
static const char *filename = NULL;

/*
 * Forward declarations.
 */
static void usage(char *progname);
static void loadCharMaps(FILE *file);
static void printHeader(void);
static void printFooter(void);
static void printByteToChar(void);
static void printCharToByte(void);

73 int main(int argc
, char *argv
[])
75 char *progname
= argv
[0];
79 /* Process the command-line options */
80 while(argc
> 1 && argv
[1][0] == '-')
82 if(!strcmp(argv
[1], "--page") && argc
> 2)
84 codePage
= atoi(argv
[2]);
88 else if(!strcmp(argv
[1], "--wpage") && argc
> 2)
90 windowsCodePage
= atoi(argv
[2]);
94 else if(!strcmp(argv
[1], "--region") && argc
> 2)
100 else if(!strcmp(argv
[1], "--name") && argc
> 2)
106 else if(!strcmp(argv
[1], "--webname") && argc
> 2)
112 else if(!strcmp(argv
[1], "--headername") && argc
> 2)
114 headerName
= argv
[2];
118 else if(!strcmp(argv
[1], "--bodyname") && argc
> 2)
124 else if(!strcmp(argv
[1], "--no-browser-display"))
126 isBrowserDisplay
= 0;
128 else if(!strcmp(argv
[1], "--no-browser-save"))
132 else if(!strcmp(argv
[1], "--no-mailnews-display"))
134 isMailNewsDisplay
= 0;
136 else if(!strcmp(argv
[1], "--no-mailnews-save"))
144 /* Make sure that we have sufficient options */
145 if(!region
|| !codePage
|| !name
|| !webName
|| argc
!= 2)
151 /* Set defaults for unspecified options */
154 headerName
= webName
;
162 windowsCodePage
= codePage
;
165 /* Open the UCM file */
166 file
= fopen(argv
[1], "r");
173 len
= strlen(filename
);
174 while(len
> 0 && filename
[len
- 1] != '/' && filename
[len
- 1] != '\\')
180 /* Load the character maps from the input file */
183 /* Print the output header */
186 /* Print the byte->char conversion table */
189 /* Output the char->byte conversion methods */
192 /* Print the output footer */
195 /* Clean up and exit */
/*
 * Print the command-line usage message to standard error.
 * "progname" is the program name to show on the first line.
 */
static void usage(char *progname)
{
	fprintf(stderr, "Usage: %s [options] file\n\n", progname);
	fputs(" --region name I18N region name\n"
	      " --page num Code page number\n"
	      " --wpage num Windows code page number (optional)\n"
	      " --name str Human-readable encoding name\n"
	      " --webname str Web name of the encoding\n"
	      " --headername str Header name of the encoding (optional)\n"
	      " --bodyname str Body name of the encoding (optional)\n"
	      " --no-browser-display Set browser display value to false (optional)\n"
	      " --no-browser-save Set browser save value to false (optional)\n"
	      " --no-mailnews-display Set mail/news display value to false (optional)\n"
	      " --no-mailnews-save Set mail/news save value to false (optional)\n",
	      stderr);
}

/*
 * Map bytes to characters.  The level value is used to determine
 * which char mapping is the most likely if there is more than one.
 */
static unsigned byteToChar[256];
static int byteToCharLevel[256];

/*
 * Map characters to bytes.  An entry of -1 means that the
 * character has no byte mapping.
 */
static int charToByte[65536];

/*
 * Parse a hexadecimal value.  Returns the length
 * of the value that was parsed.
 *
 * Parsing starts at "buf" and stops at the first character that is
 * not a hexadecimal digit (upper or lower case).  "*value" is always
 * written: it is reset to zero first, so it is zero if no digits
 * were found and the return value is 0.
 */
static int parseHex(const char *buf, unsigned long *value)
{
	int len = 0;
	char ch;
	*value = 0;
	while((ch = buf[len]) != '\0')
	{
		if(ch >= '0' && ch <= '9')
		{
			*value = *value * 16 + (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			*value = *value * 16 + (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			*value = *value * 16 + (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			break;
		}
		++len;
	}
	return len;
}

261 * Load the character mapping information from a UCM file.
263 static void loadCharMaps(FILE *file
)
266 unsigned long byteValue
;
271 /* Initialize the mapping tables */
272 for(posn
= 0; posn
< 256; ++posn
)
274 byteToChar
[posn
] = (unsigned)'?';
275 byteToCharLevel
[posn
] = 100;
277 for(posn
= 0; posn
< 65536; ++posn
)
279 charToByte
[posn
] = -1;
282 /* Read the contents of the file */
283 while(fgets(buffer
, BUFSIZ
, file
))
285 /* Lines of interest begin with "<U" */
286 if(buffer
[0] != '<' || buffer
[1] != 'U')
291 /* Parse the fields on the line */
293 buf
+= parseHex(buf
, &posn
);
298 while(*buf
!= '\0' && *buf
!= '\\')
302 if(*buf
!= '\\' || buf
[1] != 'x')
307 buf
+= parseHex(buf
, &byteValue
);
312 while(*buf
!= '\0' && *buf
!= '|')
320 level
= (int)(buf
[1] - '0');
322 /* Update the byte->char mapping table */
323 if(level
< byteToCharLevel
[byteValue
])
325 byteToCharLevel
[byteValue
] = level
;
326 byteToChar
[byteValue
] = (unsigned)posn
;
329 /* Update the char->byte mapping table */
330 charToByte
[posn
] = (int)byteValue
;
/*
 * Copyright notice written into every generated file.  It continues
 * the comment that "printHeader" opens, and closes it at the end.
 */
#define	COPYRIGHT_MSG \
" *\n" \
" * Copyright (c) 2002 Southern Storm Software, Pty Ltd\n" \
" *\n" \
" * Permission is hereby granted, free of charge, to any person obtaining\n" \
" * a copy of this software and associated documentation files (the \"Software\"),\n" \
" * to deal in the Software without restriction, including without limitation\n" \
" * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n" \
" * and/or sell copies of the Software, and to permit persons to whom the\n" \
" * Software is furnished to do so, subject to the following conditions:\n" \
" *\n" \
" * The above copyright notice and this permission notice shall be included\n" \
" * in all copies or substantial portions of the Software.\n" \
" *\n" \
" * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n" \
" * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" \
" * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n" \
" * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n" \
" * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n" \
" * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" \
" * OTHER DEALINGS IN THE SOFTWARE.\n" \
" */\n\n"

358 * Print the header for the current code page definition.
360 static void printHeader(void)
362 printf("/*\n * CP%d.cs - %s code page.\n", codePage
, name
);
363 fputs(COPYRIGHT_MSG
, stdout
);
364 printf("// Generated from \"%s\".\n\n", filename
);
365 printf("namespace I18N.%s\n{\n\n", region
);
366 printf("using System;\n");
367 printf("using I18N.Common;\n\n");
368 printf("public class CP%d : ByteEncoding\n{\n", codePage
);
369 printf("\tpublic CP%d()\n", codePage
);
370 printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage
, name
);
371 printf("\t\t \"%s\", \"%s\", \"%s\",\n",
372 bodyName
, headerName
, webName
);
373 printf("\t\t %s, %s, %s, %s, %d)\n",
374 (isBrowserDisplay
? "true" : "false"),
375 (isBrowserSave
? "true" : "false"),
376 (isMailNewsDisplay
? "true" : "false"),
377 (isMailNewsSave
? "true" : "false"),
/*
 * Print an encoding name, adjusted to look like a type name.
 *
 * Upper case ASCII letters are written in lower case, '-' is
 * written as '_', and every other character is written unchanged.
 */
static void printEncodingName(const char *name)
{
	while(*name != '\0')
	{
		if(*name >= 'A' && *name <= 'Z')
		{
			putc(*name - 'A' + 'a', stdout);
		}
		else if(*name == '-')
		{
			putc('_', stdout);
		}
		else
		{
			putc(*name, stdout);
		}
		++name;
	}
}

406 * Print the footer for the current code page definition.
408 static void printFooter(void)
410 printf("}; // class CP%d\n\n", codePage
);
411 printf("public class ENC");
412 printEncodingName(webName
);
413 printf(" : CP%d\n{\n", codePage
);
414 printf("\tpublic ENC");
415 printEncodingName(webName
);
416 printf("() : base() {}\n\n");
417 printf("}; // class ENC");
418 printEncodingName(webName
);
419 printf("\n\n}; // namespace I18N.%s\n", region
);
423 * Print the byte->char conversion table.
425 static void printByteToChar(void)
428 printf("\tprivate static readonly char[] ToChars = {");
429 for(posn
= 0; posn
< 256; ++posn
)
435 printf("'\\u%04X', ", byteToChar
[posn
]);
437 printf("\n\t};\n\n");
441 * Print a "switch" statement that converts "ch" from
442 * a character value into a byte value.
444 static void printConvertSwitch(void)
446 unsigned long directLimit
;
449 unsigned long rangeSize
;
453 /* Find the limit of direct byte mappings */
455 while(directLimit
< 256 && charToByte
[directLimit
] == (int)directLimit
)
460 /* Determine if we have the full-width Latin1 mappings, which
461 we can optimise in the default case of the switch */
463 for(posn
= 0xFF01; posn
<= 0xFF5E; ++posn
)
465 if((charToByte
[posn
] - 0x21) != (int)(posn
- 0xFF01))
471 /* Print the switch header. The "if" is an optimisation
472 to ignore the common case of direct ASCII mappings */
473 printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit
);
476 /* Handle all direct byte mappings above the direct limit */
478 for(posn
= directLimit
; posn
< 256; ++posn
)
480 if(charToByte
[posn
] == (int)posn
)
483 printf("\t\t\t\tcase 0x%04lX:\n", posn
);
488 printf("\t\t\t\t\tbreak;\n");
491 /* Handle the indirect mappings */
492 for(posn
= 0; posn
< 65536; ++posn
)
494 if(haveFullWidth
&& posn
>= 0xFF01 && posn
<= 0xFF5E)
496 /* Handle full-width Latin1 conversions later */
499 if(charToByte
[posn
] != (int)posn
&&
500 charToByte
[posn
] != -1)
502 /* See if we have a run of 4 or more characters that
503 can be mapped algorithmically to some other range */
505 for(posn2
= posn
+ 1; posn2
< 65536; ++posn2
)
507 if(charToByte
[posn2
] == (int)posn2
||
508 charToByte
[posn2
] == -1)
512 if((charToByte
[posn2
] - charToByte
[posn
]) !=
521 /* Output a range mapping for the characters */
522 for(posn2
= posn
; posn2
< (posn
+ rangeSize
); ++posn2
)
524 printf("\t\t\t\tcase 0x%04lX:\n", posn2
);
526 posn
+= rangeSize
- 1;
527 if(((long)posn
) >= (long)(charToByte
[posn
]))
529 printf("\t\t\t\t\tch -= 0x%04lX;\n",
530 (long)(posn
- charToByte
[posn
]));
534 printf("\t\t\t\t\tch += 0x%04lX;\n",
535 (long)(charToByte
[posn
] - posn
));
537 printf("\t\t\t\t\tbreak;\n");
541 /* Use a simple non-algorithmic mapping */
542 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
543 posn
, (unsigned)(charToByte
[posn
]));
548 /* Print the switch footer */
551 printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
555 printf("\t\t\t\tdefault:\n");
556 printf("\t\t\t\t{\n");
557 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
558 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
559 printf("\t\t\t\t\telse\n");
560 printf("\t\t\t\t\t\tch = 0x3F;\n");
561 printf("\t\t\t\t}\n");
562 printf("\t\t\t\tbreak;\n");
568 * Print the char->byte conversion methods.
570 static void printCharToByte(void)
572 /* Print the conversion method for character buffers */
573 printf("\tprotected override void ToBytes(char[] chars, int charIndex, int charCount,\n");
574 printf("\t byte[] bytes, int byteIndex)\n");
576 printf("\t\tint ch;\n");
577 printf("\t\twhile(charCount > 0)\n");
579 printf("\t\t\tch = (int)(chars[charIndex++]);\n");
580 printConvertSwitch();
581 printf("\t\t\tbytes[byteIndex++] = (byte)ch;\n");
582 printf("\t\t\t--charCount;\n");
586 /* Print the conversion method for string buffers */
587 printf("\tprotected override void ToBytes(String s, int charIndex, int charCount,\n");
588 printf("\t byte[] bytes, int byteIndex)\n");
590 printf("\t\tint ch;\n");
591 printf("\t\twhile(charCount > 0)\n");
593 printf("\t\t\tch = (int)(s[charIndex++]);\n");
594 printConvertSwitch();
595 printf("\t\t\tbytes[byteIndex++] = (byte)ch;\n");
596 printf("\t\t\t--charCount;\n");