Added consultant.
[Samba.git] / source / lib / charset.c
blob6b4f0b07bcbd689d552464acbff9495aaedc5ea6
1 /*
2 Unix SMB/Netbios implementation.
3 Version 1.9.
4 Character set handling
5 Copyright (C) Andrew Tridgell 1992-1997
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 #define CHARSET_C
23 #include "includes.h"
25 extern int DEBUGLEVEL;
/*
 * Codepage definitions.
 *
 * Every table row holds four bytes:
 *   { lower, upper, map_lower_to_upper, map_upper_to_lower }
 * which codepage_initialise() hands to add_dos_char() in that role.
 * A row with upper == 0 only names a single character; the all-zero
 * row marks the end of the table.
 */

/* lower->upper mapping for IBM Code Page 850 - MS-DOS Latin 1 */
unsigned char cp_850[][4] = {
  /* lower upper l->u u->l       character pair            */
  { 0x85, 0xB7, 1, 1 },       /* a grave / A grave           */
  { 0xA0, 0xB5, 1, 1 },       /* a acute / A acute           */
  { 0x83, 0xB6, 1, 1 },       /* a circumflex / A circumflex */
  { 0xC6, 0xC7, 1, 1 },       /* a tilde / A tilde           */
  { 0x84, 0x8E, 1, 1 },       /* a diaeresis / A diaeresis   */
  { 0x86, 0x8F, 1, 1 },       /* a ring / A ring             */
  { 0x91, 0x92, 1, 1 },       /* ae diphthong / AE diphthong */
  { 0x87, 0x80, 1, 1 },       /* c cedilla / C cedilla       */
  { 0x8A, 0xD4, 1, 1 },       /* e grave / E grave           */
  { 0x82, 0x90, 1, 1 },       /* e acute / E acute           */
  { 0x88, 0xD2, 1, 1 },       /* e circumflex / E circumflex */
  { 0x89, 0xD3, 1, 1 },       /* e diaeresis / E diaeresis   */
  { 0x8D, 0xDE, 1, 1 },       /* i grave / I grave           */
  { 0xA1, 0xD6, 1, 1 },       /* i acute / I acute           */
  { 0x8C, 0xD7, 1, 1 },       /* i circumflex / I circumflex */
  { 0x8B, 0xD8, 1, 1 },       /* i diaeresis / I diaeresis   */
  { 0xD0, 0xD1, 1, 1 },       /* Icelandic eth / Eth         */
  { 0xA4, 0xA5, 1, 1 },       /* n tilde / N tilde           */
  { 0x95, 0xE3, 1, 1 },       /* o grave / O grave           */
  { 0xA2, 0xE0, 1, 1 },       /* o acute / O acute           */
  { 0x93, 0xE2, 1, 1 },       /* o circumflex / O circumflex */
  { 0xE4, 0xE5, 1, 1 },       /* o tilde / O tilde           */
  { 0x94, 0x99, 1, 1 },       /* o diaeresis / O diaeresis   */
  { 0x9B, 0x9D, 1, 1 },       /* o slash / O slash           */
  { 0x97, 0xEB, 1, 1 },       /* u grave / U grave           */
  { 0xA3, 0xE9, 1, 1 },       /* u acute / U acute           */
  { 0x96, 0xEA, 1, 1 },       /* u circumflex / U circumflex */
  { 0x81, 0x9A, 1, 1 },       /* u diaeresis / U diaeresis   */
  { 0xEC, 0xED, 1, 1 },       /* y acute / Y acute           */
  { 0xE7, 0xE8, 1, 1 },       /* Icelandic thorn / Thorn     */

  { 0x9C, 0, 0, 0 },          /* Pound (single character)    */
  { 0, 0, 0, 0 }              /* end of table                */
};
/*
 * lower->upper mapping for IBM Code Page 437 - MS-DOS Latin US
 *
 * Row layout: { lower, upper, map_lower_to_upper, map_upper_to_lower }.
 * Rows ending in 1,0 are one-way: the accented lower-case letter is
 * upper-cased to a plain ASCII capital (0x41 'A', 0x45 'E', 0x49 'I',
 * 0x4F 'O', 0x55 'U'), but that capital is not lower-cased back to it.
 * Rows with upper == 0 name a single character with no case mapping.
 */
unsigned char cp_437[][4] = {
  /* lower upper l->u u->l       character(s)                    */
  { 0x87, 0x80, 1, 1 },       /* c cedilla / C cedilla             */
  { 0x81, 0x9A, 1, 1 },       /* u diaeresis / U diaeresis         */
  { 0x82, 0x90, 1, 1 },       /* e acute / E acute                 */
  { 0x83, 0x41, 1, 0 },       /* a circumflex -> 'A'               */
  { 0x84, 0x8E, 1, 1 },       /* a diaeresis / A diaeresis         */
  { 0x85, 0x41, 1, 0 },       /* a grave -> 'A'                    */
  { 0x86, 0x8F, 1, 1 },       /* a ring / 0x8F                     */
  { 0x88, 0x45, 1, 0 },       /* e circumflex -> 'E'               */
  { 0x89, 0x45, 1, 0 },       /* e diaeresis -> 'E'                */
  { 0x8A, 0x45, 1, 0 },       /* e grave -> 'E'                    */
  { 0x8B, 0x49, 1, 0 },       /* i diaeresis -> 'I'                */
  { 0x8C, 0x49, 1, 0 },       /* i circumflex -> 'I'               */
  { 0x8D, 0x49, 1, 0 },       /* i grave -> 'I'                    */
  { 0x91, 0x92, 1, 1 },       /* ae diphthong / AE diphthong       */
  { 0x93, 0x4F, 1, 0 },       /* o circumflex -> 'O'               */
  { 0x94, 0x99, 1, 1 },       /* o diaeresis / O diaeresis         */
  { 0x95, 0x4F, 1, 0 },       /* o grave -> 'O'                    */
  { 0x96, 0x55, 1, 0 },       /* u circumflex -> 'U'               */
  { 0x97, 0x55, 1, 0 },       /* u grave -> 'U'                    */

  /*
   * NOTE(review): 0x98 (152) has no row.  The earlier comment here
   * called it "u diaeresis" (octal 201), which duplicates the 0x81
   * entry; in CP437 0x98 appears to be y diaeresis - confirm whether
   * a single-character row is wanted.
   */

  { 0x9B, 0, 0, 0 },          /* Cent                              */
  { 0x9C, 0, 0, 0 },          /* Pound                             */
  { 0x9D, 0, 0, 0 },          /* Yen                               */
  { 0xA0, 0x41, 1, 0 },       /* a acute -> 'A'                    */
  { 0xA1, 0x49, 1, 0 },       /* i acute -> 'I'                    */
  { 0xA2, 0x4F, 1, 0 },       /* o acute -> 'O'                    */
  { 0xA3, 0x55, 1, 0 },       /* u acute -> 'U'                    */
  { 0xA4, 0xA5, 1, 1 },       /* n tilde / N tilde                 */

  /* Punctuation... */
  { 0xA8, 0, 0, 0 },
  { 0xAD, 0, 0, 0 },
  { 0xAE, 0, 0, 0 },
  { 0xAF, 0, 0, 0 },

  /* Greek character set */
  { 0xE0, 0, 0, 0 },
  { 0xE1, 0, 0, 0 },
  { 0xE2, 0, 0, 0 },
  { 0xE3, 0, 0, 0 },
  { 0xE4, 0, 0, 0 },
  { 0xE5, 0, 0, 0 },
  { 0xE6, 0, 0, 0 },
  { 0xE7, 0, 0, 0 },
  { 0xE8, 0, 0, 0 },
  { 0xE9, 0, 0, 0 },
  { 0xEA, 0, 0, 0 },
  { 0xEB, 0, 0, 0 },
  { 0xEC, 0, 0, 0 },
  { 0xED, 0, 0, 0 },
  { 0xEE, 0, 0, 0 },
  { 0xEF, 0, 0, 0 },

  { 0, 0, 0, 0 }              /* end of table                      */
};
/*
 * Character lookup tables, one slot per 8-bit character value.
 * Each table is a 256-byte backing array plus a pointer through
 * which the rest of this file reads and writes it.
 */

/* non-zero where the character has been registered via add_dos_char() */
char xx_dos_char_map[256];
char *dos_char_map = xx_dos_char_map;

/* upper-case equivalent of each character */
char xx_upper_char_map[256];
char *upper_char_map = xx_upper_char_map;

/* lower-case equivalent of each character */
char xx_lower_char_map[256];
char *lower_char_map = xx_lower_char_map;
170 * This code has been extended to deal with ascynchronous mappings
171 * like MS-DOS Latin US (Code page 437) where things like :
172 * a acute are capitalized to 'A', but the reverse mapping
173 * must not hold true. This allows the filename case insensitive
174 * matching in do_match() to work, as the DOS/Win95/NT client
175 * uses 'A' as a mask to match against characters like a acute.
176 * This is the meaning behind the parameters that allow a
177 * mapping from lower to upper, but not upper to lower.
180 static void add_dos_char(int lower, BOOL map_lower_to_upper,
181 int upper, BOOL map_upper_to_lower)
183 lower &= 0xff;
184 upper &= 0xff;
185 DEBUG(6,("Adding chars 0x%x 0x%x (l->u = %s) (u->l = %s)\n",lower,upper,
186 map_lower_to_upper ? "True" : "False",
187 map_upper_to_lower ? "True" : "False"));
188 if (lower) dos_char_map[lower] = 1;
189 if (upper) dos_char_map[upper] = 1;
190 if (lower && upper) {
191 if(map_upper_to_lower)
192 lower_char_map[upper] = (char)lower;
193 if(map_lower_to_upper)
194 upper_char_map[lower] = (char)upper;
198 /****************************************************************************
199 initialise the charset arrays
200 ****************************************************************************/
201 void charset_initialise()
203 int i;
205 #ifdef LC_ALL
206 /* include <locale.h> in includes.h if available for OS */
207 /* we take only standard 7-bit ASCII definitions from ctype */
208 setlocale(LC_ALL,"C");
209 #endif
211 for (i= 0;i<=255;i++) {
212 dos_char_map[i] = 0;
215 for (i=0;i<=127;i++) {
216 if (isalnum((char)i) || strchr("._^$~!#%&-{}()@'`",(char)i))
217 add_dos_char(i,False,0,False);
220 for (i=0; i<=255; i++) {
221 char c = (char)i;
222 upper_char_map[i] = lower_char_map[i] = c;
223 if (isupper(c)) lower_char_map[i] = tolower(c);
224 if (islower(c)) upper_char_map[i] = toupper(c);
228 /****************************************************************************
229 initialise the client codepage.
230 ****************************************************************************/
231 void codepage_initialise(int client_codepage)
233 int i;
234 unsigned char (*cp)[4] = NULL;
236 DEBUG(6,("codepage_initialise: client code page = %d\n", client_codepage));
239 * Known client codepages - these can be added to.
241 switch(client_codepage)
243 case 850:
244 cp = cp_850;
245 break;
246 case 437:
247 cp = cp_437;
248 break;
249 default:
250 /* Use default codepage - currently 850 */
251 DEBUG(6,("codepage_initialise: Using default client codepage %d\n",
252 850));
253 cp = cp_850;
254 break;
257 if(cp)
259 for(i = 0; (cp[i][0] != '\0') && (cp[i][1] != '\0'); i++)
260 add_dos_char(cp[i][0], (BOOL)cp[i][2], cp[i][1], (BOOL)cp[i][3]);
264 /*******************************************************************
265 add characters depending on a string passed by the user
266 ********************************************************************/
267 void add_char_string(char *s)
269 char *extra_chars = (char *)strdup(s);
270 char *t;
271 if (!extra_chars) return;
273 for (t=strtok(extra_chars," \t\r\n"); t; t=strtok(NULL," \t\r\n")) {
274 char c1=0,c2=0;
275 int i1=0,i2=0;
276 if (isdigit((unsigned char)*t) || (*t)=='-') {
277 sscanf(t,"%i:%i",&i1,&i2);
278 add_dos_char(i1,True,i2,True);
279 } else {
280 sscanf(t,"%c:%c",&c1,&c2);
281 add_dos_char((unsigned char)c1,True,(unsigned char)c2, True);
285 free(extra_chars);