Move items implemented after a2 into the new a3 section
[python.git] / Objects / unicodectype.c
blobb432399a13ba1fe8139dfb2e6559e07dfcaa8173
1 /*
2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
9 */
11 #include "Python.h"
12 #include "unicodeobject.h"
/* Bit flags stored in the `flags` member of a type record.  Each flag
   occupies its own bit so that several can be combined in one value. */
#define ALPHA_MASK      (1 << 0)    /* tested by _PyUnicode_IsAlpha() */
#define DECIMAL_MASK    (1 << 1)    /* record carries a decimal value */
#define DIGIT_MASK      (1 << 2)    /* record carries a digit value */
#define LOWER_MASK      (1 << 3)    /* tested by _PyUnicode_IsLowercase() */
#define LINEBREAK_MASK  (1 << 4)
#define SPACE_MASK      (1 << 5)
#define TITLE_MASK      (1 << 6)    /* tested by _PyUnicode_IsTitlecase() */
#define UPPER_MASK      (1 << 7)    /* tested by _PyUnicode_IsUppercase() */
23 typedef struct {
24 const Py_UNICODE upper;
25 const Py_UNICODE lower;
26 const Py_UNICODE title;
27 const unsigned char decimal;
28 const unsigned char digit;
29 const unsigned short flags;
30 } _PyUnicode_TypeRecord;
32 #include "unicodetype_db.h"
34 static const _PyUnicode_TypeRecord *
35 gettyperecord(Py_UNICODE code)
37 int index;
39 #ifdef Py_UNICODE_WIDE
40 if (code >= 0x110000)
41 index = 0;
42 else
43 #endif
45 index = index1[(code>>SHIFT)];
46 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
49 return &_PyUnicode_TypeRecords[index];
52 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
53 type 'B', 0 otherwise. */
55 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
57 switch (ch) {
58 case 0x000A: /* LINE FEED */
59 case 0x000D: /* CARRIAGE RETURN */
60 case 0x001C: /* FILE SEPARATOR */
61 case 0x001D: /* GROUP SEPARATOR */
62 case 0x001E: /* RECORD SEPARATOR */
63 case 0x0085: /* NEXT LINE */
64 case 0x2028: /* LINE SEPARATOR */
65 case 0x2029: /* PARAGRAPH SEPARATOR */
66 return 1;
67 default:
68 return 0;
72 /* Returns the titlecase Unicode characters corresponding to ch or just
73 ch if no titlecase mapping is known. */
75 Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
77 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
78 int delta;
80 if (ctype->title)
81 delta = ctype->title;
82 else
83 delta = ctype->upper;
85 if (delta >= 32768)
86 delta -= 65536;
88 return ch + delta;
91 /* Returns 1 for Unicode characters having the category 'Lt', 0
92 otherwise. */
94 int _PyUnicode_IsTitlecase(Py_UNICODE ch)
96 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
98 return (ctype->flags & TITLE_MASK) != 0;
101 /* Returns the integer decimal (0-9) for Unicode characters having
102 this property, -1 otherwise. */
104 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
106 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
108 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
111 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
113 if (_PyUnicode_ToDecimalDigit(ch) < 0)
114 return 0;
115 return 1;
118 /* Returns the integer digit (0-9) for Unicode characters having
119 this property, -1 otherwise. */
121 int _PyUnicode_ToDigit(Py_UNICODE ch)
123 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
125 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
128 int _PyUnicode_IsDigit(Py_UNICODE ch)
130 if (_PyUnicode_ToDigit(ch) < 0)
131 return 0;
132 return 1;
135 /* Returns the numeric value as double for Unicode characters having
136 this property, -1.0 otherwise. */
138 /* TODO: replace with unicodetype_db.h table */
140 double _PyUnicode_ToNumeric(Py_UNICODE ch)
142 switch (ch) {
143 case 0x3007:
144 return (double) 0;
145 case 0x09F4:
146 case 0x215F:
147 case 0x2160:
148 case 0x2170:
149 case 0x3021:
150 case 0x3280:
151 return (double) 1;
152 case 0x00BD:
153 return (double) 1 / 2;
154 case 0x2153:
155 return (double) 1 / 3;
156 case 0x00BC:
157 return (double) 1 / 4;
158 case 0x2155:
159 return (double) 1 / 5;
160 case 0x2159:
161 return (double) 1 / 6;
162 case 0x215B:
163 return (double) 1 / 8;
164 case 0x0BF0:
165 case 0x1372:
166 case 0x2169:
167 case 0x2179:
168 case 0x2469:
169 case 0x247D:
170 case 0x2491:
171 case 0x277F:
172 case 0x2789:
173 case 0x2793:
174 case 0x3038:
175 case 0x3289:
176 return (double) 10;
177 case 0x0BF1:
178 case 0x137B:
179 case 0x216D:
180 case 0x217D:
181 return (double) 100;
182 case 0x0BF2:
183 case 0x216F:
184 case 0x217F:
185 case 0x2180:
186 return (double) 1000;
187 case 0x137C:
188 case 0x2182:
189 return (double) 10000;
190 case 0x216A:
191 case 0x217A:
192 case 0x246A:
193 case 0x247E:
194 case 0x2492:
195 return (double) 11;
196 case 0x216B:
197 case 0x217B:
198 case 0x246B:
199 case 0x247F:
200 case 0x2493:
201 return (double) 12;
202 case 0x246C:
203 case 0x2480:
204 case 0x2494:
205 return (double) 13;
206 case 0x246D:
207 case 0x2481:
208 case 0x2495:
209 return (double) 14;
210 case 0x246E:
211 case 0x2482:
212 case 0x2496:
213 return (double) 15;
214 case 0x09F9:
215 case 0x246F:
216 case 0x2483:
217 case 0x2497:
218 return (double) 16;
219 case 0x16EE:
220 case 0x2470:
221 case 0x2484:
222 case 0x2498:
223 return (double) 17;
224 case 0x16EF:
225 case 0x2471:
226 case 0x2485:
227 case 0x2499:
228 return (double) 18;
229 case 0x16F0:
230 case 0x2472:
231 case 0x2486:
232 case 0x249A:
233 return (double) 19;
234 case 0x09F5:
235 case 0x2161:
236 case 0x2171:
237 case 0x3022:
238 case 0x3281:
239 return (double) 2;
240 case 0x2154:
241 return (double) 2 / 3;
242 case 0x2156:
243 return (double) 2 / 5;
244 case 0x1373:
245 case 0x2473:
246 case 0x2487:
247 case 0x249B:
248 case 0x3039:
249 return (double) 20;
250 case 0x09F6:
251 case 0x2162:
252 case 0x2172:
253 case 0x3023:
254 case 0x3282:
255 return (double) 3;
256 case 0x00BE:
257 return (double) 3 / 4;
258 case 0x2157:
259 return (double) 3 / 5;
260 case 0x215C:
261 return (double) 3 / 8;
262 case 0x1374:
263 case 0x303A:
264 return (double) 30;
265 case 0x09F7:
266 case 0x2163:
267 case 0x2173:
268 case 0x3024:
269 case 0x3283:
270 return (double) 4;
271 case 0x2158:
272 return (double) 4 / 5;
273 case 0x1375:
274 return (double) 40;
275 case 0x2164:
276 case 0x2174:
277 case 0x3025:
278 case 0x3284:
279 return (double) 5;
280 case 0x215A:
281 return (double) 5 / 6;
282 case 0x215D:
283 return (double) 5 / 8;
284 case 0x1376:
285 case 0x216C:
286 case 0x217C:
287 return (double) 50;
288 case 0x216E:
289 case 0x217E:
290 return (double) 500;
291 case 0x2181:
292 return (double) 5000;
293 case 0x2165:
294 case 0x2175:
295 case 0x3026:
296 case 0x3285:
297 return (double) 6;
298 case 0x1377:
299 return (double) 60;
300 case 0x2166:
301 case 0x2176:
302 case 0x3027:
303 case 0x3286:
304 return (double) 7;
305 case 0x215E:
306 return (double) 7 / 8;
307 case 0x1378:
308 return (double) 70;
309 case 0x2167:
310 case 0x2177:
311 case 0x3028:
312 case 0x3287:
313 return (double) 8;
314 case 0x1379:
315 return (double) 80;
316 case 0x2168:
317 case 0x2178:
318 case 0x3029:
319 case 0x3288:
320 return (double) 9;
321 case 0x137A:
322 return (double) 90;
323 default:
324 return (double) _PyUnicode_ToDigit(ch);
328 int _PyUnicode_IsNumeric(Py_UNICODE ch)
330 if (_PyUnicode_ToNumeric(ch) < 0.0)
331 return 0;
332 return 1;
335 #ifndef WANT_WCTYPE_FUNCTIONS
337 /* Returns 1 for Unicode characters having the bidirectional type
338 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
340 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
342 switch (ch) {
343 case 0x0009: /* HORIZONTAL TABULATION */
344 case 0x000A: /* LINE FEED */
345 case 0x000B: /* VERTICAL TABULATION */
346 case 0x000C: /* FORM FEED */
347 case 0x000D: /* CARRIAGE RETURN */
348 case 0x001C: /* FILE SEPARATOR */
349 case 0x001D: /* GROUP SEPARATOR */
350 case 0x001E: /* RECORD SEPARATOR */
351 case 0x001F: /* UNIT SEPARATOR */
352 case 0x0020: /* SPACE */
353 case 0x0085: /* NEXT LINE */
354 case 0x00A0: /* NO-BREAK SPACE */
355 case 0x1680: /* OGHAM SPACE MARK */
356 case 0x2000: /* EN QUAD */
357 case 0x2001: /* EM QUAD */
358 case 0x2002: /* EN SPACE */
359 case 0x2003: /* EM SPACE */
360 case 0x2004: /* THREE-PER-EM SPACE */
361 case 0x2005: /* FOUR-PER-EM SPACE */
362 case 0x2006: /* SIX-PER-EM SPACE */
363 case 0x2007: /* FIGURE SPACE */
364 case 0x2008: /* PUNCTUATION SPACE */
365 case 0x2009: /* THIN SPACE */
366 case 0x200A: /* HAIR SPACE */
367 case 0x200B: /* ZERO WIDTH SPACE */
368 case 0x2028: /* LINE SEPARATOR */
369 case 0x2029: /* PARAGRAPH SEPARATOR */
370 case 0x202F: /* NARROW NO-BREAK SPACE */
371 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
372 case 0x3000: /* IDEOGRAPHIC SPACE */
373 return 1;
374 default:
375 return 0;
379 /* Returns 1 for Unicode characters having the category 'Ll', 0
380 otherwise. */
382 int _PyUnicode_IsLowercase(Py_UNICODE ch)
384 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
386 return (ctype->flags & LOWER_MASK) != 0;
389 /* Returns 1 for Unicode characters having the category 'Lu', 0
390 otherwise. */
392 int _PyUnicode_IsUppercase(Py_UNICODE ch)
394 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
396 return (ctype->flags & UPPER_MASK) != 0;
399 /* Returns the uppercase Unicode characters corresponding to ch or just
400 ch if no uppercase mapping is known. */
402 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
404 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
405 int delta = ctype->upper;
406 if (delta >= 32768)
407 delta -= 65536;
408 return ch + delta;
411 /* Returns the lowercase Unicode characters corresponding to ch or just
412 ch if no lowercase mapping is known. */
414 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
416 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
417 int delta = ctype->lower;
418 if (delta >= 32768)
419 delta -= 65536;
420 return ch + delta;
423 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
424 'Lo' or 'Lm', 0 otherwise. */
426 int _PyUnicode_IsAlpha(Py_UNICODE ch)
428 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
430 return (ctype->flags & ALPHA_MASK) != 0;
433 #else
435 /* Export the interfaces using the wchar_t type for portability
436 reasons: */
438 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
440 return iswspace(ch);
443 int _PyUnicode_IsLowercase(Py_UNICODE ch)
445 return iswlower(ch);
448 int _PyUnicode_IsUppercase(Py_UNICODE ch)
450 return iswupper(ch);
453 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
455 return towlower(ch);
458 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
460 return towupper(ch);
463 int _PyUnicode_IsAlpha(Py_UNICODE ch)
465 return iswalpha(ch);
468 #endif