tagged release 0.6.4
[parrot.git] / src / charset / iso-8859-1.c
blobd1fe487ee2ede9893ee0f8fc528bd48484b04f65
1 /*
2 Copyright (C) 2004-2007, The Perl Foundation.
3 $Id$
5 =head1 NAME
7 src/charset/iso-8859-1.c
9 =head1 DESCRIPTION
11 This file implements the charset functions for iso-8859-1 data
13 =over 4
15 =cut
19 #include "parrot/parrot.h"
20 #include "iso-8859-1.h"
21 #include "ascii.h"
23 /* HEADERIZER HFILE: src/charset/iso-8859-1.h */
25 /* HEADERIZER BEGIN: static */
26 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
28 PARROT_CANNOT_RETURN_NULL
29 PARROT_WARN_UNUSED_RESULT
30 static STRING* compose(PARROT_INTERP, ARGIN(STRING *src))
31 __attribute__nonnull__(1)
32 __attribute__nonnull__(2);
34 PARROT_CANNOT_RETURN_NULL
35 static STRING* decompose(PARROT_INTERP, SHIM(STRING *src))
36 __attribute__nonnull__(1);
38 static void downcase(PARROT_INTERP, ARGIN(STRING *source_string))
39 __attribute__nonnull__(1)
40 __attribute__nonnull__(2);
42 static void downcase_first(PARROT_INTERP, ARGIN(STRING *source_string))
43 __attribute__nonnull__(1)
44 __attribute__nonnull__(2);
46 static INTVAL find_cclass(PARROT_INTERP,
47 INTVAL flags,
48 ARGIN(STRING *source_string),
49 UINTVAL offset,
50 UINTVAL count)
51 __attribute__nonnull__(1)
52 __attribute__nonnull__(3);
54 static INTVAL find_not_cclass(PARROT_INTERP,
55 INTVAL flags,
56 ARGIN(STRING *source_string),
57 UINTVAL offset,
58 UINTVAL count)
59 __attribute__nonnull__(1)
60 __attribute__nonnull__(3);
62 static INTVAL is_cclass(PARROT_INTERP,
63 INTVAL flags,
64 ARGIN(const STRING *source_string),
65 UINTVAL offset)
66 __attribute__nonnull__(1)
67 __attribute__nonnull__(3);
69 static void set_graphemes(PARROT_INTERP,
70 ARGIN(STRING *source_string),
71 UINTVAL offset,
72 UINTVAL replace_count,
73 ARGMOD(STRING *insert_string))
74 __attribute__nonnull__(1)
75 __attribute__nonnull__(2)
76 __attribute__nonnull__(5)
77 FUNC_MODIFIES(*insert_string);
79 PARROT_CANNOT_RETURN_NULL
80 static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
81 __attribute__nonnull__(1);
83 static void titlecase(PARROT_INTERP, ARGIN(STRING *source_string))
84 __attribute__nonnull__(1)
85 __attribute__nonnull__(2);
87 static void titlecase_first(PARROT_INTERP, ARGIN(STRING *source_string))
88 __attribute__nonnull__(1)
89 __attribute__nonnull__(2);
91 PARROT_CANNOT_RETURN_NULL
92 PARROT_WARN_UNUSED_RESULT
93 static STRING * to_charset(PARROT_INTERP,
94 ARGIN(STRING *src),
95 ARGIN_NULLOK(STRING *dest))
96 __attribute__nonnull__(1)
97 __attribute__nonnull__(2);
99 PARROT_CANNOT_RETURN_NULL
100 static STRING * to_latin1(PARROT_INTERP,
101 ARGIN(STRING *src),
102 ARGMOD_NULLOK(STRING *dest))
103 __attribute__nonnull__(1)
104 __attribute__nonnull__(2);
106 PARROT_CANNOT_RETURN_NULL
107 static STRING * to_unicode(PARROT_INTERP,
108 ARGIN(STRING *src),
109 ARGMOD_NULLOK(STRING *dest))
110 __attribute__nonnull__(1)
111 __attribute__nonnull__(2);
113 static void upcase(PARROT_INTERP, ARGIN(STRING *source_string))
114 __attribute__nonnull__(1)
115 __attribute__nonnull__(2);
117 static void upcase_first(PARROT_INTERP, ARGIN(STRING *source_string))
118 __attribute__nonnull__(1)
119 __attribute__nonnull__(2);
121 static UINTVAL validate(PARROT_INTERP, ARGIN(STRING *src))
122 __attribute__nonnull__(1)
123 __attribute__nonnull__(2);
125 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
126 /* HEADERIZER END: static */
128 #include "tables.h"
132 =item C<static void set_graphemes>
134 RT#48260: Not yet documented!!!
136 =cut
140 static void
141 set_graphemes(PARROT_INTERP, ARGIN(STRING *source_string),
142 UINTVAL offset, UINTVAL replace_count, ARGMOD(STRING *insert_string))
144 ENCODING_SET_BYTES(interp, source_string, offset,
145 replace_count, insert_string);
150 =item C<static STRING * to_latin1>
152 RT#48260: Not yet documented!!!
154 =cut
158 PARROT_CANNOT_RETURN_NULL
159 static STRING *
160 to_latin1(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest))
162 UINTVAL offs, src_len;
163 String_iter iter;
165 ENCODING_ITER_INIT(interp, src, &iter);
166 src_len = src->strlen;
167 if (dest) {
168 Parrot_reallocate_string(interp, dest, src_len);
169 dest->strlen = src_len;
171 else {
172 /* latin1 is never bigger then source */
173 dest = src;
175 dest->bufused = src_len;
176 dest->charset = Parrot_iso_8859_1_charset_ptr;
177 dest->encoding = Parrot_fixed_8_encoding_ptr;
178 for (offs = 0; offs < src_len; ++offs) {
179 const UINTVAL c = iter.get_and_advance(interp, &iter);
180 if (c >= 0x100)
181 real_exception(interp, NULL, LOSSY_CONVERSION, "lossy conversion to ascii");
182 ENCODING_SET_BYTE(interp, dest, offs, c);
184 return dest;
189 =item C<static STRING * to_unicode>
191 RT#48260: Not yet documented!!!
193 =cut
197 PARROT_CANNOT_RETURN_NULL
198 static STRING *
199 to_unicode(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest))
201 if (dest) {
202 UINTVAL offs;
203 String_iter iter;
205 dest->charset = Parrot_unicode_charset_ptr;
206 dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest);
207 Parrot_reallocate_string(interp, dest, src->strlen);
208 ENCODING_ITER_INIT(interp, dest, &iter);
209 for (offs = 0; offs < src->strlen; ++offs) {
210 const UINTVAL c = ENCODING_GET_BYTE(interp, src, offs);
212 if (iter.bytepos >= PObj_buflen(dest) - 4) {
213 UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
214 if (need < 16)
215 need = 16;
216 Parrot_reallocate_string(interp, dest,
217 PObj_buflen(dest) + need);
219 iter.set_and_advance(interp, &iter, c);
221 dest->bufused = iter.bytepos;
222 dest->strlen = iter.charpos;
223 return dest;
225 real_exception(interp, NULL, UNIMPLEMENTED,
226 "to_unicode inplace for iso-8859-1 not implemented");
231 =item C<static STRING * to_charset>
233 RT#48260: Not yet documented!!!
235 =cut
239 PARROT_CANNOT_RETURN_NULL
240 PARROT_WARN_UNUSED_RESULT
241 static STRING *
242 to_charset(PARROT_INTERP, ARGIN(STRING *src), ARGIN_NULLOK(STRING *dest))
244 const charset_converter_t conversion_func =
245 Parrot_find_charset_converter(interp, src->charset, Parrot_iso_8859_1_charset_ptr);
247 if (conversion_func)
248 return conversion_func(interp, src, dest);
249 else
250 return to_latin1(interp, src, dest);
256 =item C<static STRING* compose>
258 RT#48260: Not yet documented!!!
260 =cut
264 /* A noop. can't compose iso-8859-1 */
265 PARROT_CANNOT_RETURN_NULL
266 PARROT_WARN_UNUSED_RESULT
267 static STRING*
268 compose(PARROT_INTERP, ARGIN(STRING *src))
270 return string_copy(interp, src);
275 =item C<static STRING* decompose>
277 RT#48260: Not yet documented!!!
279 =cut
283 PARROT_CANNOT_RETURN_NULL
284 static STRING*
285 decompose(PARROT_INTERP, SHIM(STRING *src))
287 real_exception(interp, NULL, UNIMPLEMENTED,
288 "decompose for iso-8859-1 not implemented");
293 =item C<static void upcase>
295 RT#48260: Not yet documented!!!
297 =cut
301 static void
302 upcase(PARROT_INTERP, ARGIN(STRING *source_string))
304 unsigned char *buffer;
305 UINTVAL offset = 0;
307 if (!source_string->strlen)
308 return;
310 Parrot_unmake_COW(interp, source_string);
311 buffer = (unsigned char *)source_string->strstart;
312 for (offset = 0; offset < source_string->strlen; offset++) {
313 unsigned int c = buffer[offset]; /* XXX use encoding ? */
314 if (c >= 0xe0 && c != 0xf7)
315 c &= ~0x20;
316 else
317 c = toupper((unsigned char)c);
318 buffer[offset] = (unsigned char)c;
324 =item C<static void downcase>
326 RT#48260: Not yet documented!!!
328 =cut
332 static void
333 downcase(PARROT_INTERP, ARGIN(STRING *source_string))
335 if (source_string->strlen) {
336 UINTVAL offset;
337 unsigned char *buffer;
339 Parrot_unmake_COW(interp, source_string);
340 buffer = (unsigned char *)source_string->strstart;
341 for (offset = 0; offset < source_string->strlen; offset++) {
342 unsigned int c = buffer[offset];
343 if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
344 c |= 0x20;
345 else
346 c = tolower((unsigned char)c);
347 buffer[offset] = (unsigned char)c;
354 =item C<static void titlecase>
356 RT#48260: Not yet documented!!!
358 =cut
362 static void
363 titlecase(PARROT_INTERP, ARGIN(STRING *source_string))
365 unsigned char *buffer;
366 unsigned int c;
367 UINTVAL offset;
369 if (!source_string->strlen)
370 return;
372 Parrot_unmake_COW(interp, source_string);
373 buffer = (unsigned char *)source_string->strstart;
374 c = buffer[0];
375 if (c >= 0xe0 && c != 0xf7)
376 c &= ~0x20;
377 else
378 c = toupper((unsigned char)c);
379 buffer[0] = (unsigned char)c;
381 for (offset = 1; offset < source_string->strlen; offset++) {
382 c = buffer[offset];
383 if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
384 c |= 0x20;
385 else
386 c = tolower((unsigned char)c);
387 buffer[offset] = (unsigned char)c;
393 =item C<static void upcase_first>
395 RT#48260: Not yet documented!!!
397 =cut
401 static void
402 upcase_first(PARROT_INTERP, ARGIN(STRING *source_string))
404 if (source_string->strlen) {
405 unsigned char *buffer;
406 unsigned int c;
408 Parrot_unmake_COW(interp, source_string);
409 buffer = (unsigned char *)source_string->strstart;
410 c = buffer[0];
411 if (c >= 0xe0 && c != 0xf7)
412 c &= ~0x20;
413 else
414 c = toupper((unsigned char)c);
415 buffer[0] = (unsigned char)c;
421 =item C<static void downcase_first>
423 RT#48260: Not yet documented!!!
425 =cut
429 static void
430 downcase_first(PARROT_INTERP, ARGIN(STRING *source_string))
432 if (source_string->strlen) {
433 unsigned char *buffer;
434 unsigned int c;
436 Parrot_unmake_COW(interp, source_string);
437 buffer = (unsigned char *)source_string->strstart;
438 c = buffer[0];
439 if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
440 c &= ~0x20;
441 else
442 c = tolower((unsigned char)c);
443 buffer[0] = (unsigned char)c;
449 =item C<static void titlecase_first>
451 RT#48260: Not yet documented!!!
453 =cut
457 static void
458 titlecase_first(PARROT_INTERP, ARGIN(STRING *source_string))
460 upcase_first(interp, source_string);
466 =item C<static UINTVAL validate>
468 RT#48260: Not yet documented!!!
470 =cut
474 static UINTVAL
475 validate(PARROT_INTERP, ARGIN(STRING *src))
477 UINTVAL offset;
479 for (offset = 0; offset < string_length(interp, src); ++offset) {
480 const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, src, offset);
481 if (codepoint >= 0x100)
482 return 0;
484 return 1;
489 =item C<static INTVAL is_cclass>
491 RT#48260: Not yet documented!!!
493 =cut
497 static INTVAL
498 is_cclass(PARROT_INTERP, INTVAL flags,
499 ARGIN(const STRING *source_string), UINTVAL offset)
501 UINTVAL codepoint;
503 if (offset >= source_string->strlen) return 0;
504 codepoint = ENCODING_GET_CODEPOINT(interp, source_string, offset);
506 if (codepoint >= sizeof (Parrot_ascii_typetable) /
507 sizeof (Parrot_ascii_typetable[0])) {
508 return 0;
510 return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
515 =item C<static INTVAL find_cclass>
517 RT#48260: Not yet documented!!!
519 =cut
523 static INTVAL
524 find_cclass(PARROT_INTERP, INTVAL flags,
525 ARGIN(STRING *source_string), UINTVAL offset, UINTVAL count)
527 UINTVAL pos = offset;
528 UINTVAL end = offset + count;
529 UINTVAL codepoint;
531 PARROT_ASSERT(source_string != 0);
532 end = source_string->strlen < end ? source_string->strlen : end;
533 for (; pos < end; ++pos) {
534 codepoint = ENCODING_GET_CODEPOINT(interp, source_string, pos);
535 if ((Parrot_iso_8859_1_typetable[codepoint] & flags) != 0) {
536 return pos;
539 return end;
544 =item C<static INTVAL find_not_cclass>
546 RT#48260: Not yet documented!!!
548 =cut
552 static INTVAL
553 find_not_cclass(PARROT_INTERP, INTVAL flags,
554 ARGIN(STRING *source_string), UINTVAL offset, UINTVAL count)
556 UINTVAL pos = offset;
557 UINTVAL end = offset + count;
559 PARROT_ASSERT(source_string);
560 end = source_string->strlen < end ? source_string->strlen : end;
561 for (; pos < end; ++pos) {
562 const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, source_string, pos);
563 if ((Parrot_iso_8859_1_typetable[codepoint] & flags) == 0) {
564 return pos;
567 return end;
573 =item C<static STRING * string_from_codepoint>
575 RT#48260: Not yet documented!!!
577 =cut
581 PARROT_CANNOT_RETURN_NULL
582 static STRING *
583 string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
585 char real_codepoint = (char)codepoint;
586 STRING * const return_string = string_make(interp, &real_codepoint, 1,
587 "iso-8859-1", 0);
588 return return_string;
593 =item C<const CHARSET * Parrot_charset_iso_8859_1_init>
595 RT#48260: Not yet documented!!!
597 =cut
601 PARROT_CANNOT_RETURN_NULL
602 const CHARSET *
603 Parrot_charset_iso_8859_1_init(PARROT_INTERP)
605 CHARSET * const return_set = Parrot_new_charset(interp);
606 static const CHARSET base_set = {
607 "iso-8859-1",
608 ascii_get_graphemes,
609 ascii_get_graphemes_inplace,
610 set_graphemes,
611 to_charset,
612 compose,
613 decompose,
614 upcase,
615 downcase,
616 titlecase,
617 upcase_first,
618 downcase_first,
619 titlecase_first,
620 ascii_compare,
621 ascii_cs_index,
622 ascii_cs_rindex,
623 validate,
624 is_cclass,
625 find_cclass,
626 find_not_cclass,
627 string_from_codepoint,
628 ascii_compute_hash,
629 NULL
632 STRUCT_COPY_FROM_STRUCT(return_set, base_set);
633 return_set->preferred_encoding = Parrot_fixed_8_encoding_ptr;
634 Parrot_register_charset(interp, "iso-8859-1", return_set);
635 return return_set;
640 =item C<STRING * charset_cvt_iso_8859_1_to_ascii>
642 RT#48260: Not yet documented!!!
644 =cut
648 PARROT_CANNOT_RETURN_NULL
649 PARROT_WARN_UNUSED_RESULT
650 STRING *
651 charset_cvt_iso_8859_1_to_ascii(PARROT_INTERP, ARGIN(STRING *src),
652 ARGMOD_NULLOK(STRING *dest))
654 UINTVAL offs;
655 if (dest) {
656 Parrot_reallocate_string(interp, dest, src->strlen);
657 dest->bufused = src->bufused;
658 dest->strlen = src->strlen;
660 for (offs = 0; offs < src->strlen; ++offs) {
661 UINTVAL c = ENCODING_GET_BYTE(interp, src, offs);
662 if (c >= 0x80)
663 real_exception(interp, NULL, LOSSY_CONVERSION, "lossy conversion to ascii");
664 if (dest)
665 ENCODING_SET_BYTE(interp, dest, offs, c);
667 if (dest)
668 return dest;
669 src->charset = Parrot_ascii_charset_ptr;
670 return src;
675 =back
677 =cut
683 * Local variables:
684 * c-file-style: "parrot"
685 * End:
686 * vim: expandtab shiftwidth=4: