2 Copyright (C) 2004-2007, The Perl Foundation.
7 src/charset/iso-8859-1.c
11 This file implements the charset functions for iso-8859-1 data
19 #include "parrot/parrot.h"
20 #include "iso-8859-1.h"
23 /* HEADERIZER HFILE: src/charset/iso-8859-1.h */
25 /* HEADERIZER BEGIN: static */
26 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
28 PARROT_CANNOT_RETURN_NULL
29 PARROT_WARN_UNUSED_RESULT
30 static STRING
* compose(PARROT_INTERP
, ARGIN(STRING
*src
))
31 __attribute__nonnull__(1)
32 __attribute__nonnull__(2);
34 PARROT_CANNOT_RETURN_NULL
35 static STRING
* decompose(PARROT_INTERP
, SHIM(STRING
*src
))
36 __attribute__nonnull__(1);
38 static void downcase(PARROT_INTERP
, ARGIN(STRING
*source_string
))
39 __attribute__nonnull__(1)
40 __attribute__nonnull__(2);
42 static void downcase_first(PARROT_INTERP
, ARGIN(STRING
*source_string
))
43 __attribute__nonnull__(1)
44 __attribute__nonnull__(2);
46 static INTVAL
find_cclass(PARROT_INTERP
,
48 ARGIN(STRING
*source_string
),
51 __attribute__nonnull__(1)
52 __attribute__nonnull__(3);
54 static INTVAL
find_not_cclass(PARROT_INTERP
,
56 ARGIN(STRING
*source_string
),
59 __attribute__nonnull__(1)
60 __attribute__nonnull__(3);
62 static INTVAL
is_cclass(PARROT_INTERP
,
64 ARGIN(const STRING
*source_string
),
66 __attribute__nonnull__(1)
67 __attribute__nonnull__(3);
69 static void set_graphemes(PARROT_INTERP
,
70 ARGIN(STRING
*source_string
),
72 UINTVAL replace_count
,
73 ARGMOD(STRING
*insert_string
))
74 __attribute__nonnull__(1)
75 __attribute__nonnull__(2)
76 __attribute__nonnull__(5)
77 FUNC_MODIFIES(*insert_string
);
79 PARROT_CANNOT_RETURN_NULL
80 static STRING
* string_from_codepoint(PARROT_INTERP
, UINTVAL codepoint
)
81 __attribute__nonnull__(1);
83 static void titlecase(PARROT_INTERP
, ARGIN(STRING
*source_string
))
84 __attribute__nonnull__(1)
85 __attribute__nonnull__(2);
87 static void titlecase_first(PARROT_INTERP
, ARGIN(STRING
*source_string
))
88 __attribute__nonnull__(1)
89 __attribute__nonnull__(2);
91 PARROT_CANNOT_RETURN_NULL
92 PARROT_WARN_UNUSED_RESULT
93 static STRING
* to_charset(PARROT_INTERP
,
95 ARGIN_NULLOK(STRING
*dest
))
96 __attribute__nonnull__(1)
97 __attribute__nonnull__(2);
99 PARROT_CANNOT_RETURN_NULL
100 static STRING
* to_latin1(PARROT_INTERP
,
102 ARGMOD_NULLOK(STRING
*dest
))
103 __attribute__nonnull__(1)
104 __attribute__nonnull__(2);
106 PARROT_CANNOT_RETURN_NULL
107 static STRING
* to_unicode(PARROT_INTERP
,
109 ARGMOD_NULLOK(STRING
*dest
))
110 __attribute__nonnull__(1)
111 __attribute__nonnull__(2);
113 static void upcase(PARROT_INTERP
, ARGIN(STRING
*source_string
))
114 __attribute__nonnull__(1)
115 __attribute__nonnull__(2);
117 static void upcase_first(PARROT_INTERP
, ARGIN(STRING
*source_string
))
118 __attribute__nonnull__(1)
119 __attribute__nonnull__(2);
121 static UINTVAL
validate(PARROT_INTERP
, ARGIN(STRING
*src
))
122 __attribute__nonnull__(1)
123 __attribute__nonnull__(2);
125 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
126 /* HEADERIZER END: static */
132 =item C<static void set_graphemes>
134 RT#48260: Not yet documented!!!
/*
 * set_graphemes: replaces part of source_string by handing offset,
 * replace_count and insert_string straight to ENCODING_SET_BYTES.
 * No other work is visible in this function.
 * NOTE(review): the return-type line and the braces are absent from this
 * extracted listing; the prototype above declares it "static void".
 */
141 set_graphemes(PARROT_INTERP
, ARGIN(STRING
*source_string
),
142 UINTVAL offset
, UINTVAL replace_count
, ARGMOD(STRING
*insert_string
))
144 ENCODING_SET_BYTES(interp
, source_string
, offset
,
145 replace_count
, insert_string
);
150 =item C<static STRING * to_latin1>
152 RT#48260: Not yet documented!!!
158 PARROT_CANNOT_RETURN_NULL
160 to_latin1(PARROT_INTERP
, ARGIN(STRING
*src
), ARGMOD_NULLOK(STRING
*dest
))
162 UINTVAL offs
, src_len
;
165 ENCODING_ITER_INIT(interp
, src
, &iter
);
166 src_len
= src
->strlen
;
168 Parrot_reallocate_string(interp
, dest
, src_len
);
169 dest
->strlen
= src_len
;
172 /* latin1 is never bigger then source */
175 dest
->bufused
= src_len
;
176 dest
->charset
= Parrot_iso_8859_1_charset_ptr
;
177 dest
->encoding
= Parrot_fixed_8_encoding_ptr
;
178 for (offs
= 0; offs
< src_len
; ++offs
) {
179 const UINTVAL c
= iter
.get_and_advance(interp
, &iter
);
181 real_exception(interp
, NULL
, LOSSY_CONVERSION
, "lossy conversion to ascii");
182 ENCODING_SET_BYTE(interp
, dest
, offs
, c
);
189 =item C<static STRING * to_unicode>
191 RT#48260: Not yet documented!!!
197 PARROT_CANNOT_RETURN_NULL
199 to_unicode(PARROT_INTERP
, ARGIN(STRING
*src
), ARGMOD_NULLOK(STRING
*dest
))
205 dest
->charset
= Parrot_unicode_charset_ptr
;
206 dest
->encoding
= CHARSET_GET_PREFERRED_ENCODING(interp
, dest
);
207 Parrot_reallocate_string(interp
, dest
, src
->strlen
);
208 ENCODING_ITER_INIT(interp
, dest
, &iter
);
209 for (offs
= 0; offs
< src
->strlen
; ++offs
) {
210 const UINTVAL c
= ENCODING_GET_BYTE(interp
, src
, offs
);
212 if (iter
.bytepos
>= PObj_buflen(dest
) - 4) {
213 UINTVAL need
= (UINTVAL
)((src
->strlen
- offs
) * 1.5);
216 Parrot_reallocate_string(interp
, dest
,
217 PObj_buflen(dest
) + need
);
219 iter
.set_and_advance(interp
, &iter
, c
);
221 dest
->bufused
= iter
.bytepos
;
222 dest
->strlen
= iter
.charpos
;
225 real_exception(interp
, NULL
, UNIMPLEMENTED
,
226 "to_unicode inplace for iso-8859-1 not implemented");
231 =item C<static STRING * to_charset>
233 RT#48260: Not yet documented!!!
/*
 * to_charset: convert src to iso-8859-1.
 * Asks Parrot_find_charset_converter for a converter from src->charset to
 * Parrot_iso_8859_1_charset_ptr.  Two return paths are visible: one calls
 * that converter with (interp, src, dest), the other calls to_latin1 with
 * the same arguments.
 * NOTE(review): the condition choosing between the two returns is absent
 * from this extracted listing -- presumably the lookup result is tested for
 * NULL and to_latin1 is the fallback; confirm.
 */
239 PARROT_CANNOT_RETURN_NULL
240 PARROT_WARN_UNUSED_RESULT
242 to_charset(PARROT_INTERP
, ARGIN(STRING
*src
), ARGIN_NULLOK(STRING
*dest
))
244 const charset_converter_t conversion_func
=
245 Parrot_find_charset_converter(interp
, src
->charset
, Parrot_iso_8859_1_charset_ptr
);
248 return conversion_func(interp
, src
, dest
);
250 return to_latin1(interp
, src
, dest
);
256 =item C<static STRING* compose>
258 RT#48260: Not yet documented!!!
264 /* A noop. can't compose iso-8859-1 */
/*
 * compose: returns string_copy(interp, src); src itself is not modified by
 * any statement visible here.
 */
265 PARROT_CANNOT_RETURN_NULL
266 PARROT_WARN_UNUSED_RESULT
268 compose(PARROT_INTERP
, ARGIN(STRING
*src
))
270 return string_copy(interp
, src
);
275 =item C<static STRING* decompose>
277 RT#48260: Not yet documented!!!
/*
 * decompose: not implemented for this charset.  The only visible statement
 * raises UNIMPLEMENTED through real_exception; src is declared SHIM and is
 * not referenced.
 * NOTE(review): no return statement is visible although the function is
 * declared to return STRING * -- presumably real_exception does not return;
 * confirm.
 */
283 PARROT_CANNOT_RETURN_NULL
285 decompose(PARROT_INTERP
, SHIM(STRING
*src
))
287 real_exception(interp
, NULL
, UNIMPLEMENTED
,
288 "decompose for iso-8859-1 not implemented");
293 =item C<static void upcase>
295 RT#48260: Not yet documented!!!
302 upcase(PARROT_INTERP
, ARGIN(STRING
*source_string
))
304 unsigned char *buffer
;
307 if (!source_string
->strlen
)
310 Parrot_unmake_COW(interp
, source_string
);
311 buffer
= (unsigned char *)source_string
->strstart
;
312 for (offset
= 0; offset
< source_string
->strlen
; offset
++) {
313 unsigned int c
= buffer
[offset
]; /* XXX use encoding ? */
314 if (c
>= 0xe0 && c
!= 0xf7)
317 c
= toupper((unsigned char)c
);
318 buffer
[offset
] = (unsigned char)c
;
324 =item C<static void downcase>
326 RT#48260: Not yet documented!!!
/*
 * downcase: lower-cases source_string in place, one byte at a time.
 * Nothing happens when strlen is zero.  Otherwise Parrot_unmake_COW is
 * called first and the bytes are rewritten through strstart.
 * The visible test singles out bytes 0xC0..0xDE other than 0xD7; tolower()
 * is applied on the remaining visible path.
 * NOTE(review): the statement guarded by that test and the keyword before
 * the tolower() line are absent from this extracted listing -- presumably
 * "c |= 0x20; else"; confirm.
 */
333 downcase(PARROT_INTERP
, ARGIN(STRING
*source_string
))
335 if (source_string
->strlen
) {
337 unsigned char *buffer
;
339 Parrot_unmake_COW(interp
, source_string
);
340 buffer
= (unsigned char *)source_string
->strstart
;
341 for (offset
= 0; offset
< source_string
->strlen
; offset
++) {
342 unsigned int c
= buffer
[offset
];
343 if (c
>= 0xc0 && c
!= 0xd7 && c
<= 0xde)
346 c
= tolower((unsigned char)c
);
347 buffer
[offset
] = (unsigned char)c
;
354 =item C<static void titlecase>
356 RT#48260: Not yet documented!!!
363 titlecase(PARROT_INTERP
, ARGIN(STRING
*source_string
))
365 unsigned char *buffer
;
369 if (!source_string
->strlen
)
372 Parrot_unmake_COW(interp
, source_string
);
373 buffer
= (unsigned char *)source_string
->strstart
;
375 if (c
>= 0xe0 && c
!= 0xf7)
378 c
= toupper((unsigned char)c
);
379 buffer
[0] = (unsigned char)c
;
381 for (offset
= 1; offset
< source_string
->strlen
; offset
++) {
383 if (c
>= 0xc0 && c
!= 0xd7 && c
<= 0xde)
386 c
= tolower((unsigned char)c
);
387 buffer
[offset
] = (unsigned char)c
;
393 =item C<static void upcase_first>
395 RT#48260: Not yet documented!!!
402 upcase_first(PARROT_INTERP
, ARGIN(STRING
*source_string
))
404 if (source_string
->strlen
) {
405 unsigned char *buffer
;
408 Parrot_unmake_COW(interp
, source_string
);
409 buffer
= (unsigned char *)source_string
->strstart
;
411 if (c
>= 0xe0 && c
!= 0xf7)
414 c
= toupper((unsigned char)c
);
415 buffer
[0] = (unsigned char)c
;
421 =item C<static void downcase_first>
423 RT#48260: Not yet documented!!!
/*
 * downcase_first: lower-cases only byte 0 of source_string, in place.
 * Nothing happens when strlen is zero.  Otherwise Parrot_unmake_COW is
 * called first and the result is stored back to buffer[0].
 * The visible test singles out bytes 0xC0..0xDE other than 0xD7; tolower()
 * is applied on the remaining visible path.
 * NOTE(review): the declaration and load of c, the statement guarded by
 * the test and the keyword before tolower() are absent from this extracted
 * listing -- presumably "c = buffer[0];" and "c |= 0x20; else"; confirm.
 */
430 downcase_first(PARROT_INTERP
, ARGIN(STRING
*source_string
))
432 if (source_string
->strlen
) {
433 unsigned char *buffer
;
436 Parrot_unmake_COW(interp
, source_string
);
437 buffer
= (unsigned char *)source_string
->strstart
;
439 if (c
>= 0xc0 && c
!= 0xd7 && c
<= 0xde)
442 c
= tolower((unsigned char)c
);
443 buffer
[0] = (unsigned char)c
;
449 =item C<static void titlecase_first>
451 RT#48260: Not yet documented!!!
/*
 * titlecase_first: delegates to upcase_first(interp, source_string); no
 * other statement is visible, so the two behave the same for this charset.
 */
458 titlecase_first(PARROT_INTERP
, ARGIN(STRING
*source_string
))
460 upcase_first(interp
, source_string
);
466 =item C<static UINTVAL validate>
468 RT#48260: Not yet documented!!!
/*
 * validate: scans src position by position, reading each codepoint with
 * ENCODING_GET_CODEPOINT and comparing it against 0x100.
 * NOTE(review): the return statements are absent from this extracted
 * listing -- presumably 0 as soon as a codepoint >= 0x100 is seen and 1
 * after a clean scan; confirm.
 * NOTE(review): string_length(interp, src) sits in the loop condition and
 * is therefore re-evaluated on every iteration.
 */
475 validate(PARROT_INTERP
, ARGIN(STRING
*src
))
479 for (offset
= 0; offset
< string_length(interp
, src
); ++offset
) {
480 const UINTVAL codepoint
= ENCODING_GET_CODEPOINT(interp
, src
, offset
);
481 if (codepoint
>= 0x100)
489 =item C<static INTVAL is_cclass>
491 RT#48260: Not yet documented!!!
498 is_cclass(PARROT_INTERP
, INTVAL flags
,
499 ARGIN(const STRING
*source_string
), UINTVAL offset
)
503 if (offset
>= source_string
->strlen
) return 0;
504 codepoint
= ENCODING_GET_CODEPOINT(interp
, source_string
, offset
);
506 if (codepoint
>= sizeof (Parrot_ascii_typetable
) /
507 sizeof (Parrot_ascii_typetable
[0])) {
510 return (Parrot_iso_8859_1_typetable
[codepoint
] & flags
) ? 1 : 0;
515 =item C<static INTVAL find_cclass>
517 RT#48260: Not yet documented!!!
524 find_cclass(PARROT_INTERP
, INTVAL flags
,
525 ARGIN(STRING
*source_string
), UINTVAL offset
, UINTVAL count
)
527 UINTVAL pos
= offset
;
528 UINTVAL end
= offset
+ count
;
531 PARROT_ASSERT(source_string
!= 0);
532 end
= source_string
->strlen
< end
? source_string
->strlen
: end
;
533 for (; pos
< end
; ++pos
) {
534 codepoint
= ENCODING_GET_CODEPOINT(interp
, source_string
, pos
);
535 if ((Parrot_iso_8859_1_typetable
[codepoint
] & flags
) != 0) {
544 =item C<static INTVAL find_not_cclass>
546 RT#48260: Not yet documented!!!
553 find_not_cclass(PARROT_INTERP
, INTVAL flags
,
554 ARGIN(STRING
*source_string
), UINTVAL offset
, UINTVAL count
)
556 UINTVAL pos
= offset
;
557 UINTVAL end
= offset
+ count
;
559 PARROT_ASSERT(source_string
);
560 end
= source_string
->strlen
< end
? source_string
->strlen
: end
;
561 for (; pos
< end
; ++pos
) {
562 const UINTVAL codepoint
= ENCODING_GET_CODEPOINT(interp
, source_string
, pos
);
563 if ((Parrot_iso_8859_1_typetable
[codepoint
] & flags
) == 0) {
573 =item C<static STRING * string_from_codepoint>
575 RT#48260: Not yet documented!!!
/*
 * string_from_codepoint: builds a one-character string.
 * The codepoint is narrowed to a char by a cast, and the address of that
 * char is passed with length 1 to string_make; the resulting STRING is
 * returned.
 * NOTE(review): the cast discards any bits of codepoint that do not fit in
 * a char -- no range check is visible here.
 * NOTE(review): the remaining string_make arguments are absent from this
 * extracted listing (the call is cut off after the length).
 */
581 PARROT_CANNOT_RETURN_NULL
583 string_from_codepoint(PARROT_INTERP
, UINTVAL codepoint
)
585 char real_codepoint
= (char)codepoint
;
586 STRING
* const return_string
= string_make(interp
, &real_codepoint
, 1,
588 return return_string
;
593 =item C<const CHARSET * Parrot_charset_iso_8859_1_init>
595 RT#48260: Not yet documented!!!
/*
 * Parrot_charset_iso_8859_1_init: creates and registers the iso-8859-1
 * charset.
 * A CHARSET is obtained from Parrot_new_charset, the static template
 * base_set is copied into it with STRUCT_COPY_FROM_STRUCT, its
 * preferred_encoding is set to Parrot_fixed_8_encoding_ptr, and it is
 * registered under the name "iso-8859-1" via Parrot_register_charset.
 * NOTE(review): only two entries of the base_set initializer
 * (ascii_get_graphemes_inplace, string_from_codepoint) survive in this
 * extracted listing, and the return statement is absent -- presumably
 * return_set is returned; confirm.
 */
601 PARROT_CANNOT_RETURN_NULL
603 Parrot_charset_iso_8859_1_init(PARROT_INTERP
)
605 CHARSET
* const return_set
= Parrot_new_charset(interp
);
606 static const CHARSET base_set
= {
609 ascii_get_graphemes_inplace
,
627 string_from_codepoint
,
632 STRUCT_COPY_FROM_STRUCT(return_set
, base_set
);
633 return_set
->preferred_encoding
= Parrot_fixed_8_encoding_ptr
;
634 Parrot_register_charset(interp
, "iso-8859-1", return_set
);
640 =item C<STRING * charset_cvt_iso_8859_1_to_ascii>
642 RT#48260: Not yet documented!!!
648 PARROT_CANNOT_RETURN_NULL
649 PARROT_WARN_UNUSED_RESULT
651 charset_cvt_iso_8859_1_to_ascii(PARROT_INTERP
, ARGIN(STRING
*src
),
652 ARGMOD_NULLOK(STRING
*dest
))
656 Parrot_reallocate_string(interp
, dest
, src
->strlen
);
657 dest
->bufused
= src
->bufused
;
658 dest
->strlen
= src
->strlen
;
660 for (offs
= 0; offs
< src
->strlen
; ++offs
) {
661 UINTVAL c
= ENCODING_GET_BYTE(interp
, src
, offs
);
663 real_exception(interp
, NULL
, LOSSY_CONVERSION
, "lossy conversion to ascii");
665 ENCODING_SET_BYTE(interp
, dest
, offs
, c
);
669 src
->charset
= Parrot_ascii_charset_ptr
;
684 * c-file-style: "parrot"
686 * vim: expandtab shiftwidth=4: