1 /* Skeleton for a conversion module.
2 Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 /* This file can be included to provide definitions of several things
22 many modules have in common. It can be customized using the following
25 DEFINE_INIT define the default initializer. This requires the
26 following symbol to be defined.
28 CHARSET_NAME string with official name of the coded character
31 DEFINE_FINI define the default destructor function.
33 MIN_NEEDED_FROM minimal number of bytes needed for the from-charset.
34 MIN_NEEDED_TO likewise for the to-charset.
36 MAX_NEEDED_FROM maximal number of bytes needed for the from-charset.
37 This macro is optional, it defaults to MIN_NEEDED_FROM.
38 MAX_NEEDED_TO likewise for the to-charset.
40 DEFINE_DIRECTION_OBJECTS
41 two objects will be defined to be used when the
42 `gconv' function must only distinguish two
43 directions. This is implied by DEFINE_INIT.
44 If this macro is not defined the following
45 macro must be available.
47 FROM_DIRECTION this macro is supposed to return a value != 0
48 if we convert from the current character set,
49 otherwise it returns 0.
51 EMIT_SHIFT_TO_INIT this symbol is optional. If it is defined it
52 defines some code which writes out a sequence
53 of characters which bring the current state into
56 FROM_LOOP name of the function implementing the conversion
57 from the current characters.
58 TO_LOOP likewise for the other direction
60 ONE_DIRECTION optional. If defined to 1, only one conversion
61 direction is defined instead of two. In this
62 case, FROM_DIRECTION should be defined to 1, and
63 FROM_LOOP and TO_LOOP should have the same value.
65 SAVE_RESET_STATE in case of an error we must reset the state for
66 the rerun so this macro must be defined for
67 stateful encodings. It takes an argument which
68 is nonzero when saving.
70 RESET_INPUT_BUFFER If the input character sets allow this the macro
71 can be defined to reset the input buffer pointers
72 to cover only those characters up to the error.
74 FUNCTION_NAME if not set the conversion function is named `gconv'.
76 PREPARE_LOOP optional code preparing the conversion loop. Can
77 contain variable definitions.
78 END_LOOP also optional, may be used to store information
80 EXTRA_LOOP_ARGS optional macro specifying extra arguments passed
96 # define DL_CALL_FCT(fct, args) fct args
99 /* The direction objects. */
100 #if DEFINE_DIRECTION_OBJECTS || DEFINE_INIT
101 static int from_object
;
102 static int to_object
;
104 # ifndef FROM_DIRECTION
105 # define FROM_DIRECTION (step->__data == &from_object)
108 # ifndef FROM_DIRECTION
109 # error "FROM_DIRECTION must be provided if direction objects are not used"
114 /* How many bytes are needed at most for the from-charset. */
115 #ifndef MAX_NEEDED_FROM
116 # define MAX_NEEDED_FROM MIN_NEEDED_FROM
119 /* Same for the to-charset. */
120 #ifndef MAX_NEEDED_TO
121 # define MAX_NEEDED_TO MIN_NEEDED_TO
125 /* Define macros which can access unaligned buffers. These macros are
126 supposed to be used only in code outside the inner loops. For the inner
127 loops we have other definitions which allow optimized access. */
128 #ifdef _STRING_ARCH_unaligned
129 /* We can handle unaligned memory access. */
130 # define get16u(addr) *((__const uint16_t *) (addr))
131 # define get32u(addr) *((__const uint32_t *) (addr))
133 /* We need no special support for writing values either. */
134 # define put16u(addr, val) *((uint16_t *) (addr)) = (val)
135 # define put32u(addr, val) *((uint32_t *) (addr)) = (val)
137 /* Distinguish between big endian and little endian. */
138 # if __BYTE_ORDER == __LITTLE_ENDIAN
139 # define get16u(addr) \
140 (((__const unsigned char *) (addr))[1] << 8 \
141 | ((__const unsigned char *) (addr))[0])
142 # define get32u(addr) \
143 (((((__const unsigned char *) (addr))[3] << 8 \
144 | ((__const unsigned char *) (addr))[2]) << 8 \
145 | ((__const unsigned char *) (addr))[1]) << 8 \
146 | ((__const unsigned char *) (addr))[0])
148 # define put16u(addr, val) \
149 ({ uint16_t __val = (val); \
150 ((unsigned char *) (addr))[0] = __val; \
151 ((unsigned char *) (addr))[1] = __val >> 8; \
153 # define put32u(addr, val) \
154 ({ uint32_t __val = (val); \
155 ((unsigned char *) (addr))[0] = __val; \
157 ((unsigned char *) (addr))[1] = __val; \
159 ((unsigned char *) (addr))[2] = __val; \
161 ((unsigned char *) (addr))[3] = __val; \
164 # define get16u(addr) \
165 (((__const unsigned char *) (addr))[0] << 8 \
166 | ((__const unsigned char *) (addr))[1])
167 # define get32u(addr) \
168 (((((__const unsigned char *) (addr))[0] << 8 \
169 | ((__const unsigned char *) (addr))[1]) << 8 \
170 | ((__const unsigned char *) (addr))[2]) << 8 \
171 | ((__const unsigned char *) (addr))[3])
173 # define put16u(addr, val) \
174 ({ uint16_t __val = (val); \
175 ((unsigned char *) (addr))[1] = __val; \
176 ((unsigned char *) (addr))[0] = __val >> 8; \
178 # define put32u(addr, val) \
179 ({ uint32_t __val = (val); \
180 ((unsigned char *) (addr))[3] = __val; \
182 ((unsigned char *) (addr))[2] = __val; \
184 ((unsigned char *) (addr))[1] = __val; \
186 ((unsigned char *) (addr))[0] = __val; \
192 /* For conversions from a fixed width character set to another fixed width
193 character set we can define RESET_INPUT_BUFFER in a very fast way. */
194 #if !defined RESET_INPUT_BUFFER && !defined SAVE_RESET_STATE
195 # if MIN_NEEDED_FROM == MAX_NEEDED_FROM && MIN_NEEDED_TO == MAX_NEEDED_TO
196 /* We have to use these `if's here since the compiler cannot know that
197 (outbuf - outerr) is always divisible by MIN_NEEDED_TO. */
198 # define RESET_INPUT_BUFFER \
199 if (MIN_NEEDED_FROM % MIN_NEEDED_TO == 0) \
200 *inptrp -= (outbuf - outerr) * (MIN_NEEDED_FROM / MIN_NEEDED_TO); \
201 else if (MIN_NEEDED_TO % MIN_NEEDED_FROM == 0) \
202 *inptrp -= (outbuf - outerr) / (MIN_NEEDED_TO / MIN_NEEDED_FROM); \
204 *inptrp -= ((outbuf - outerr) / MIN_NEEDED_TO) * MIN_NEEDED_FROM
209 /* The default init function. It simply matches the name and initializes
210 the step data to point to one of the objects above. */
212 # ifndef CHARSET_NAME
213 # error "CHARSET_NAME not defined"
/* Default initializer for a conversion step.  It compares the charset
   names stored in STEP against CHARSET_NAME, records the resulting
   direction in step->__data and fills in the minimum/maximum byte
   counts for the input and output side of the step.
   NOTE(review): this view of the function is missing lines (braces, the
   final else, the #else/#endif and the success return) -- confirm
   against the complete file before relying on the notes below.  */
216 extern int gconv_init (struct __gconv_step
*step
);
218 gconv_init (struct __gconv_step
*step
)
220 /* Determine which direction. */
/* CHARSET_NAME is the source of this step: tag the step with
   from_object and use the FROM limits for the input side and the TO
   limits for the output side.  */
221 if (strcmp (step
->__from_name
, CHARSET_NAME
) == 0)
223 step
->__data
= &from_object
;
225 step
->__min_needed_from
= MIN_NEEDED_FROM
;
226 step
->__max_needed_from
= MAX_NEEDED_FROM
;
227 step
->__min_needed_to
= MIN_NEEDED_TO
;
228 step
->__max_needed_to
= MAX_NEEDED_TO
;
/* CHARSET_NAME is the target of this step: tag the step with to_object.
   The limits are swapped relative to the branch above, because the
   step's input is now the TO side of the module.  */
230 else if (__builtin_expect (strcmp (step
->__to_name
, CHARSET_NAME
), 0) == 0)
232 step
->__data
= &to_object
;
234 step
->__min_needed_from
= MIN_NEEDED_TO
;
235 step
->__max_needed_from
= MAX_NEEDED_TO
;
236 step
->__min_needed_to
= MIN_NEEDED_FROM
;
237 step
->__max_needed_to
= MAX_NEEDED_FROM
;
/* Presumably reached only when neither name equals CHARSET_NAME (the
   guarding else is not visible here): hand back __GCONV_NOCONV.  */
240 return __GCONV_NOCONV
;
/* Mark the step as stateful when the including module defines
   SAVE_RESET_STATE.  The assignment of 0 presumably belongs to an #else
   branch that is not visible here -- TODO confirm.  */
242 #ifdef SAVE_RESET_STATE
243 step
->__stateful
= 1;
245 step
->__stateful
= 0;
253 /* The default destructor function does nothing at the moment and so
254 we don't define it at all. But we still provide the macro just in
255 case we need it some day. */
260 /* If no arguments have to be passed to the loop function, define the macro
262 #ifndef EXTRA_LOOP_ARGS
263 # define EXTRA_LOOP_ARGS
267 /* This is the actual conversion function. */
268 #ifndef FUNCTION_NAME
269 # define FUNCTION_NAME gconv
272 /* The macros are used to access the function to convert single characters. */
273 #define SINGLE(fct) SINGLE2 (fct)
274 #define SINGLE2(fct) fct##_single
277 extern int FUNCTION_NAME (struct __gconv_step
*step
,
278 struct __gconv_step_data
*data
,
279 const unsigned char **inptrp
,
280 const unsigned char *inend
,
281 unsigned char **outbufstart
, size_t *irreversible
,
282 int do_flush
, int consume_incomplete
);
284 FUNCTION_NAME (struct __gconv_step
*step
, struct __gconv_step_data
*data
,
285 const unsigned char **inptrp
, const unsigned char *inend
,
286 unsigned char **outbufstart
, size_t *irreversible
, int do_flush
,
287 int consume_incomplete
)
289 struct __gconv_step
*next_step
= step
+ 1;
290 struct __gconv_step_data
*next_data
= data
+ 1;
294 fct
= (data
->__flags
& __GCONV_IS_LAST
) ? NULL
: next_step
->__fct
;
296 /* If the function is called with no input this means we have to reset
297 to the initial state. The possibly partly converted input is
299 if (__builtin_expect (do_flush
, 0))
301 /* This should never happen during error handling. */
302 assert (outbufstart
== NULL
);
306 #ifdef EMIT_SHIFT_TO_INIT
309 /* We preserve the initial values of the pointer variables. */
310 unsigned char *outbuf
= data
->__outbuf
;
311 unsigned char *outstart
= outbuf
;
312 unsigned char *outend
= data
->__outbufend
;
318 # ifdef SAVE_RESET_STATE
319 SAVE_RESET_STATE (1);
322 /* Emit the escape sequence to reset the state. */
325 /* Call the steps down the chain if there are any but only if we
326 successfully emitted the escape sequence. This should only
327 fail if the output buffer is full. If the input is invalid
328 it should be discarded since the user wants to start from a
330 if (status
== __GCONV_OK
)
332 if (data
->__flags
& __GCONV_IS_LAST
)
333 /* Store information about how many bytes are available. */
334 data
->__outbuf
= outbuf
;
337 /* Write out all output which was produced. */
338 if (outbuf
> outstart
)
340 const unsigned char *outerr
= outstart
;
343 result
= DL_CALL_FCT (fct
, (next_step
, next_data
,
344 &outerr
, outbuf
, NULL
,
346 consume_incomplete
));
348 if (result
!= __GCONV_EMPTY_INPUT
)
350 if (__builtin_expect (outerr
!= outbuf
, 0))
352 /* We have a problem. Undo the conversion. */
355 /* Restore the state. */
356 # ifdef SAVE_RESET_STATE
357 SAVE_RESET_STATE (0);
361 /* Change the status. */
366 if (status
== __GCONV_OK
)
367 /* Now flush the remaining steps. */
368 status
= DL_CALL_FCT (fct
, (next_step
, next_data
, NULL
,
369 NULL
, NULL
, irreversible
, 1,
370 consume_incomplete
));
377 /* Clear the state object. There might be bytes in there from
378 previous calls with CONSUME_INCOMPLETE == 1. But don't emit
380 memset (data
->__statep
, '\0', sizeof (*data
->__statep
));
382 if (! (data
->__flags
& __GCONV_IS_LAST
))
383 /* Now flush the remaining steps. */
384 status
= DL_CALL_FCT (fct
, (next_step
, next_data
, NULL
, NULL
,
385 NULL
, irreversible
, do_flush
,
386 consume_incomplete
));
391 /* We preserve the initial values of the pointer variables. */
392 const unsigned char *inptr
= *inptrp
;
393 unsigned char *outbuf
= (__builtin_expect (outbufstart
== NULL
, 1)
394 ? data
->__outbuf
: *outbufstart
);
395 unsigned char *outend
= data
->__outbufend
;
396 unsigned char *outstart
;
397 /* This variable is used to count the number of characters we
398 actually converted. */
399 size_t lirreversible
= 0;
400 size_t *lirreversiblep
= irreversible
? &lirreversible
: NULL
;
401 #if defined _STRING_ARCH_unaligned \
402 || MIN_NEEDED_FROM == 1 || MAX_NEEDED_FROM % MIN_NEEDED_FROM != 0 \
403 || MIN_NEEDED_TO == 1 || MAX_NEEDED_TO % MIN_NEEDED_TO != 0
407 # define GEN_unaligned(name) GEN_unaligned2 (name)
408 # define GEN_unaligned2(name) name##_unaligned
415 #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1
416 /* If the function is used to implement the mb*towc*() or wc*tomb*()
417 functions we must test whether any bytes from the last call are
418 stored in the `state' object. */
419 if (((MAX_NEEDED_FROM
> 1 && MAX_NEEDED_TO
> 1)
420 || (MAX_NEEDED_FROM
> 1 && FROM_DIRECTION
)
421 || (MAX_NEEDED_TO
> 1 && !FROM_DIRECTION
))
422 && consume_incomplete
&& (data
->__statep
->__count
& 7) != 0)
424 /* Yep, we have some bytes left over. Process them now.
425 But this must not happen while we are called from an
427 assert (outbufstart
== NULL
);
429 # if MAX_NEEDED_FROM > 1
430 if (MAX_NEEDED_TO
== 1 || FROM_DIRECTION
)
431 status
= SINGLE(FROM_LOOP
) (step
, data
, inptrp
, inend
, &outbuf
,
432 outend
, lirreversiblep
435 # if MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1 && !ONE_DIRECTION
438 # if MAX_NEEDED_TO > 1 && !ONE_DIRECTION
439 status
= SINGLE(TO_LOOP
) (step
, data
, inptrp
, inend
, &outbuf
,
440 outend
, lirreversiblep EXTRA_LOOP_ARGS
);
443 if (__builtin_expect (status
, __GCONV_OK
) != __GCONV_OK
)
448 #if !defined _STRING_ARCH_unaligned \
449 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
450 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
451 /* The following assumes that encodings with a variable length,
452 which might unalign a buffer even though it is aligned in the
453 beginning, either don't have the minimal number of bytes as a divisor
454 of the maximum length or have a minimum length of 1. This is true
455 for all known and supported encodings. */
456 unaligned
= ((FROM_DIRECTION
457 && ((uintptr_t) inptr
% MIN_NEEDED_FROM
!= 0
458 || ((data
->__flags
& __GCONV_IS_LAST
)
459 && (uintptr_t) outbuf
% MIN_NEEDED_TO
!= 0)))
461 && (((data
->__flags
& __GCONV_IS_LAST
)
462 && (uintptr_t) outbuf
% MIN_NEEDED_FROM
!= 0)
463 || (uintptr_t) inptr
% MIN_NEEDED_TO
!= 0)));
468 struct __gconv_trans_data
*trans
;
470 /* Remember the start value for this round. */
472 /* The outbuf buffer is empty. */
475 #ifdef SAVE_RESET_STATE
476 SAVE_RESET_STATE (1);
479 if (__builtin_expect (!unaligned
, 1))
482 /* Run the conversion loop. */
483 status
= FROM_LOOP (step
, data
, inptrp
, inend
, &outbuf
, outend
,
484 lirreversiblep EXTRA_LOOP_ARGS
);
486 /* Run the conversion loop. */
487 status
= TO_LOOP (step
, data
, inptrp
, inend
, &outbuf
, outend
,
488 lirreversiblep EXTRA_LOOP_ARGS
);
490 #if !defined _STRING_ARCH_unaligned \
491 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
492 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
496 /* Run the conversion loop. */
497 status
= GEN_unaligned (FROM_LOOP
) (step
, data
, inptrp
, inend
,
502 /* Run the conversion loop. */
503 status
= GEN_unaligned (TO_LOOP
) (step
, data
, inptrp
, inend
,
510 /* If we were called as part of an error handling module we
511 don't do anything else here. */
512 if (__builtin_expect (outbufstart
!= NULL
, 0))
514 *outbufstart
= outbuf
;
518 /* Give the transliteration module the chance to store the
519 original text and the result in case it needs a context. */
520 for (trans
= data
->__trans
; trans
!= NULL
; trans
= trans
->__next
)
521 if (trans
->__trans_context_fct
!= NULL
)
522 DL_CALL_FCT (trans
->__trans_context_fct
,
523 (trans
->__data
, inptr
, *inptrp
, outstart
, outbuf
));
525 /* We finished one use of the loops. */
526 ++data
->__invocation_counter
;
528 /* If this is the last step leave the loop, there is nothing
530 if (__builtin_expect (data
->__flags
& __GCONV_IS_LAST
, 0))
532 /* Store information about how many bytes are available. */
533 data
->__outbuf
= outbuf
;
535 /* Remember how many non-identical characters we
536 converted in an irreversible way. */
537 *irreversible
+= lirreversible
;
542 /* Write out all output which was produced. */
543 if (__builtin_expect (outbuf
> outstart
, 1))
545 const unsigned char *outerr
= data
->__outbuf
;
548 result
= DL_CALL_FCT (fct
, (next_step
, next_data
, &outerr
,
549 outbuf
, NULL
, irreversible
, 0,
550 consume_incomplete
));
552 if (result
!= __GCONV_EMPTY_INPUT
)
554 if (__builtin_expect (outerr
!= outbuf
, 0))
556 #ifdef RESET_INPUT_BUFFER
559 /* We have a problem in one of the functions
560 below. Undo the conversion up to the error point. */
563 /* Reload the pointers. */
567 /* Restore the state. */
568 # ifdef SAVE_RESET_STATE
569 SAVE_RESET_STATE (0);
572 if (__builtin_expect (!unaligned
, 1))
575 /* Run the conversion loop. */
576 nstatus
= FROM_LOOP (step
, data
, inptrp
, inend
,
581 /* Run the conversion loop. */
582 nstatus
= TO_LOOP (step
, data
, inptrp
, inend
,
587 # if !defined _STRING_ARCH_unaligned \
588 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
589 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
593 /* Run the conversion loop. */
594 nstatus
= GEN_unaligned (FROM_LOOP
) (step
, data
,
601 /* Run the conversion loop. */
602 nstatus
= GEN_unaligned (TO_LOOP
) (step
, data
,
610 /* We must run out of output buffer space in this
612 assert (outbuf
== outerr
);
613 assert (nstatus
== __GCONV_FULL_OUTPUT
);
615 /* If we haven't consumed a single byte decrement
616 the invocation counter. */
617 if (__builtin_expect (outbuf
== outstart
, 0))
618 --data
->__invocation_counter
;
619 #endif /* reset input buffer */
622 /* Change the status. */
626 /* All the output is consumed, we can make another run
627 if everything was ok. */
628 if (status
== __GCONV_FULL_OUTPUT
)
631 outbuf
= data
->__outbuf
;
635 if (status
!= __GCONV_OK
)
638 /* Reset the output buffer pointer for the next round. */
639 outbuf
= data
->__outbuf
;
646 /* If we are supposed to consume all characters, store all of the
647 remaining characters in the `state' object now. */
648 #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1
649 if (((MAX_NEEDED_FROM
> 1 && MAX_NEEDED_TO
> 1)
650 || (MAX_NEEDED_FROM
> 1 && FROM_DIRECTION
)
651 || (MAX_NEEDED_TO
> 1 && !FROM_DIRECTION
))
652 && __builtin_expect (consume_incomplete
, 0)
653 && status
== __GCONV_INCOMPLETE_INPUT
)
656 mbstate_t *state
= data
->__statep
;
662 /* Make sure the remaining bytes fit into the state objects
664 assert (inend
- *inptrp
< 4);
666 for (cnt
= 0; *inptrp
< inend
; ++cnt
)
667 data
->__statep
->__value
.__wchb
[cnt
] = *(*inptrp
)++;
668 data
->__statep
->__count
&= ~7;
669 data
->__statep
->__count
|= cnt
;
681 #undef MIN_NEEDED_FROM
683 #undef MAX_NEEDED_FROM
685 #undef DEFINE_DIRECTION_OBJECTS
686 #undef FROM_DIRECTION
687 #undef EMIT_SHIFT_TO_INIT
690 #undef SAVE_RESET_STATE
691 #undef RESET_INPUT_BUFFER