1 /* Skeleton for a conversion module.
2 Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
21 /* This file can be included to provide definitions of several things
22 many modules have in common. It can be customized using the following
25 DEFINE_INIT define the default initializer. This requires the
26 following symbol to be defined.
28 CHARSET_NAME string with official name of the coded character
31 DEFINE_FINI define the default destructor function.
33 MIN_NEEDED_FROM minimal number of bytes needed for the from-charset.
34 MIN_NEEDED_TO likewise for the to-charset.
36 MAX_NEEDED_FROM maximal number of bytes needed for the from-charset.
37 This macro is optional, it defaults to MIN_NEEDED_FROM.
38 MAX_NEEDED_TO likewise for the to-charset.
40 DEFINE_DIRECTION_OBJECTS
41 two objects will be defined to be used when the
42 `gconv' function must only distinguish two
43 directions. This is implied by DEFINE_INIT.
44 If this macro is not defined the following
45 macro must be available.
47 FROM_DIRECTION this macro is supposed to return a value != 0
48 if we convert from the current character set,
49 otherwise it returns 0.
51 EMIT_SHIFT_TO_INIT this symbol is optional. If it is defined it
52 defines some code which writes out a sequence
53 of characters which bring the current state into
56 FROM_LOOP name of the function implementing the conversion
57 from the current characters.
58 TO_LOOP likewise for the other direction
60 ONE_DIRECTION optional. If defined to 1, only one conversion
61 direction is defined instead of two. In this
62 case, FROM_DIRECTION should be defined to 1, and
63 FROM_LOOP and TO_LOOP should have the same value.
65 SAVE_RESET_STATE in case of an error we must reset the state for
66 the rerun so this macro must be defined for
67 stateful encodings. It takes an argument which
68 is nonzero when saving.
70 RESET_INPUT_BUFFER If the input character sets allow this the macro
71 can be defined to reset the input buffer pointers
72 to cover only those characters up to the error.
74 FUNCTION_NAME if not set the conversion function is named `gconv'.
76 PREPARE_LOOP optional code preparing the conversion loop. Can
77 contain variable definitions.
78 END_LOOP also optional, may be used to store information
80 EXTRA_LOOP_ARGS optional macro specifying extra arguments passed
96 # define DL_CALL_FCT(fct, args) fct args
99 /* The direction objects. */
100 #if DEFINE_DIRECTION_OBJECTS || DEFINE_INIT
/* Two marker objects.  Only their addresses are used: the default
   initializer stores the address of one of them in step->__data.  */
101 static int from_object
;
102 static int to_object
;
/* Default direction test: nonzero when step->__data points to
   from_object.  A module may supply its own FROM_DIRECTION instead.  */
104 # ifndef FROM_DIRECTION
105 # define FROM_DIRECTION (step->__data == &from_object)
/* NOTE(review): the directive separating the two cases is not visible
   here; per the error text below, this check applies when the direction
   objects are not used.  */
108 # ifndef FROM_DIRECTION
109 # error "FROM_DIRECTION must be provided if direction objects are not used"
114 /* How many bytes are needed at most for the from-charset. */
115 #ifndef MAX_NEEDED_FROM
116 # define MAX_NEEDED_FROM MIN_NEEDED_FROM
119 /* Same for the to-charset. */
120 #ifndef MAX_NEEDED_TO
121 # define MAX_NEEDED_TO MIN_NEEDED_TO
125 /* Define macros which can access unaligned buffers. These macros are
126 supposed to be used only in code outside the inner loops. For the inner
127 loops we have other definitions which allow optimized access. */
128 #ifdef _STRING_ARCH_unaligned
129 /* We can handle unaligned memory access. */
/* Direct (possibly unaligned) loads of a 16-bit and a 32-bit value from
   ADDR.  Each expansion is fully parenthesized so that it remains a
   single operand when used inside a larger expression.  */
# define get16u(addr) (*((__const uint16_t *) (addr)))
# define get32u(addr) (*((__const uint32_t *) (addr)))

/* We need no special support for writing values either.  The macros
   store VAL at ADDR and, like a plain assignment, yield the stored
   value; the parentheses keep `put16u (p, v) + 1' from storing v + 1.  */
# define put16u(addr, val) (*((uint16_t *) (addr)) = (val))
# define put32u(addr, val) (*((uint32_t *) (addr)) = (val))
137 /* Distinguish between big endian and little endian. */
138 # if __BYTE_ORDER == __LITTLE_ENDIAN
/* Little endian: assemble the value from single bytes, the least
   significant byte being stored first.  get16u yields an `int' in the
   range 0..0xffff.  get32u converts every byte to uint32_t before
   shifting, so a most significant byte >= 0x80 is never shifted into
   the sign bit of `int' (undefined behaviour), and the result has type
   uint32_t just like a direct 32-bit load.  ADDR is evaluated more than
   once and must not have side effects.  */
# define get16u(addr) \
  (((__const unsigned char *) (addr))[1] << 8 \
   | ((__const unsigned char *) (addr))[0])
# define get32u(addr) \
  ((uint32_t) ((__const unsigned char *) (addr))[3] << 24 \
   | (uint32_t) ((__const unsigned char *) (addr))[2] << 16 \
   | (uint32_t) ((__const unsigned char *) (addr))[1] << 8 \
   | (uint32_t) ((__const unsigned char *) (addr))[0])
/* Little endian: store VAL at ADDR one byte at a time, least significant
   byte first.  VAL is evaluated exactly once (it is copied into __val);
   ADDR is evaluated once per byte and must not have side effects.  Each
   byte gets its own shift of __val, and the statement expression ends in
   `(void) 0' so the macros yield no value.  */
# define put16u(addr, val) \
  ({ uint16_t __val = (val); \
     ((unsigned char *) (addr))[0] = __val; \
     ((unsigned char *) (addr))[1] = __val >> 8; \
     (void) 0; })
# define put32u(addr, val) \
  ({ uint32_t __val = (val); \
     ((unsigned char *) (addr))[0] = __val; \
     ((unsigned char *) (addr))[1] = __val >> 8; \
     ((unsigned char *) (addr))[2] = __val >> 16; \
     ((unsigned char *) (addr))[3] = __val >> 24; \
     (void) 0; })
/* Big endian: assemble the value from single bytes, the most significant
   byte being stored first.  get16u yields an `int' in the range
   0..0xffff.  get32u converts every byte to uint32_t before shifting, so
   a most significant byte >= 0x80 is never shifted into the sign bit of
   `int' (undefined behaviour), and the result has type uint32_t just
   like a direct 32-bit load.  ADDR is evaluated more than once and must
   not have side effects.  */
# define get16u(addr) \
  (((__const unsigned char *) (addr))[0] << 8 \
   | ((__const unsigned char *) (addr))[1])
# define get32u(addr) \
  ((uint32_t) ((__const unsigned char *) (addr))[0] << 24 \
   | (uint32_t) ((__const unsigned char *) (addr))[1] << 16 \
   | (uint32_t) ((__const unsigned char *) (addr))[2] << 8 \
   | (uint32_t) ((__const unsigned char *) (addr))[3])
/* Big endian: store VAL at ADDR one byte at a time, most significant
   byte first.  VAL is evaluated exactly once (it is copied into __val);
   ADDR is evaluated once per byte and must not have side effects.  Each
   byte gets its own shift of __val, and the statement expression ends in
   `(void) 0' so the macros yield no value.  */
# define put16u(addr, val) \
  ({ uint16_t __val = (val); \
     ((unsigned char *) (addr))[1] = __val; \
     ((unsigned char *) (addr))[0] = __val >> 8; \
     (void) 0; })
# define put32u(addr, val) \
  ({ uint32_t __val = (val); \
     ((unsigned char *) (addr))[3] = __val; \
     ((unsigned char *) (addr))[2] = __val >> 8; \
     ((unsigned char *) (addr))[1] = __val >> 16; \
     ((unsigned char *) (addr))[0] = __val >> 24; \
     (void) 0; })
192 /* For conversions from a fixed width character set to another fixed width
193 character set we can define RESET_INPUT_BUFFER in a very fast way. */
194 #if !defined RESET_INPUT_BUFFER && !defined SAVE_RESET_STATE
195 # if MIN_NEEDED_FROM == MAX_NEEDED_FROM && MIN_NEEDED_TO == MAX_NEEDED_TO
196 /* We have to use these `if's here since the compiler cannot know that
197 (outbuf - outerr) is always divisible by MIN_NEEDED_TO. */
/* Move *INPTRP back by the number of input bytes that correspond to the
   OUTBUF - OUTERR output bytes, using the fixed character widths
   MIN_NEEDED_FROM and MIN_NEEDED_TO.  The three adjustments are
   alternatives: exactly one of them runs, selected by conditions that
   are compile-time constants.  The expansion is a bare if/else chain
   without a trailing semicolon, so it must be used as a complete
   statement and not as the body of an unbraced `if'.  */
# define RESET_INPUT_BUFFER \
  if (MIN_NEEDED_FROM % MIN_NEEDED_TO == 0) \
    *inptrp -= (outbuf - outerr) * (MIN_NEEDED_FROM / MIN_NEEDED_TO); \
  else if (MIN_NEEDED_TO % MIN_NEEDED_FROM == 0) \
    *inptrp -= (outbuf - outerr) / (MIN_NEEDED_TO / MIN_NEEDED_FROM); \
  else \
    *inptrp -= ((outbuf - outerr) / MIN_NEEDED_TO) * MIN_NEEDED_FROM
209 /* The default init function. It simply matches the name and initializes
210 the step data to point to one of the objects above. */
212 # ifndef CHARSET_NAME
213 # error "CHARSET_NAME not defined"
/* Default initializer.  Compares the charset names recorded in STEP with
   CHARSET_NAME, stores the address of the matching direction object in
   step->__data and fills in the minimum/maximum byte counts for that
   direction.  It also sets step->__stateful, depending on whether
   SAVE_RESET_STATE is defined.  */
216 extern int gconv_init (struct __gconv_step
*step
);
218 gconv_init (struct __gconv_step
*step
)
220 /* Determine which direction. */
221 if (strcmp (step
->__from_name
, CHARSET_NAME
) == 0)
/* STEP converts from CHARSET_NAME: select the from-direction object.  */
223 step
->__data
= &from_object
;
/* In this direction the FROM/TO byte counts are used as defined.  */
225 step
->__min_needed_from
= MIN_NEEDED_FROM
;
226 step
->__max_needed_from
= MAX_NEEDED_FROM
;
227 step
->__min_needed_to
= MIN_NEEDED_TO
;
228 step
->__max_needed_to
= MAX_NEEDED_TO
;
230 else if (__builtin_expect (strcmp (step
->__to_name
, CHARSET_NAME
), 0) == 0)
/* STEP converts to CHARSET_NAME: select the to-direction object.  */
232 step
->__data
= &to_object
;
/* The input of this direction is the to-charset, so the FROM and TO
   byte counts are swapped.  */
234 step
->__min_needed_from
= MIN_NEEDED_TO
;
235 step
->__max_needed_from
= MAX_NEEDED_TO
;
236 step
->__min_needed_to
= MIN_NEEDED_FROM
;
237 step
->__max_needed_to
= MAX_NEEDED_FROM
;
/* NOTE(review): presumably reached only when neither name matches
   CHARSET_NAME -- the keyword introducing this branch is not visible
   here; confirm.  */
240 return __GCONV_NOCONV
;
/* A module that defines SAVE_RESET_STATE is marked as stateful; the
   assignment of 0 below is presumably the alternative for modules that
   do not define it (the separating directive is not visible here).  */
242 #ifdef SAVE_RESET_STATE
243 step
->__stateful
= 1;
245 step
->__stateful
= 0;
253 /* The default destructor function does nothing at the moment and so
254 we don't define it at all. But we still provide the macro just in
255 case we need it some day. */
260 /* If no arguments have to be passed to the loop function, define the macro as empty. */
262 #ifndef EXTRA_LOOP_ARGS
263 # define EXTRA_LOOP_ARGS
267 /* This is the actual conversion function. */
268 #ifndef FUNCTION_NAME
269 # define FUNCTION_NAME gconv
272 /* The macros are used to access the function to convert single characters. */
273 #define SINGLE(fct) SINGLE2 (fct)
274 #define SINGLE2(fct) fct##_single
277 extern int FUNCTION_NAME (struct __gconv_step
*step
,
278 struct __gconv_step_data
*data
,
279 const unsigned char **inptrp
,
280 const unsigned char *inend
,
281 unsigned char **outbufstart
, size_t *irreversible
,
282 int do_flush
, int consume_incomplete
);
284 FUNCTION_NAME (struct __gconv_step
*step
, struct __gconv_step_data
*data
,
285 const unsigned char **inptrp
, const unsigned char *inend
,
286 unsigned char **outbufstart
, size_t *irreversible
, int do_flush
,
287 int consume_incomplete
)
289 struct __gconv_step
*next_step
= step
+ 1;
290 struct __gconv_step_data
*next_data
= data
+ 1;
294 fct
= (data
->__flags
& __GCONV_IS_LAST
) ? NULL
: next_step
->__fct
;
296 /* If the function is called with no input this means we have to reset
297 to the initial state. The possibly partly converted input is
299 if (__builtin_expect (do_flush
, 0))
303 /* This should never happen during error handling. */
304 assert (outbufstart
== NULL
);
306 #ifdef EMIT_SHIFT_TO_INIT
307 /* Emit the escape sequence to reset the state. */
310 /* Clear the state object. There might be bytes in there from
311 previous calls with CONSUME_INCOMPLETE == 1. */
312 memset (data
->__statep
, '\0', sizeof (*data
->__statep
));
314 /* Call the steps down the chain if there are any but only if we
315 successfully emitted the escape sequence. This should only
316 fail if the output buffer is full. If the input is invalid
317 it should be discarded since the user wants to start from a
319 if (status
== __GCONV_OK
&& ! (data
->__flags
& __GCONV_IS_LAST
))
320 status
= DL_CALL_FCT (fct
, (next_step
, next_data
, NULL
, NULL
,
321 NULL
, irreversible
, 1,
322 consume_incomplete
));
326 /* We preserve the initial values of the pointer variables. */
327 const unsigned char *inptr
= *inptrp
;
328 unsigned char *outbuf
= (__builtin_expect (outbufstart
== NULL
, 1)
329 ? data
->__outbuf
: *outbufstart
);
330 unsigned char *outend
= data
->__outbufend
;
331 unsigned char *outstart
;
332 /* This variable is used to count the number of characters we
333 converted in an irreversible way. */
334 size_t lirreversible
= 0;
335 size_t *lirreversiblep
= irreversible
? &lirreversible
: NULL
;
336 #if defined _STRING_ARCH_unaligned \
337 || MIN_NEEDED_FROM == 1 || MAX_NEEDED_FROM % MIN_NEEDED_FROM != 0 \
338 || MIN_NEEDED_TO == 1 || MAX_NEEDED_TO % MIN_NEEDED_TO != 0
342 # define GEN_unaligned(name) GEN_unaligned2 (name)
343 # define GEN_unaligned2(name) name##_unaligned
350 #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1
351 /* If the function is used to implement the mb*towc*() or wc*tomb*()
352 functions we must test whether any bytes from the last call are
353 stored in the `state' object. */
354 if (((MAX_NEEDED_FROM
> 1 && MAX_NEEDED_TO
> 1)
355 || (MAX_NEEDED_FROM
> 1 && FROM_DIRECTION
)
356 || (MAX_NEEDED_TO
> 1 && !FROM_DIRECTION
))
357 && consume_incomplete
&& (data
->__statep
->__count
& 7) != 0)
359 /* Yep, we have some bytes left over. Process them now.
360 But this must not happen while we are called from an
362 assert (outbufstart
== NULL
);
364 # if MAX_NEEDED_FROM > 1
365 if (MAX_NEEDED_TO
== 1 || FROM_DIRECTION
)
366 status
= SINGLE(FROM_LOOP
) (step
, data
, inptrp
, inend
, &outbuf
,
367 outend
, lirreversiblep
370 # if MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1 && !ONE_DIRECTION
373 # if MAX_NEEDED_TO > 1 && !ONE_DIRECTION
374 status
= SINGLE(TO_LOOP
) (step
, data
, inptrp
, inend
, &outbuf
,
375 outend
, lirreversiblep EXTRA_LOOP_ARGS
);
378 if (__builtin_expect (status
, __GCONV_OK
) != __GCONV_OK
)
383 #if !defined _STRING_ARCH_unaligned \
384 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
385 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
386 /* The following assumes that encodings which have a variable length,
387 which might unalign a buffer even though it is aligned at the
388 beginning, either don't have the minimal number of bytes as a divisor
389 of the maximum length or have a minimum length of 1. This is true
390 for all known and supported encodings. */
391 unaligned
= ((FROM_DIRECTION
392 && ((uintptr_t) inptr
% MIN_NEEDED_FROM
!= 0
393 || ((data
->__flags
& __GCONV_IS_LAST
)
394 && (uintptr_t) outbuf
% MIN_NEEDED_TO
!= 0)))
396 && (((data
->__flags
& __GCONV_IS_LAST
)
397 && (uintptr_t) outbuf
% MIN_NEEDED_FROM
!= 0)
398 || (uintptr_t) inptr
% MIN_NEEDED_TO
!= 0)));
403 struct __gconv_trans_data
*trans
;
405 /* Remember the start value for this round. */
407 /* The outbuf buffer is empty. */
410 #ifdef SAVE_RESET_STATE
411 SAVE_RESET_STATE (1);
414 if (__builtin_expect (!unaligned
, 1))
417 /* Run the conversion loop. */
418 status
= FROM_LOOP (step
, data
, inptrp
, inend
, &outbuf
, outend
,
419 lirreversiblep EXTRA_LOOP_ARGS
);
421 /* Run the conversion loop. */
422 status
= TO_LOOP (step
, data
, inptrp
, inend
, &outbuf
, outend
,
423 lirreversiblep EXTRA_LOOP_ARGS
);
425 #if !defined _STRING_ARCH_unaligned \
426 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
427 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
431 /* Run the conversion loop. */
432 status
= GEN_unaligned (FROM_LOOP
) (step
, data
, inptrp
, inend
,
437 /* Run the conversion loop. */
438 status
= GEN_unaligned (TO_LOOP
) (step
, data
, inptrp
, inend
,
445 /* If we were called as part of an error handling module we
446 don't do anything else here. */
447 if (__builtin_expect (outbufstart
!= NULL
, 0))
449 *outbufstart
= outbuf
;
453 /* Give the transliteration module the chance to store the
454 original text and the result in case it needs a context. */
455 for (trans
= data
->__trans
; trans
!= NULL
; trans
= trans
->__next
)
456 if (trans
->__trans_context_fct
!= NULL
)
457 DL_CALL_FCT (trans
->__trans_context_fct
,
458 (trans
->__data
, inptr
, *inptrp
, outstart
, outbuf
));
460 /* We finished one use of the loops. */
461 ++data
->__invocation_counter
;
463 /* If this is the last step leave the loop, there is nothing
465 if (__builtin_expect (data
->__flags
& __GCONV_IS_LAST
, 0))
467 /* Store information about how many bytes are available. */
468 data
->__outbuf
= outbuf
;
470 /* Remember how many non-identical characters we
471 converted in an irreversible way. */
472 *irreversible
+= lirreversible
;
477 /* Write out all output which was produced. */
478 if (__builtin_expect (outbuf
> outstart
, 1))
480 const unsigned char *outerr
= data
->__outbuf
;
483 result
= DL_CALL_FCT (fct
, (next_step
, next_data
, &outerr
,
484 outbuf
, NULL
, irreversible
, 0,
485 consume_incomplete
));
487 if (result
!= __GCONV_EMPTY_INPUT
)
489 if (__builtin_expect (outerr
!= outbuf
, 0))
491 #ifdef RESET_INPUT_BUFFER
494 /* We have a problem in one of the functions
495 below. Undo the conversion up to the error point. */
498 /* Reload the pointers. */
502 /* Reset the state. */
503 # ifdef SAVE_RESET_STATE
504 SAVE_RESET_STATE (0);
507 if (__builtin_expect (!unaligned
, 1))
510 /* Run the conversion loop. */
511 nstatus
= FROM_LOOP (step
, data
, inptrp
, inend
,
516 /* Run the conversion loop. */
517 nstatus
= TO_LOOP (step
, data
, inptrp
, inend
,
522 # if !defined _STRING_ARCH_unaligned \
523 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
524 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
528 /* Run the conversion loop. */
529 nstatus
= GEN_unaligned (FROM_LOOP
) (step
, data
,
536 /* Run the conversion loop. */
537 nstatus
= GEN_unaligned (TO_LOOP
) (step
, data
,
545 /* We must run out of output buffer space in this
547 assert (outbuf
== outerr
);
548 assert (nstatus
== __GCONV_FULL_OUTPUT
);
550 /* If we haven't consumed a single byte decrement
551 the invocation counter. */
552 if (__builtin_expect (outbuf
== outstart
, 0))
553 --data
->__invocation_counter
;
554 #endif /* reset input buffer */
557 /* Change the status. */
561 /* All the output is consumed, we can make another run
562 if everything was ok. */
563 if (status
== __GCONV_FULL_OUTPUT
)
566 outbuf
= data
->__outbuf
;
570 if (status
!= __GCONV_OK
)
573 /* Reset the output buffer pointer for the next round. */
574 outbuf
= data
->__outbuf
;
581 /* If we are supposed to consume all characters, store all of the
582 remaining characters in the `state' object now. */
583 #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1
584 if (((MAX_NEEDED_FROM
> 1 && MAX_NEEDED_TO
> 1)
585 || (MAX_NEEDED_FROM
> 1 && FROM_DIRECTION
)
586 || (MAX_NEEDED_TO
> 1 && !FROM_DIRECTION
))
587 && __builtin_expect (consume_incomplete
, 0)
588 && status
== __GCONV_INCOMPLETE_INPUT
)
591 mbstate_t *state
= data
->__statep
;
597 /* Make sure the remaining bytes fit into the state objects
599 assert (inend
- *inptrp
< 4);
601 for (cnt
= 0; *inptrp
< inend
; ++cnt
)
602 data
->__statep
->__value
.__wchb
[cnt
] = *(*inptrp
)++;
603 data
->__statep
->__count
&= ~7;
604 data
->__statep
->__count
|= cnt
;
616 #undef MIN_NEEDED_FROM
618 #undef MAX_NEEDED_FROM
620 #undef DEFINE_DIRECTION_OBJECTS
621 #undef FROM_DIRECTION
622 #undef EMIT_SHIFT_TO_INIT
625 #undef SAVE_RESET_STATE
626 #undef RESET_INPUT_BUFFER