1 /* Skeleton for a conversion module.
2 Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
/* This file can be included to provide definitions of several things
   many modules have in common.  It can be customized using the following
   macros:

     DEFINE_INIT        define the default initializer.  This requires the
                        following symbol to be defined.

     CHARSET_NAME       string with the official name of the coded character
                        set.

     DEFINE_FINI        define the default destructor function.

     MIN_NEEDED_FROM    minimal number of bytes needed for the from-charset.
     MIN_NEEDED_TO      likewise for the to-charset.

     MAX_NEEDED_FROM    maximal number of bytes needed for the from-charset.
                        This macro is optional, it defaults to MIN_NEEDED_FROM.
     MAX_NEEDED_TO      likewise for the to-charset.

     DEFINE_DIRECTION_OBJECTS
                        two objects will be defined to be used when the
                        `gconv' function must only distinguish two
                        directions.  This is implied by DEFINE_INIT.
                        If this macro is not defined the following
                        macro must be available.

     FROM_DIRECTION     this macro is supposed to return a value != 0
                        if we convert from the current character set,
                        otherwise it returns 0.

     EMIT_SHIFT_TO_INIT this symbol is optional.  If it is defined it
                        defines some code which writes out a sequence
                        of characters which bring the current state into
                        the initial state.

     FROM_LOOP          name of the function implementing the conversion
                        from the current characters.
     TO_LOOP            likewise for the other direction.

     SAVE_RESET_STATE   in case of an error we must reset the state for
                        the rerun so this macro must be defined for
                        stateful encodings.  It takes an argument which
                        is nonzero when saving.

     RESET_INPUT_BUFFER if the input character sets allow this the macro
                        can be defined to reset the input buffer pointers
                        to cover only those characters up to the error.

     FUNCTION_NAME      if not set the conversion function is named `gconv'.

     PREPARE_LOOP       optional code preparing the conversion loop.  Can
                        contain variable definitions.
     END_LOOP           also optional, may be used to store information.

     EXTRA_LOOP_ARGS    optional macro specifying extra arguments passed
                        to the loop functions.
 */
/* Calls to the next conversion step and to transliteration hooks go
   through DL_CALL_FCT so that an including module can route them through
   a wrapper of its own.  If none was provided the function is simply
   called directly: FCT is the function, ARGS its parenthesized argument
   list.  */
#ifndef DL_CALL_FCT
# define DL_CALL_FCT(fct, args) fct args
#endif
/* The direction objects.  Only their addresses matter: the default
   initializer stores the address of one of them in STEP->__data and
   FROM_DIRECTION later compares against it.  */
#if DEFINE_DIRECTION_OBJECTS || DEFINE_INIT
static int from_object;
static int to_object;

# ifndef FROM_DIRECTION
#  define FROM_DIRECTION (step->__data == &from_object)
# endif
#else
/* Without the direction objects the including module has to tell us
   itself which direction a step converts in.  */
# ifndef FROM_DIRECTION
#  error "FROM_DIRECTION must be provided if direction objects are not used"
# endif
#endif
/* How many bytes are needed at most for the from-charset.  Unless the
   including module says otherwise the character set is assumed to be of
   fixed width, i.e. the maximum equals the minimum.  */
#ifndef MAX_NEEDED_FROM
# define MAX_NEEDED_FROM	MIN_NEEDED_FROM
#endif

/* Same for the to-charset.  */
#ifndef MAX_NEEDED_TO
# define MAX_NEEDED_TO		MIN_NEEDED_TO
#endif
/* Define macros which can access unaligned buffers.  These macros are
   supposed to be used only in code outside the inner loops.  For the inner
   loops we have other definitions which allow optimized access.

   get16u/get32u read a 16/32-bit value in host byte order from ADDR,
   put16u/put32u store VAL at ADDR.  ADDR is evaluated more than once by
   the byte-wise variants, so it must not have side effects.  */
#ifdef _STRING_ARCH_unaligned
/* We can handle unaligned memory access.  */
# define get16u(addr) *((uint16_t *) (addr))
# define get32u(addr) *((uint32_t *) (addr))

/* We need no special support for writing values either.  */
# define put16u(addr, val) *((uint16_t *) (addr)) = (val)
# define put32u(addr, val) *((uint32_t *) (addr)) = (val)
#else
/* Assemble and split the values byte by byte.
   Distinguish between big endian and little endian.  */
# if __BYTE_ORDER == __LITTLE_ENDIAN
#  define get16u(addr) \
     (((__const unsigned char *) (addr))[1] << 8			      \
      | ((__const unsigned char *) (addr))[0])
/* The most significant byte is widened to uint32_t first so that the
   shifts never move a bit into the sign position of an `int'; the result
   has the same type as in the unaligned-capable variant above.  */
#  define get32u(addr) \
     ((((uint32_t) ((__const unsigned char *) (addr))[3] << 8		      \
	| ((__const unsigned char *) (addr))[2]) << 8			      \
       | ((__const unsigned char *) (addr))[1]) << 8			      \
      | ((__const unsigned char *) (addr))[0])

#  define put16u(addr, val) \
     ({ uint16_t __val = (val);						      \
	((unsigned char *) (addr))[0] = __val;				      \
	((unsigned char *) (addr))[1] = __val >> 8;			      \
	(void) 0; })
/* Each store writes the current low byte; the value is shifted down by
   one byte between the stores.  */
#  define put32u(addr, val) \
     ({ uint32_t __val = (val);						      \
	((unsigned char *) (addr))[0] = __val;				      \
	__val >>= 8;							      \
	((unsigned char *) (addr))[1] = __val;				      \
	__val >>= 8;							      \
	((unsigned char *) (addr))[2] = __val;				      \
	__val >>= 8;							      \
	((unsigned char *) (addr))[3] = __val;				      \
	(void) 0; })
# else
#  define get16u(addr) \
     (((__const unsigned char *) (addr))[0] << 8			      \
      | ((__const unsigned char *) (addr))[1])
#  define get32u(addr) \
     ((((uint32_t) ((__const unsigned char *) (addr))[0] << 8		      \
	| ((__const unsigned char *) (addr))[1]) << 8			      \
       | ((__const unsigned char *) (addr))[2]) << 8			      \
      | ((__const unsigned char *) (addr))[3])

#  define put16u(addr, val) \
     ({ uint16_t __val = (val);						      \
	((unsigned char *) (addr))[1] = __val;				      \
	((unsigned char *) (addr))[0] = __val >> 8;			      \
	(void) 0; })
#  define put32u(addr, val) \
     ({ uint32_t __val = (val);						      \
	((unsigned char *) (addr))[3] = __val;				      \
	__val >>= 8;							      \
	((unsigned char *) (addr))[2] = __val;				      \
	__val >>= 8;							      \
	((unsigned char *) (addr))[1] = __val;				      \
	__val >>= 8;							      \
	((unsigned char *) (addr))[0] = __val;				      \
	(void) 0; })
# endif
#endif
/* For conversions from a fixed width character set to another fixed width
   character set we can define RESET_INPUT_BUFFER if necessary.

   The macro expects `inptrp', `outbuf' and `outerr' to be in scope.  It
   moves *INPTRP back by the amount of input which corresponds to the
   output between OUTERR and OUTBUF, i.e. the output the next step did
   not accept.  Exactly one of the three adjustments is applied.  */
#if !defined RESET_INPUT_BUFFER && !defined SAVE_RESET_STATE
# if MIN_NEEDED_FROM == MAX_NEEDED_FROM && MIN_NEEDED_TO == MAX_NEEDED_TO
/* We have to use these `if's here since the compiler cannot know that
   (outbuf - outerr) is always divisible by MIN_NEEDED_TO.  */
#  define RESET_INPUT_BUFFER \
  if (MIN_NEEDED_FROM % MIN_NEEDED_TO == 0)				      \
    *inptrp -= (outbuf - outerr) * (MIN_NEEDED_FROM / MIN_NEEDED_TO);	      \
  else if (MIN_NEEDED_TO % MIN_NEEDED_FROM == 0)			      \
    *inptrp -= (outbuf - outerr) / (MIN_NEEDED_TO / MIN_NEEDED_FROM);	      \
  else									      \
    *inptrp -= ((outbuf - outerr) / MIN_NEEDED_TO) * MIN_NEEDED_FROM
# endif
#endif
/* The default init function.  It simply matches the name and initializes
   the step data to point to one of the direction objects.

   STEP is the conversion step to set up.  If its source character set is
   CHARSET_NAME the step converts away from this character set and the
   byte requirements are stored as given; if its destination is
   CHARSET_NAME the from/to requirements are swapped.  In any other case
   nothing is stored and __GCONV_NOCONV is returned.  On success the
   function returns __GCONV_OK.  */
#if DEFINE_INIT
# ifndef CHARSET_NAME
#  error "CHARSET_NAME not defined"
# endif

int
gconv_init (struct __gconv_step *step)
{
  /* Determine which direction.  */
  if (strcmp (step->__from_name, CHARSET_NAME) == 0)
    {
      step->__data = &from_object;

      step->__min_needed_from = MIN_NEEDED_FROM;
      step->__max_needed_from = MAX_NEEDED_FROM;
      step->__min_needed_to = MIN_NEEDED_TO;
      step->__max_needed_to = MAX_NEEDED_TO;
    }
  else if (__builtin_expect (strcmp (step->__to_name, CHARSET_NAME), 0) == 0)
    {
      step->__data = &to_object;

      /* Seen from this step the roles of the two character sets are
	 exchanged.  */
      step->__min_needed_from = MIN_NEEDED_TO;
      step->__max_needed_from = MAX_NEEDED_TO;
      step->__min_needed_to = MIN_NEEDED_FROM;
      step->__max_needed_to = MAX_NEEDED_FROM;
    }
  else
    return __GCONV_NOCONV;

  /* A module which has to save and restore conversion state is stateful.
     NOTE(review): SAVE_RESET_STATE is the state macro the conversion
     function in this file tests; confirm it is the intended selector.  */
# ifdef SAVE_RESET_STATE
  step->__stateful = 1;
# else
  step->__stateful = 0;
# endif

  return __GCONV_OK;
}
#endif
/* The default destructor function does nothing at the moment and so
   we do not define it at all.  But we still provide the macro just in
   case we need it some day.  */
/* If no arguments have to be passed to the loop function define the macro
   as empty.  When a module does define it, the text is appended directly
   after the last regular argument of every loop call, so it has to start
   with a comma.  */
#ifndef EXTRA_LOOP_ARGS
# define EXTRA_LOOP_ARGS
#endif
/* This is the actual conversion function.  Unless the including module
   picked a name for it, it is called `gconv'.  */
#ifndef FUNCTION_NAME
# define FUNCTION_NAME	gconv
#endif
/* The macros are used to access the function to convert single characters:
   SINGLE (name) yields `name_single'.  The indirection through SINGLE2
   makes sure a macro passed as argument (FROM_LOOP, TO_LOOP) is expanded
   before the suffix is pasted on.  */
#define SINGLE(fct) SINGLE2 (fct)
#define SINGLE2(fct) fct##_single
272 FUNCTION_NAME (struct __gconv_step
*step
, struct __gconv_step_data
*data
,
273 const unsigned char **inptrp
, const unsigned char *inend
,
274 unsigned char **outbufstart
, size_t *irreversible
, int do_flush
,
275 int consume_incomplete
)
277 struct __gconv_step
*next_step
= step
+ 1;
278 struct __gconv_step_data
*next_data
= data
+ 1;
282 fct
= (data
->__flags
& __GCONV_IS_LAST
) ? NULL
: next_step
->__fct
;
284 /* If the function is called with no input this means we have to reset
285 to the initial state. The possibly partly converted input is
287 if (__builtin_expect (do_flush
, 0))
291 /* This should never happen during error handling. */
292 assert (outbufstart
== NULL
);
294 #ifdef EMIT_SHIFT_TO_INIT
295 /* Emit the escape sequence to reset the state. */
298 /* Clear the state object. There might be bytes in there from
299 previous calls with CONSUME_INCOMPLETE == 1. */
300 memset (data
->__statep
, '\0', sizeof (*data
->__statep
));
302 /* Call the steps down the chain if there are any but only if we
303 successfully emitted the escape sequence. This should only
304 fail if the output buffer is full. If the input is invalid
305 it should be discarded since the user wants to start from a
307 if (status
== __GCONV_OK
&& ! (data
->__flags
& __GCONV_IS_LAST
))
308 status
= DL_CALL_FCT (fct
, (next_step
, next_data
, NULL
, NULL
,
309 NULL
, irreversible
, 1,
310 consume_incomplete
));
314 /* We preserve the initial values of the pointer variables. */
315 const unsigned char *inptr
= *inptrp
;
316 unsigned char *outbuf
= (__builtin_expect (outbufstart
== NULL
, 1)
317 ? data
->__outbuf
: *outbufstart
);
318 unsigned char *outend
= data
->__outbufend
;
319 unsigned char *outstart
;
320 /* This variable is used to count the number of characters we
321 actually converted. */
322 size_t lirreversible
= 0;
323 #if defined _STRING_ARCH_unaligned \
324 || MIN_NEEDED_FROM == 1 || MAX_NEEDED_FROM % MIN_NEEDED_FROM != 0 \
325 || MIN_NEEDED_TO == 1 || MAX_NEEDED_TO % MIN_NEEDED_TO != 0
329 # define GEN_unaligned(name) GEN_unaligned2 (name)
330 # define GEN_unaligned2(name) name##_unaligned
337 #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1
338 /* If the function is used to implement the mb*towc*() or wc*tomb*()
339 functions we must test whether any bytes from the last call are
340 stored in the `state' object. */
341 if (((MAX_NEEDED_FROM
> 1 && FROM_DIRECTION
)
342 || (MAX_NEEDED_TO
> 1 && !FROM_DIRECTION
))
343 && consume_incomplete
&& (data
->__statep
->__count
& 7) != 0)
345 /* Yep, we have some bytes left over. Process them now.
346 But this must not happen while we are called from an
348 assert (outbufstart
== NULL
);
350 # if MAX_NEEDED_FROM > 1
351 if (MAX_NEEDED_TO
== 1 || FROM_DIRECTION
)
352 status
= SINGLE(FROM_LOOP
) (step
, data
, inptrp
, inend
, &outbuf
,
353 outend
, &lirreversible
356 # if MAX_NEEDED_FROM > 1 && MAX_NEEDED_TO > 1 && !ONE_DIRECTION
359 # if MAX_NEEDED_TO > 1 && !ONE_DIRECTION
360 status
= SINGLE(TO_LOOP
) (step
, data
, inptrp
, inend
, &outbuf
,
361 outend
, &lirreversible EXTRA_LOOP_ARGS
);
364 if (__builtin_expect (status
, __GCONV_OK
) != __GCONV_OK
)
369 #if !defined _STRING_ARCH_unaligned \
370 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
371 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
372 /* The following assumes that encodings, which have a variable length
373 what might unalign a buffer even though it is a aligned in the
374 beginning, either don't have the minimal number of bytes as a divisor
375 of the maximum length or have a minimum length of 1. This is true
376 for all known and supported encodings. */
377 unaligned
= ((FROM_DIRECTION
378 && ((uintptr_t) inptr
% MIN_NEEDED_FROM
!= 0
379 || ((data
->__flags
& __GCONV_IS_LAST
)
380 && (uintptr_t) outbuf
% MIN_NEEDED_TO
!= 0)))
382 && (((data
->__flags
& __GCONV_IS_LAST
)
383 && (uintptr_t) outbuf
% MIN_NEEDED_FROM
!= 0)
384 || (uintptr_t) inptr
% MIN_NEEDED_TO
!= 0)));
389 struct __gconv_trans_data
*trans
;
391 /* Remember the start value for this round. */
393 /* The outbuf buffer is empty. */
396 #ifdef SAVE_RESET_STATE
397 SAVE_RESET_STATE (1);
400 if (__builtin_expect (!unaligned
, 1))
403 /* Run the conversion loop. */
404 status
= FROM_LOOP (step
, data
, inptrp
, inend
, &outbuf
, outend
,
405 &lirreversible EXTRA_LOOP_ARGS
);
407 /* Run the conversion loop. */
408 status
= TO_LOOP (step
, data
, inptrp
, inend
, &outbuf
, outend
,
409 &lirreversible EXTRA_LOOP_ARGS
);
411 #if !defined _STRING_ARCH_unaligned \
412 && MIN_NEEDED_FROM != 1 && MAX_NEEDED_FROM % MIN_NEEDED_FROM == 0 \
413 && MIN_NEEDED_TO != 1 && MAX_NEEDED_TO % MIN_NEEDED_TO == 0
417 /* Run the conversion loop. */
418 status
= GEN_unaligned (FROM_LOOP
) (step
, data
, inptrp
, inend
,
423 /* Run the conversion loop. */
424 status
= GEN_unaligned (TO_LOOP
) (step
, data
, inptrp
, inend
,
431 /* If we were called as part of an error handling module we
432 don't do anything else here. */
433 if (__builtin_expect (outbufstart
!= NULL
, 0))
435 *outbufstart
= outbuf
;
439 /* Give the transliteration module the chance to store the
440 original text and the result in case it needs a context. */
441 for (trans
= data
->__trans
; trans
!= NULL
; trans
= trans
->__next
)
442 if (trans
->__trans_context_fct
!= NULL
)
443 DL_CALL_FCT (trans
->__trans_context_fct
,
444 (trans
->__data
, inptr
, *inptrp
, outstart
, outbuf
));
446 /* We finished one use of the loops. */
447 ++data
->__invocation_counter
;
449 /* If this is the last step leave the loop, there is nothing
451 if (__builtin_expect (data
->__flags
& __GCONV_IS_LAST
, 0))
453 /* Store information about how many bytes are available. */
454 data
->__outbuf
= outbuf
;
456 /* Remember how many non-identical characters we
457 converted in a irreversible way. */
458 *irreversible
+= lirreversible
;
463 /* Write out all output which was produced. */
464 if (__builtin_expect (outbuf
> outstart
, 1))
466 const unsigned char *outerr
= data
->__outbuf
;
469 result
= DL_CALL_FCT (fct
, (next_step
, next_data
, &outerr
,
470 outbuf
, NULL
, irreversible
, 0,
471 consume_incomplete
));
473 if (result
!= __GCONV_EMPTY_INPUT
)
475 if (__builtin_expect (outerr
!= outbuf
, 0))
477 #ifdef RESET_INPUT_BUFFER
480 /* We have a problem with the in on of the functions
481 below. Undo the conversion upto the error point. */
484 /* Reload the pointers. */
488 /* Reset the state. */
489 # ifdef SAVE_RESET_STATE
490 SAVE_RESET_STATE (0);
493 /* XXX Handle unaligned access here as well. */
495 /* Run the conversion loop. */
496 nstatus
= FROM_LOOP (step
, data
,
497 (const unsigned char **) inptrp
,
498 (const unsigned char *) inend
,
499 (unsigned char **) &outbuf
,
500 (unsigned char *) outerr
,
501 &lirreversible EXTRA_LOOP_ARGS
);
503 /* Run the conversion loop. */
504 nstatus
= TO_LOOP (step
, data
,
505 (const unsigned char **) inptrp
,
506 (const unsigned char *) inend
,
507 (unsigned char **) &outbuf
,
508 (unsigned char *) outerr
,
509 &lirreversible EXTRA_LOOP_ARGS
);
511 /* We must run out of output buffer space in this
513 assert (outbuf
== outerr
);
514 assert (nstatus
== __GCONV_FULL_OUTPUT
);
516 /* If we haven't consumed a single byte decrement
517 the invocation counter. */
518 if (__builtin_expect (outbuf
== outstart
, 0))
519 --data
->__invocation_counter
;
520 #endif /* reset input buffer */
523 /* Change the status. */
527 /* All the output is consumed, we can make another run
528 if everything was ok. */
529 if (status
== __GCONV_FULL_OUTPUT
)
533 while (status
== __GCONV_OK
);
539 /* If we are supposed to consume all character store now all of the
540 remaining characters in the `state' object. */
541 #if MAX_NEEDED_FROM > 1 || MAX_NEEDED_TO > 1
542 if (((MAX_NEEDED_FROM
> 1 && FROM_DIRECTION
)
543 || (MAX_NEEDED_TO
> 1 && !FROM_DIRECTION
))
544 && __builtin_expect (consume_incomplete
, 0)
545 && status
== __GCONV_INCOMPLETE_INPUT
)
548 mbstate_t *state
= data
->__statep
;
554 /* Make sure the remaining bytes fit into the state objects
556 assert (inend
- *inptrp
< 4);
558 for (cnt
= 0; *inptrp
< inend
; ++cnt
)
559 data
->__statep
->__value
.__wchb
[cnt
] = *(*inptrp
)++;
560 data
->__statep
->__count
&= ~7;
561 data
->__statep
->__count
|= cnt
;
/* Remove every customization macro again.  This file is meant to be
   included, possibly more than once per translation unit; a macro left
   over from one inclusion would otherwise defeat the `#ifndef' defaults
   of the next one.  */
#undef DEFINE_INIT
#undef CHARSET_NAME
#undef DEFINE_FINI
#undef MIN_NEEDED_FROM
#undef MIN_NEEDED_TO
#undef MAX_NEEDED_FROM
#undef MAX_NEEDED_TO
#undef DEFINE_DIRECTION_OBJECTS
#undef FROM_DIRECTION
#undef EMIT_SHIFT_TO_INIT
#undef FROM_LOOP
#undef TO_LOOP
#undef SAVE_RESET_STATE
#undef RESET_INPUT_BUFFER
#undef FUNCTION_NAME
#undef PREPARE_LOOP
#undef END_LOOP
#undef ONE_DIRECTION
#undef STORE_REST