* new version 2.19.9999
[alpine.git] / pith / charconv / utf8.c
blobd2f34e61655e20d06f80eb458cc7ecbd1e4ec01d
/* Revision-control identification string embedded in the object file. */
#if	!defined(lint) && !defined(DOS)
static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
#endif
5 /*
6 * ========================================================================
7 * Copyright 2006-2008 University of Washington
8 * Copyright 2013-2015 Eduardo Chappa
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
14 * http://www.apache.org/licenses/LICENSE-2.0
16 * ========================================================================
20 /* includable WITHOUT dependency on c-client */
21 #include "../../c-client/mail.h"
22 #include "../../c-client/utf8.h"
24 #ifdef _WINDOWS
25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
26 #undef ERROR
27 #endif
29 #include <system.h>
31 #include "../../c-client/fs.h"
33 /* includable WITHOUT dependency on pico */
34 #include "../../pico/keydefs.h"
36 #include "../osdep/collate.h"
37 #include "../filttype.h"
39 #include "utf8.h"
41 #include <stdarg.h>
44 unsigned single_width_chars_a_to_b(UCS *, int, int);
47 static char locale_charmap[50];
49 static int native_utf8;
50 static void *display_data;
52 void
53 init_utf8_display(int utf8, void *rmap)
55 native_utf8 = utf8;
56 display_data = rmap;
61 * Argument is a UCS-4 wide character.
62 * Returns the environment dependent cell width of the
63 * character when printed to the screen.
64 * This will be -1 if the character is not printable.
65 * It will be >= zero if it is printable.
67 * Note that in the case it is not printable but it is still sent to
68 * Writechar, Writechar will print a '?' with width 1.
70 int
71 wcellwidth(UCS ucs)
73 char dummy[32];
74 long w;
77 * We believe that on modern unix systems wchar_t is a UCS-4 character.
78 * That's the assumption here.
81 if(native_utf8){ /* display is UTF-8 capable */
82 w = ucs4_width((unsigned long) ucs);
83 return((w & U4W_ERROR) ? -1 : w);
85 else if(display_data){
86 if(wtomb(dummy, ucs) < 0)
87 return(-1);
88 else{
89 w = ucs4_width((unsigned long) ucs);
90 return((w & U4W_ERROR) ? -1 : w);
93 #ifndef _WINDOWS
94 else
95 return(wcwidth((wchar_t) ucs));
96 #else
97 return(0);
98 #endif
103 * Argument is a UCS-4 wide character.
104 * It is converted to the multibyte version (for example UTF8 or EUC-JP).
105 * Dest is a buffer at least xx chars wide where the multi-byte version
106 * of the wide character will be written.
107 * The returned value is the number of bytes written to dest or -1
108 * if the conversion can't be done.
111 wtomb(char *dest, UCS ucs)
114 * We believe that on modern unix systems wchar_t is a UCS-4 character.
115 * That's the assumption here.
118 if(native_utf8){
119 unsigned char *newdptr;
121 newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
122 return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
124 else if(display_data){
125 unsigned long ucs4;
126 int ret;
128 ucs4 = (unsigned long) ucs;
129 ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
130 if(ret >= 0)
131 ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
132 else
133 ret = -1;
135 return(ret);
137 else
138 return(wcrtomb(dest, (wchar_t) ucs, NULL));
143 * This function does not necessarily update inputp and remaining_octets, so
144 * don't rely on that. The c-client version does but the other doesn't.
147 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
149 UCS ucs;
151 if(input_cs){
152 CHARSET *cast_input_cs;
154 cast_input_cs = (CHARSET *) input_cs;
156 switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
157 case U8G_ENDSTRG:
158 case U8G_ENDSTRI:
159 return(CCONV_NEEDMORE);
161 default:
162 if(ucs & U8G_ERROR || ucs == UBOGON)
163 return(CCONV_BADCHAR);
165 return(ucs);
168 else{
169 size_t ret;
170 wchar_t w;
173 * Warning: input_cs and remaining_octets are unused in this
174 * half of the if/else.
176 * Unfortunately, we can't tell the difference between a source string
177 * that is just not long enough and one that has characters that can't
178 * be converted even though it is long enough. We return NEEDMORE in both cases.
180 ret = mbstowcs(&w, (char *) (*inputp), 1);
181 if(ret == (size_t)(-1))
182 return(CCONV_NEEDMORE);
183 else{
184 ucs = (UCS) w;
185 return(ucs);
191 void
192 set_locale_charmap(char *charmap)
194 if(charmap){
195 strncpy(locale_charmap, charmap, sizeof(locale_charmap));
196 locale_charmap[sizeof(locale_charmap)-1] = '\0';
198 else
199 locale_charmap[0] = '\0';
204 * This ensures that the string is UTF-8. If str is already a UTF-8 string,
205 * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
206 * The caller is responsible for freeing the returned value.
208 * Args str -- the string to convert
210 char *
211 convert_to_utf8(char *str, char *fromcharset, int flags)
213 char *ret = NULL;
214 char *fcharset;
215 SIZEDTEXT src, result;
216 const CHARSET *cs;
217 int try;
219 src.data = (unsigned char *) str;
220 src.size = strlen(str);
222 /* already UTF-8, return NULL */
223 if(!(flags & CU8_NOINFER)
224 && (cs = utf8_infercharset(&src))
225 && (cs->type == CT_ASCII || cs->type == CT_UTF8))
226 return(ret);
228 try = 1;
229 while(try < 5){
230 switch(try){
231 case 1:
232 fcharset = fromcharset;
233 if(fcharset && strucmp("UTF-8", fcharset) != 0)
234 break; /* give it a try */
235 else
236 try++; /* fall through */
238 case 2:
239 if(!(flags & CU8_NOINFER)){
240 fcharset = cs ? cs->name : NULL;
241 if(fcharset && strucmp("UTF-8", fcharset) != 0)
242 break;
243 else
244 try++; /* fall through */
246 else
247 try++; /* fall through */
249 case 3:
250 fcharset = locale_charmap;
251 if(fcharset && strucmp("UTF-8", fcharset) != 0)
252 break;
253 else
254 try++; /* fall through */
256 default:
257 fcharset = "ISO-8859-1"; /* this will "work" */
258 break;
261 memset(&result, 0, sizeof(result));
263 if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
264 if(!(result.size == src.size && result.data == src.data)){
265 ret = (char *) fs_get((result.size+1) * sizeof(char));
266 strncpy(ret, (char *) result.data, result.size);
267 ret[result.size] = '\0';
269 /* else no conversion necessary */
271 return(ret);
274 try++;
277 /* won't make it to here */
278 return(ret);
283 * Convert from UTF-8 to user's locale charset.
284 * This actually uses the wtomb routine to do the conversion, and that
285 * relies on setup_for_input_output having been called.
286 * If no conversion is necessary, NULL is returned, otherwise an allocated
287 * string in the locale charset is returned and the caller is responsible
288 * for freeing it.
290 char *
291 convert_to_locale(char *utf8str)
293 #define CHNK 500
294 char *inp, *retp, *ret = NULL;
295 CBUF_S cb;
296 int r, alloced;
298 if(native_utf8 || !utf8str || !utf8str[0])
299 return(NULL);
301 cb.cbuf[0] = '\0';
302 cb.cbufp = cb.cbufend = cb.cbuf;
303 inp = utf8str;
305 alloced = CHNK;
306 ret = (char *) fs_get(alloced * sizeof(char));
307 retp = ret;
310 * There's gotta be a better way to do this but utf8_to_locale was
311 * available and everything looks like a nail when all you have
312 * is a hammer.
314 while(*inp){
316 * We're placing the outgoing stream of characters in ret, a multi-byte
317 * array of characters in the user's locale charset. See if there is
318 * enough room for the next wide characters worth of output chars
319 * and allocate more space if not.
321 if((alloced - (retp-ret)) < MAX(MB_LEN_MAX,32)){
322 alloced += CHNK;
323 fs_resize((void **) &ret, alloced * sizeof(char));
326 r = utf8_to_locale((int) *inp++, &cb,
327 (unsigned char *) retp, alloced-(retp-ret));
329 retp += r;
332 *retp = '\0';
334 fs_resize((void **) &ret, strlen(ret)+1);
336 return(ret);
341 * Pass in a stream of UTF-8 characters in 'c' and return obuf
342 * filled in with multi-byte characters. The return value is the
343 * number of valid characters in obuf to be used.
346 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
348 int outchars = 0;
350 if(!(cb && cb->cbufp))
351 return(0);
353 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
354 unsigned char *inputp;
355 unsigned long remaining_octets;
356 UCS ucs;
358 *(cb->cbufp)++ = (unsigned char) c;
359 inputp = cb->cbuf;
360 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
361 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
363 switch(ucs){
364 case U8G_ENDSTRG: /* incomplete character, wait */
365 case U8G_ENDSTRI: /* incomplete character, wait */
366 break;
368 default:
369 if(ucs & U8G_ERROR || ucs == UBOGON){
371 * None of these cases is supposed to happen. If it
372 * does happen then the input stream isn't UTF-8
373 * so something is wrong. Treat each character in the
374 * input buffer as a separate error character and
375 * print a '?' for each.
377 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
378 obuf[outchars++] = '?';
380 cb->cbufp = cb->cbuf;
382 else{
383 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
385 * This happens when we have a UTF-8 character that
386 * we aren't able to print in our locale. For example,
387 * if the locale is setup with the terminal
388 * expecting ISO-8859-1 characters then there are
389 * lots of UTF-8 characters that can't be printed.
390 * Print a '?' instead.
392 obuf[outchars++] = '?';
394 else{
396 * Convert the ucs into the multibyte
397 * character that corresponds to the
398 * ucs in the users locale.
400 outchars = wtomb((char *) obuf, ucs);
401 if(outchars < 0){
402 obuf[0] = '?';
403 outchars = 1;
407 /* update the input buffer */
408 if(inputp >= cb->cbufp) /* this should be the case */
409 cb->cbufp = cb->cbuf;
410 else{ /* extra chars for some reason? */
411 unsigned char *q, *newcbufp;
413 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
414 q = cb->cbuf;
415 while(inputp < cb->cbufp)
416 *q++ = *inputp++;
418 cb->cbufp = newcbufp;
422 break;
425 else{ /* error */
426 obuf[0] = '?';
427 outchars = 1;
428 cb->cbufp = cb->cbuf; /* start over */
431 return(outchars);
436 * Returns the screen cells width of the UCS-4 string argument.
437 * The source string is zero terminated.
439 unsigned
440 ucs4_str_width(UCS *ucsstr)
442 unsigned width = 0;
443 int w;
445 if(ucsstr)
446 while(*ucsstr){
447 w = wcellwidth(*ucsstr++);
448 if(w != U4W_CTLSRGT)
449 width += (w < 0 ? 1 : w);
452 return width;
457 * Returns the screen cells width of the UCS-4 string argument
458 * from ucsstr[a] through (inclusive) ucsstr[b].
459 * No checking is done to make sure a starts in the middle
460 * of a UCS-4 array.
462 unsigned
463 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
465 unsigned width = 0;
466 int i, w;
468 if(ucsstr)
469 for(i = a; i <= b && ucsstr[i]; i++){
470 w = wcellwidth(ucsstr[i]);
471 if(w != U4W_CTLSRGT)
472 width += (w < 0 ? 1 : w);
475 return width;
480 * Returns the screen cells width of the UCS-4 string argument
481 * from ustart through (exclusive) uend.
482 * No checking is done to make sure it starts in the middle
483 * of a UCS-4 array.
485 unsigned
486 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
488 UCS *u;
489 unsigned width = 0;
490 int w;
492 if(!ustart)
493 return width;
495 if(ustart)
496 for(u = ustart; u < uend; u++){
497 w = wcellwidth(*u);
498 if(w != U4W_CTLSRGT)
499 width += (w < 0 ? 1 : w);
502 return(width);
507 * Return the largest possible pointer into ucs4str so that the width
508 * of the string from ucs4str to the pointer (exclusive)
509 * is maxwidth or less. Also stops at a null character.
511 UCS *
512 ucs4_particular_width(UCS *ucs4str, int maxwidth)
514 UCS *u;
515 int w_consumed = 0, w, done = 0;
517 u = ucs4str;
519 if(u)
520 while(!done && *u && w_consumed <= maxwidth){
521 w = wcellwidth(*u);
522 w = (w >= 0 ? w : 1);
523 if(w_consumed + w <= maxwidth){
524 w_consumed += w;
525 ++u;
527 else
528 ++done;
531 return(u);
536 * Convert and copy a UTF-8 string into a UCS-4 NULL
537 * terminated array. Just like cpystr only it converts
538 * from UTF-8 to UCS-4.
540 * Returned UCS-4 string needs to be freed by caller.
542 UCS *
543 utf8_to_ucs4_cpystr(char *utf8src)
545 size_t retsize;
546 UCS *ret = NULL;
547 UCS ucs;
548 unsigned long remaining_octets;
549 unsigned char *readptr;
550 size_t arrayindex;
553 * We don't know how big to allocate the return array
554 * because variable numbers of octets in the src array
555 * will combine to make UCS-4 characters. The number of
556 * UCS-4 characters is less than or equal to the number
557 * of src characters, though.
560 if(!utf8src)
561 return NULL;
563 retsize = strlen(utf8src) + 1;
565 ret = (UCS *) fs_get(retsize * sizeof(*ret));
566 memset(ret, 0, retsize * sizeof(*ret));
568 readptr = (unsigned char *) utf8src;
569 remaining_octets = retsize-1;
570 arrayindex = 0;
572 while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
573 ucs = (UCS) utf8_get(&readptr, &remaining_octets);
575 if(ucs & U8G_ERROR || ucs == UBOGON)
576 remaining_octets = 0;
577 else
578 ret[arrayindex++] = ucs;
581 ret[arrayindex] = '\0';
583 /* get rid of excess size */
584 if(arrayindex+1 < retsize)
585 fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
587 return ret;
592 * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
593 * terminated string. Just like cpystr only it converts
594 * from UCS-4 to UTF-8.
596 * Returned UTF-8 string needs to be freed by caller.
598 char *
599 ucs4_to_utf8_cpystr(UCS *ucs4src)
601 unsigned char *ret = NULL;
602 unsigned char *writeptr;
603 int i;
605 if(!ucs4src)
606 return NULL;
609 * Over-allocate and then resize at the end.
612 /* count characters in source */
613 for(i = 0; ucs4src[i]; i++)
616 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
617 memset(ret, 0, (6*i + 1) * sizeof(*ret));
619 writeptr = ret;
620 for(i = 0; ucs4src[i]; i++)
621 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
623 /* get rid of excess size */
624 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
626 return ((char *) ret);
631 * Similar to above but copy a fixed number of source
632 * characters instead of going until null terminator.
634 char *
635 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
637 unsigned char *ret = NULL;
638 unsigned char *writeptr;
639 int i;
641 if(!ucs4src)
642 return NULL;
645 * Over-allocate and then resize at the end.
648 ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
649 memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
651 writeptr = ret;
652 for(i = 0; i < ucs4src_len; i++)
653 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
655 /* get rid of excess size */
656 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
658 return ((char *) ret);
662 #ifdef _WINDOWS
664 * Convert a UTF-8 argument into an LPTSTR version
665 * of that argument. The result is allocated here
666 * and should be freed by the caller.
668 LPTSTR
669 utf8_to_lptstr(LPSTR arg_utf8)
671 int lptstr_len;
672 LPTSTR lptstr_ret = NULL;
674 lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
675 if(lptstr_len > 0)
677 lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
678 lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
679 arg_utf8, -1, lptstr_ret, lptstr_len );
682 if(!lptstr_len)
684 /* check GetLastError()? */
685 lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
686 lptstr_ret[0] = 0;
689 return lptstr_ret;
694 * Convert an LPTSTR argument into a UTF-8 version
695 * of that argument. The result is allocated here
696 * and should be freed by the caller.
698 LPSTR
699 lptstr_to_utf8(LPTSTR arg_lptstr)
701 int utf8str_len;
702 LPSTR utf8str_ret = NULL;
704 utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
705 if(utf8str_len > 0)
707 utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
708 utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
709 arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
712 if(!utf8str_len)
714 /* check GetLastError()? */
715 utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
716 utf8str_ret[0] = 0;
719 return utf8str_ret;
724 * Convert a UCS4 argument into an LPTSTR version
725 * of that argument. The result is allocated here
726 * and should be freed by the caller.
728 LPTSTR
729 ucs4_to_lptstr(UCS *arg_ucs4)
731 LPTSTR ret_lptstr = NULL;
732 size_t len;
733 size_t i;
735 if(arg_ucs4){
736 len = ucs4_strlen(arg_ucs4);
737 ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
738 /* bogus conversion ignores UTF-16 */
739 for(i = 0; i < len; i++)
740 ret_lptstr[i] = arg_ucs4[i];
742 ret_lptstr[len] = '\0';
745 return(ret_lptstr);
750 * Convert an LPTSTR argument into a UCS4 version
751 * of that argument. The result is MemAlloc'd here
752 * and should be freed by the caller.
754 UCS *
755 lptstr_to_ucs4(LPTSTR arg_lptstr)
757 UCS *ret_ucs4 = NULL;
758 size_t len;
759 size_t i;
761 if(arg_lptstr){
762 len = _tcslen(arg_lptstr);
763 ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
764 /* bogus conversion ignores UTF-16 */
765 for(i = 0; i < len; i++)
766 ret_ucs4[i] = arg_lptstr[i];
768 ret_ucs4[len] = '\0';
771 return(ret_ucs4);
774 #endif /* _WINDOWS */
778 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
779 * 1-at-a-time filled in with UCS characters. The return value is the
780 * number of valid characters in obuf to be used. It can only
781 * be 1 or 0 characters since we're only getting one UTF-8 character
782 * at a time.
785 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
787 int width = 0, outchars = 0;
789 if(!(cb && cb->cbufp))
790 return(0);
792 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
793 unsigned char *inputp;
794 unsigned long remaining_octets;
795 UCS ucs;
797 *cb->cbufp++ = (unsigned char) c;
798 inputp = cb->cbuf;
799 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
800 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
802 switch(ucs){
803 case U8G_ENDSTRG: /* incomplete character, wait */
804 case U8G_ENDSTRI: /* incomplete character, wait */
805 break;
807 default:
808 if(ucs & U8G_ERROR || ucs == UBOGON){
810 * None of these cases is supposed to happen. If it
811 * does happen then the input stream isn't UTF-8
812 * so something is wrong.
814 outchars++;
815 *obuf = '?';
816 cb->cbufp = cb->cbuf;
817 width = 1;
819 else{
820 outchars++;
821 if(ucs < 0x80 && ucs >= 0x20)
822 width = 1;
824 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
826 * This happens when we have a UTF-8 character that
827 * we aren't able to print in our locale. For example,
828 * if the locale is setup with the terminal
829 * expecting ISO-8859-1 characters then there are
830 * lots of UTF-8 characters that can't be printed.
831 * Print a '?' instead.
832 * Don't think this should happen in Windows.
834 *obuf = '?';
836 else{
837 *obuf = ucs;
840 /* update the input buffer */
841 if(inputp >= cb->cbufp) /* this should be the case */
842 cb->cbufp = cb->cbuf;
843 else{ /* extra chars for some reason? */
844 unsigned char *q, *newcbufp;
846 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
847 q = cb->cbuf;
848 while(inputp < cb->cbufp)
849 *q++ = *inputp++;
851 cb->cbufp = newcbufp;
855 break;
858 else{ /* error */
859 *obuf = '?';
860 outchars = 1;
861 width = 1;
862 cb->cbufp = cb->cbuf; /* start over */
865 if(obufwidth)
866 *obufwidth = width;
868 return(outchars);
873 * Return an allocated copy of a zero-terminated UCS-4 string.
875 UCS *
876 ucs4_cpystr(UCS *ucs4src)
878 size_t arraysize;
879 UCS *ret = NULL;
880 size_t i;
882 if(!ucs4src)
883 return NULL;
885 arraysize = ucs4_strlen(ucs4src);
887 ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
888 memset(ret, 0, (arraysize+1) * sizeof(*ret));
890 for(i = 0; i < arraysize; i++)
891 ret[i] = ucs4src[i];
893 return ret;
897 UCS *
898 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
900 size_t i;
902 if(ucs4src && ucs4dst){
903 for(i = 0; i < n; i++){
904 ucs4dst[i] = ucs4src[i];
905 if(ucs4dst[i] == '\0')
906 break;
910 return ucs4dst;
914 UCS *
915 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
917 size_t i;
918 UCS *u;
920 if(ucs4src && ucs4dst){
921 for(u = ucs4dst; *u; u++)
924 for(i = 0; i < n; i++){
925 u[i] = ucs4src[i];
926 if(u[i] == '\0')
927 break;
930 if(i == n)
931 u[i] = '\0';
934 return ucs4dst;
939 * Like strlen only this returns the number of non-zero characters
940 * in a zero-terminated UCS-4 array.
942 size_t
943 ucs4_strlen(UCS *ucs4str)
945 size_t i = 0;
947 if(ucs4str)
948 while(ucs4str[i])
949 i++;
951 return(i);
956 ucs4_strcmp(UCS *s1, UCS *s2)
958 for(; *s1 == *s2; s1++, s2++)
959 if(*s1 == '\0')
960 return 0;
962 return((*s1 < *s2) ? -1 : 1);
966 UCS *
967 ucs4_strchr(UCS *s, UCS c)
969 if(!s)
970 return NULL;
972 while(*s && *s != c)
973 s++;
975 if(*s || !c)
976 return s;
977 else
978 return NULL;
982 UCS *
983 ucs4_strrchr(UCS *s, UCS c)
985 UCS *ret = NULL;
987 if(!s)
988 return ret;
990 while(*s){
991 if(*s == c)
992 ret = s;
994 s++;
997 return ret;
1002 * Returns the screen cells width of the UTF-8 string argument.
1004 unsigned
1005 utf8_width(char *str)
1007 unsigned width = 0;
1008 int this_width;
1009 UCS ucs;
1010 unsigned long remaining_octets;
1011 char *readptr;
1013 if(!(str && *str))
1014 return(width);
1016 readptr = str;
1017 remaining_octets = readptr ? strlen(readptr) : 0;
1019 while(remaining_octets > 0 && *readptr){
1021 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1023 if(ucs & U8G_ERROR || ucs == UBOGON){
1025 * This should not happen, but do something to handle it anyway.
1026 * Treat each character as a single width character, which is what should
1027 * probably happen when we actually go to write it out.
1029 remaining_octets--;
1030 readptr++;
1031 this_width = 1;
1033 else{
1034 this_width = wcellwidth(ucs);
1037 * If this_width is -1 that means we can't print this character
1038 * with our current locale. Writechar will print a '?'.
1040 if(this_width < 0)
1041 this_width = 1;
1044 width += (unsigned) this_width;
1047 return(width);
1052 * Copy UTF-8 characters from src into dst.
1053 * This is intended to be used if you want to truncate a string at
1054 * the start instead of the end. For example, you have a long string
1055 * like
1056 * this_is_a_long_string
1057 * but not enough space to fit it into a particular field. You want to
1058 * end up with
1059 * s_a_long_string
1060 * where that fits in a particular width. Perhaps you'd use this with ...
1061 * to get
1062 * ...s_a_long_string
1063 * This right adjusts the end of the string in the width space and
1064 * cuts it off at the start. If there is enough width for the whole
1065 * string it will copy the string into dst with no padding.
1067 * Copy enough characters so that the result will have screen width of
1068 * want_width screen cells in current locale.
1070 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1071 * to dst. This is just for protection, it shouldn't be relied on to
1072 * do anything useful. Dstlen should be large enough. Otherwise you'll get
1073 * characters truncated in the middle or something like that.
1075 * Returned value is the number of bytes written to dst, not including
1076 * the possible terminating null.
1078 * If we can't hit want_width exactly because of double width characters
1079 * then we will pad the end of the string with space in order to make
1080 * the width exact.
1082 size_t
1083 utf8_to_width_rhs(char *dst, /* destination buffer */
1084 char *src, /* source string */
1085 size_t dstlen, /* space in dest */
1086 unsigned want_width) /* desired screen width */
1088 int this_width;
1089 unsigned width_consumed = 0;
1090 UCS ucs;
1091 unsigned long remaining_octets;
1092 char *readptr, *goodreadptr, *savereadptr, *endptr;
1093 size_t nb = 0;
1095 if(!src){
1096 if(dstlen > 0)
1097 dst[0] = '\0';
1099 return nb;
1103 * Start at the end of the source string and go backwards until we
1104 * get to the desired width, but not more than the width.
1106 readptr = src + strlen(src);
1107 endptr = readptr;
1108 goodreadptr = readptr;
1109 width_consumed = 0;
1110 savereadptr = readptr;
1112 for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1113 readptr = savereadptr-1){
1115 savereadptr = readptr;
1116 remaining_octets = goodreadptr - readptr;
1117 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1120 * Handling the error case is tough because an error will be the normal thing that
1121 * happens as we back through the string. So we're just going to punt on the
1122 * error for now.
1124 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1125 if(remaining_octets > 0){
1127 * This means there are some bad octets after this good
1128 * character so things are not going to work out well.
1129 * Bail out.
1131 savereadptr = src; /* we're done */
1133 else{
1134 this_width = wcellwidth(ucs);
1136 if(this_width < 0)
1137 this_width = 1;
1139 if(width_consumed + (unsigned) this_width <= want_width){ /* ok */
1140 width_consumed += (unsigned) this_width;
1141 goodreadptr = savereadptr;
1143 else
1144 savereadptr = src; /* we're done */
1150 * Copy characters from goodreadptr to endptr into dst.
1152 nb = MIN(endptr-goodreadptr, dstlen-1);
1153 strncpy(dst, goodreadptr, nb);
1154 dst[nb] = '\0';
1157 * Pad out with spaces in order to hit width exactly.
1159 while(width_consumed < want_width && nb < dstlen-1){
1160 dst[nb++] = ' ';
1161 dst[nb] = '\0';
1162 width_consumed++;
1165 return nb;
1170 * The arguments being converted are UTF-8 strings.
1171 * This routine attempts to make it possible to use screen cell
1172 * widths in a format specifier. In a one-byte per screen cell
1173 * world we might have used %10.10s to cause a string to occupy
1174 * 10 screen positions. Since the width and precision are really
1175 * referring to numbers of bytes instead of screen positions that
1176 * won't work with UTF-8 input. We emulate that behavior with
1177 * the format string %w. %m.nw means to use the m and n as
1178 * screen width indicators instead of bytes indicators.
1180 * There is no reason to use this routine unless you want to use
1181 * min field width or precision with the specifier. A plain %w without
1182 * widths is equivalent exactly to a plain %s in a regular printf.
1184 * Double-width characters complicate things. It may not be possible
1185 * to satisfy the request exactly. For example, %3w for an input
1186 * string that is made up of two double-width characters.
1187 * This routine will arbitrarily use a trailing space character if
1188 * needed to make the width come out correctly where a half of a
1189 * double-width character would have been needed. We'll see how
1190 * that works for us.
1192 * %w only works for strings (it's a %s replacement).
1194 * Buffer overflow is handled by the size argument. %.30s will work
1195 * to limit a particular string to 30 bytes, but you lose that
1196 * ability with %w, since it may write more than precision bytes
1197 * in order to get to the desired width. It is best to choose
1198 * size large enough so that it doesn't come into play, otherwise
1199 * it may be possible to get partial UTF-8 characters because of
1200 * the truncation.
1202 * The return value isn't quite the same as the return value
1203 * of snprintf. It is the number of bytes written, not counting
1204 * the trailing null, just like snprintf. However, if it is
1205 * truncated due to size then the output is size, not the
1206 * number of characters that would have been written.
1209 utf8_snprintf(char *dest, size_t size, char *fmt, ...)
1211 char newfmt[100], buf[20], *q, *pdest, *width_str, *end;
1212 char *start_of_specifier;
1213 char *input_str;
1214 int int_arg;
1215 double double_arg;
1216 void *ptr_arg;
1217 unsigned got_width;
1218 int more_flags, ret, w;
1219 int min_field_width, field_precision, modifier;
1220 int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
1221 va_list args;
1223 newfmt[0] = '\0';
1224 q = newfmt;
1226 pdest = dest;
1228 #define IS_ROOM_IN_DEST(n_more_chars) \
1229 ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)
1232 * Strategy: Look through the fmt string for %w's. Replace the
1233 * %w's in the format string with %s's but with possibly different
1234 * width and precision arguments which will make it come out right.
1235 * Then call the regular system vsnprintf with the altered format
1236 * string but same arguments.
1238 * That would be nice but it doesn't quite work. Why? Because a
1239 * %*w will need to have the value in the integer argument the *
1240 * refers to modified. Can't do it as far as I can tell. Or we could
1241 * remove the integer argument somehow before calling printf. Can't
1242 * do it. Or we could somehow add an additional conversion specifier
1243 * that caused nothing to be printed but ate up the integer arg.
1244 * Can't figure out how to do that either.
1246 * Since we can't figure out how to do it, the alternative is to
1247 * construct the result one piece at a time, pasting together the
1248 * pieces from the different conversions.
1250 va_start(args, fmt);
1252 while(*fmt && IS_ROOM_IN_DEST(1)){
1253 if(*fmt == '%'){
1254 start_of_specifier = fmt++;
1256 min_field_width = field_precision = -1;
1257 flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
1259 /* flags */
1260 more_flags = 1;
1261 while(more_flags){
1262 switch(*fmt){
1263 case '-':
1264 flags_minus++;
1265 fmt++;
1266 break;
1268 case '+':
1269 flags_plus++;
1270 fmt++;
1271 break;
1273 case ' ':
1274 flags_space++;
1275 fmt++;
1276 break;
1278 case '0':
1279 flags_zero++;
1280 fmt++;
1281 break;
1283 case '#':
1284 flags_pound++;
1285 fmt++;
1286 break;
1288 default:
1289 more_flags = 0;
1290 break;
1294 /* minimum field width */
1295 if(*fmt == '*'){
1296 min_field_width = va_arg(args, int);
1297 fmt++;
1299 else if(*fmt >= '0' && *fmt <= '9'){
1300 width_str = fmt;
1301 while (*fmt >= '0' && *fmt <= '9')
1302 fmt++;
1304 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1305 if(sizeof(buf) > fmt-width_str)
1306 buf[fmt-width_str] = '\0';
1308 buf[sizeof(buf)-1] = '\0';
1310 min_field_width = atoi(width_str);
1313 /* field precision */
1314 if(*fmt == '.'){
1315 fmt++;
1316 if(*fmt == '*'){
1317 field_precision = va_arg(args, int);
1318 fmt++;
1320 else if(*fmt >= '0' && *fmt <= '9'){
1321 width_str = fmt;
1322 while (*fmt >= '0' && *fmt <= '9')
1323 fmt++;
1325 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1326 if(sizeof(buf) > fmt-width_str)
1327 buf[fmt-width_str] = '\0';
1329 buf[sizeof(buf)-1] = '\0';
1331 field_precision = atoi(width_str);
1335 /* length modifier */
1336 if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
1337 modifier = *fmt++;
1339 /* conversion character */
1340 switch(*fmt){
1341 case 'w':
1343 * work with va_arg(char *) to figure out width
1344 * and precision needed to produce the screen width
1345 * and precision asked for in %w using some of the
1346 * utf8 width routines we have.
1349 input_str = va_arg(args, char *);
1350 if(field_precision >=0 || min_field_width >= 0)
1351 w = utf8_width(input_str);
1353 if(field_precision >= 0){
1354 if(w <= field_precision)
1355 field_precision = -1; /* print it all */
1356 else{
1358 * We need to cut off some of the input_str
1359 * in this case.
1361 end = utf8_count_forw_width(input_str, field_precision, &got_width);
1362 field_precision = (int) (end - input_str);
1363 /* new w with this field_precision */
1364 w = got_width;
1368 /* need some padding */
1369 if(min_field_width >= 0)
1370 min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
1371 MAX(0, min_field_width - w);
1374 * Now we just need to get the new format string
1375 * set correctly in newfmt.
1377 q = newfmt;
1378 if(q-newfmt < sizeof(newfmt))
1379 *q++ = '%';
1381 if(flags_minus && q-newfmt < sizeof(newfmt))
1382 *q++ = '-';
1383 if(flags_plus && q-newfmt < sizeof(newfmt))
1384 *q++ = '+';
1385 if(flags_space && q-newfmt < sizeof(newfmt))
1386 *q++ = ' ';
1387 if(flags_zero && q-newfmt < sizeof(newfmt))
1388 *q++ = '0';
1389 if(flags_pound && q-newfmt < sizeof(newfmt))
1390 *q++ = '#';
1392 if(min_field_width >= 0){
1393 snprintf(buf, sizeof(buf), "%d", min_field_width);
1394 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1397 if(field_precision >= 0){
1398 if(q-newfmt < sizeof(newfmt))
1399 *q++ = '.';
1401 snprintf(buf, sizeof(buf), "%d", field_precision);
1402 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1405 if(q-newfmt < sizeof(newfmt))
1406 *q++ = 's';
1408 if(q-newfmt < sizeof(newfmt))
1409 *q++ = '\0';
1411 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1412 pdest += strlen(pdest);
1414 break;
1416 case '\0':
1417 fmt--;
1418 break;
1420 default:
1421 /* make a new format which leaves out the dynamic '*' arguments */
1422 q = newfmt;
1423 if(q-newfmt < sizeof(newfmt))
1424 *q++ = '%';
1426 if(flags_minus && q-newfmt < sizeof(newfmt))
1427 *q++ = '-';
1428 if(flags_plus && q-newfmt < sizeof(newfmt))
1429 *q++ = '+';
1430 if(flags_space && q-newfmt < sizeof(newfmt))
1431 *q++ = ' ';
1432 if(flags_zero && q-newfmt < sizeof(newfmt))
1433 *q++ = '0';
1434 if(flags_pound && q-newfmt < sizeof(newfmt))
1435 *q++ = '#';
1437 if(min_field_width >= 0){
1438 snprintf(buf, sizeof(buf), "%d", min_field_width);
1439 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1442 if(field_precision >= 0){
1443 if(q-newfmt < sizeof(newfmt))
1444 *q++ = '.';
1446 snprintf(buf, sizeof(buf), "%d", field_precision);
1447 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1450 if(q-newfmt < sizeof(newfmt))
1451 *q++ = *fmt;
1453 if(q-newfmt < sizeof(newfmt))
1454 *q++ = '\0';
1456 switch(*fmt){
1457 case 'd': case 'i': case 'o':
1458 case 'x': case 'X': case 'u': case 'c':
1459 int_arg = va_arg(args, int);
1460 snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
1461 pdest += strlen(pdest);
1462 break;
1464 case 's':
1465 input_str = va_arg(args, char *);
1466 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1467 pdest += strlen(pdest);
1468 break;
1470 case 'f': case 'e': case 'E':
1471 case 'g': case 'G':
1472 double_arg = va_arg(args, double);
1473 snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
1474 pdest += strlen(pdest);
1475 break;
1477 case 'p':
1478 ptr_arg = va_arg(args, void *);
1479 snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
1480 pdest += strlen(pdest);
1481 break;
1483 case '%':
1484 if(IS_ROOM_IN_DEST(1))
1485 *pdest++ = '%';
1487 break;
1489 default:
1490 /* didn't think of this type */
1491 assert(0);
1492 break;
1495 break;
1498 fmt++;
1500 else{
1501 if(IS_ROOM_IN_DEST(1))
1502 *pdest++ = *fmt++;
1506 ret = pdest - dest;
1508 if(IS_ROOM_IN_DEST(1))
1509 *pdest++ = '\0';
1511 va_end(args);
1513 return ret;
1518 * Copy UTF-8 characters from src into dst.
1519 * Copy enough characters so that the result will have (<=) screen width of
1520 * want_width screen cells in current locale.
1522 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1523 * to dst.
1525 * Returned value is the number of bytes written to dst, not including
1526 * the possible terminating null.
1527 * Got_width is another returned value. It is the width in screen cells of
1528 * the string placed in dst. It will be the same as want_width if there
1529 * are enough characters in the src to do that and if the character widths
1530 * hit the width exactly. It will be less than want_width if we run out
1531 * of src characters or if the next character width would skip over the
1532 * width we want, because it is double width.
1534 * Zero width characters are collected and included at the end of the string.
1535 * That is, if we make it to want_width but there is still a zero length
1536 * character sitting in src, we add that to dst. This might be an accent
1537 * or something like that.
1539 size_t
1540 utf8_to_width(char *dst, /* destination buffer */
1541 char *src, /* source string */
1542 size_t dstlen, /* space in dst */
1543 unsigned want_width, /* desired screen width */
1544 unsigned *got_width) /* returned screen width in dst */
1546 int this_width;
1547 unsigned width_consumed = 0;
1548 UCS ucs;
1549 unsigned long remaining_octets;
1550 char *writeptr, *readptr, *savereadptr, *endptr;
1551 int ran_out_of_space = 0;
1553 readptr = src;
1555 remaining_octets = readptr ? strlen(readptr) : 0;
1557 writeptr = dst;
1558 endptr = writeptr + dstlen;
1560 if(readptr && writeptr){
1561 while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1562 savereadptr = readptr;
1563 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1565 if(ucs & U8G_ERROR || ucs == UBOGON)
1566 remaining_octets = 0;
1567 else{
1568 this_width = wcellwidth(ucs);
1571 * If this_width is -1 that means we can't print this character
1572 * with our current locale. Writechar will print a '?'.
1574 if(this_width < 0)
1575 this_width = 1;
1577 if(width_consumed + (unsigned) this_width <= want_width){
1578 /* append this utf8 character to dst if it will fit */
1579 if(writeptr + (readptr - savereadptr) < endptr){
1580 width_consumed += this_width;
1581 while(savereadptr < readptr)
1582 *writeptr++ = *savereadptr++;
1584 else
1585 ran_out_of_space++; /* no more utf8 to dst */
1587 else
1588 remaining_octets = 0; /* we're done */
1592 if(writeptr < endptr)
1593 *writeptr = '\0';
1596 if(got_width)
1597 *got_width = width_consumed;
1599 return(writeptr ? (writeptr - dst) : 0);
1604 * Str is a UTF-8 string.
1605 * Count forward width screencell positions and return a pointer to the
1606 * end of the string that is width wide.
1607 * The returned pointer points at the next character (where the null would
1608 * be placed).
1610 * Got_width is another returned value. It is the width in screen cells of
1611 * the string from str to the returned pointer. It will be the same as
1612 * want_width if there are enough characters in the str to do that
1613 * and if the character widths hit the width exactly. It will be less
1614 * than want_width if we run out of characters or if the next character
1615 * width would skip over the width we want, because it is double width.
1617 char *
1618 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1620 int this_width;
1621 unsigned width_consumed = 0;
1622 UCS ucs;
1623 unsigned long remaining_octets;
1624 char *readptr;
1625 char *retptr;
1627 retptr = readptr = str;
1629 remaining_octets = readptr ? strlen(readptr) : 0;
1631 while(width_consumed <= want_width && remaining_octets > 0){
1633 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1635 if(ucs & U8G_ERROR || ucs == UBOGON){
1637 * This should not happen, but do something to handle it anyway.
1638 * Treat each character as a single width character, which is what should
1639 * probably happen when we actually go to write it out.
1641 remaining_octets--;
1642 readptr++;
1643 this_width = 1;
1645 else{
1646 this_width = wcellwidth(ucs);
1649 * If this_width is -1 that means we can't print this character
1650 * with our current locale. Writechar will print a '?'.
1652 if(this_width < 0)
1653 this_width = 1;
1656 if(width_consumed + (unsigned) this_width <= want_width){
1657 width_consumed += (unsigned) this_width;
1658 retptr = readptr;
1660 else
1661 remaining_octets = 0; /* we're done */
1664 if(got_width)
1665 *got_width = width_consumed;
1667 return(retptr);
1672 * Copy a null terminator into a UTF-8 string in place so that the string is
1673 * no more than a certain screen width wide. If the string is already less
1674 * than or equal in width to the requested width, no change is made.
1676 * The actual width accomplished is returned. Note that it may be less than
1677 * max_width due to double width characters as well as due to the fact that
1678 * it fits wholly in the max_width.
1680 * Returned value is the actual screen width of str when done.
1682 * A side effect is that a terminating null may have been written into
1683 * the passed in string.
1685 unsigned
1686 utf8_truncate(char *str, unsigned max_width)
1688 int this_width;
1689 unsigned width_consumed = 0;
1690 UCS ucs;
1691 unsigned long remaining_octets;
1692 char *readptr, *savereadptr;
1694 readptr = str;
1696 remaining_octets = readptr ? strlen(readptr) : 0;
1698 if(readptr){
1699 while(width_consumed <= max_width && remaining_octets > 0){
1701 savereadptr = readptr;
1702 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1704 if(ucs & U8G_ERROR || ucs == UBOGON){
1706 * This should not happen, but do something to handle it anyway.
1707 * Treat each character as a single width character, which is what should
1708 * probably happen when we actually go to write it out.
1710 remaining_octets--;
1711 readptr++;
1712 this_width = 1;
1714 else{
1715 this_width = wcellwidth(ucs);
1718 * If this_width is -1 that means we can't print this character
1719 * with our current locale. Writechar will print a '?'.
1721 if(this_width < 0)
1722 this_width = 1;
1725 if(width_consumed + (unsigned) this_width <= max_width){
1726 width_consumed += (unsigned) this_width;
1728 else{
1729 remaining_octets = 0; /* we're done */
1730 *savereadptr = '\0';
1735 return(width_consumed);
/*
 * Copy UTF-8 characters from src into dst with utf8_to_width() and then
 * pad with spaces so the result is want_width screen cells wide.
 *
 * Args:     dst -- destination buffer; nothing is done if it is NULL
 *           src -- source string
 *        dstlen -- bytes available in dst, never exceeded
 *    want_width -- desired screen width
 *   left_adjust -- non-zero puts the padding after the text,
 *                  zero puts it in front of the text
 *
 * If dst is too small for all of the padding, only as many spaces as fit
 * are added. Dst is null terminated when a byte is left for it.
 *
 * Returns the number of bytes written to dst, excluding any terminator.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t bytes_used, len_left, need_more, howmany;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    need_more = (got_width < want_width) ? (size_t) (want_width - got_width) : 0;
    howmany = MIN(need_more, len_left);

    if(howmany > 0){
	if(left_adjust){
	    /* text stays put, spaces go after it */
	    memset(dst + bytes_used, ' ', howmany);
	}
	else{
	    /* slide the text right (regions overlap), spaces go in front */
	    memmove(dst + howmany, dst, bytes_used);
	    memset(dst, ' ', howmany);
	}

	bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1809 * Str is a UTF-8 string.
1810 * Start_here is a pointer into the string. It points one position past
1811 * the last byte that should be considered a part of the length string.
1812 * Count back want_width screencell positions and return a pointer to the
1813 * start of the string that is want_width wide and ends with start_here.
1815 * Since characters may be more than one cell width wide we may end up
1816 * skipping over the exact width. That is, if we need to we'll go back
1817 * too far (by one cell width). Account for that in the call by looking
1818 * at got_width.
1820 * Note that this call gives a possible got_width == want_width+1 as
1821 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1822 * That was just what was needed at the time, maybe it needs to be
1823 * optional.
1825 char *
1826 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1828 unsigned width_consumed = 0;
1829 int this_width;
1830 UCS ucs;
1831 unsigned long remaining_octets;
1832 char *ptr, *savereadptr, *goodreadptr;
1834 savereadptr = start_here;
1835 goodreadptr = start_here;
1837 for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1839 savereadptr = ptr;
1840 remaining_octets = goodreadptr - ptr;
1841 ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1843 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1844 if(remaining_octets > 0){
1846 * This means there are some bad octets after this good
1847 * character so things are not going to work out well.
1848 * Bail out.
1850 savereadptr = str; /* we're done */
1852 else{
1853 this_width = wcellwidth(ucs);
1856 * If this_width is -1 that means we can't print this character
1857 * with our current locale. Writechar will print a '?'.
1859 if(this_width < 0)
1860 this_width = 1;
1862 width_consumed += (unsigned) this_width;
1863 goodreadptr = savereadptr;
1868 if(got_width)
1869 *got_width = width_consumed;
1871 return(savereadptr);
/*----------------------------------------------------------------------
    Append s at *d, leaving *d pointing at the end of the copied text.

    Args: d -- address of the write pointer; advanced past each copied byte
          s -- null terminated source string
          n -- maximum number of bytes to store

    If the end of s is reached within n bytes the terminating null is
    stored and *d is left pointing at it. If n runs out first, no null
    is stored.

    This saves a second pass over a string that is appended to
    repeatedly (strcpy(t, x); t += strlen(t)).
  ----*/
void
sstrncpy(char **d, char *s, int n)
{
    int left;

    for(left = n; left > 0; left--){
	**d = *s++;
	if(**d == '\0')
	  return;

	(*d)++;
    }
}
1893 * If use_system_routines is set then NULL is the return value and it is
1894 * not an error. Display_charmap and keyboard_charmap should come over as
1895 * malloced strings and will be filled in with the result.
1897 * Returns a void pointer to the input_cs CHARSET which is
1898 * passed to mbtow via kbseq().
1899 * If !use_system_routines && NULL is returned, that is an error and err should
1900 * have a message.
1901 * display_charmap and keyboard_charmap should be malloced data and may be
1902 * realloced and changed here.
1905 setup_for_input_output(int use_system_routines, char **display_charmap,
1906 char **keyboard_charmap, void **input_cs_arg, char **err)
1908 const CHARSET *cs;
1909 const CHARSET *input_cs = NULL;
1910 int already_tried = 0;
1911 int supported = 0;
1912 char buf[1000];
1914 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1916 if(err)
1917 *err = NULL;
1919 if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1920 *err = cpstr("Bad call to setup_for_input_output");
1921 return(-1);
1924 if(use_system_routines){
1925 #if PREREQ_FOR_SYS_TRANSLATION
1926 char *dcm;
1928 dcm = nl_langinfo_codeset_wrapper();
1929 dcm = dcm ? dcm : "US-ASCII";
1931 init_utf8_display(0, NULL);
1932 if(*display_charmap){
1933 if(dcm && strucmp(*display_charmap, dcm)){
1934 snprintf(buf, sizeof(buf),
1935 _("Display character set \"%s\" is ignored when using system translation"),
1936 *display_charmap);
1938 *err = cpstr(buf);
1941 fs_give((void **) display_charmap);
1944 if(*keyboard_charmap){
1945 if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1946 snprintf(buf, sizeof(buf),
1947 _("Keyboard character set \"%s\" is ignored when using system translation"),
1948 *keyboard_charmap);
1950 *err = cpstr(buf);
1953 fs_give((void **) keyboard_charmap);
1956 *display_charmap = cpstr(dcm);
1957 *keyboard_charmap = cpstr(dcm);
1958 #else
1959 *err = cpstr("Bad call to setup_for_input_output");
1960 #endif
1962 *input_cs_arg = NULL;
1963 return(0);
1967 try_again1:
1968 if(!(*display_charmap))
1969 *display_charmap = cpstr("US-ASCII");
1971 if(!(*keyboard_charmap))
1972 *keyboard_charmap = cpstr(*display_charmap);
1974 if(*keyboard_charmap){
1975 supported = input_charset_is_supported(*keyboard_charmap);
1977 if(supported){
1978 if(!strucmp(*keyboard_charmap, "utf-8"))
1979 input_cs = utf8_charset(*keyboard_charmap);
1980 else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
1981 input_cs = cs;
1983 else{
1984 if(err && !*err){
1985 int iso2022jp = 0;
1987 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
1988 iso2022jp = 1;
1990 snprintf(buf, sizeof(buf),
1991 /* TRANSLATORS: The first argument is the name of the character
1992 set the user is trying to use (which is unsupported by alpine).
1993 The second argument is " (except for posting)" if they are
1994 trying to use ISO-2022-JP for something other than posting. */
1995 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
1996 *keyboard_charmap,
1997 iso2022jp ? _(" (except for posting)") : "");
1999 *err = cpstr(buf);
2002 input_cs = NULL;
2003 fs_give((void **) keyboard_charmap);
2004 *keyboard_charmap = cpstr("US-ASCII");
2005 if(!already_tried){
2006 already_tried++;
2007 goto try_again1;
2013 try_again2:
2014 if(!(*display_charmap))
2015 *display_charmap = cpstr("US-ASCII");
2017 if(*display_charmap){
2018 supported = output_charset_is_supported(*display_charmap);
2019 if(supported){
2020 if(!strucmp(*display_charmap, "utf-8"))
2021 init_utf8_display(1, NULL);
2022 else if((cs = utf8_charset(*display_charmap)) != NULL)
2023 init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2025 else{
2026 if(err && !*err){
2027 int iso2022jp = 0;
2029 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2030 iso2022jp = 1;
2032 snprintf(buf, sizeof(buf),
2033 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2034 *display_charmap,
2035 iso2022jp ? _(" (except for posting)") : "");
2037 *err = cpstr(buf);
2040 fs_give((void **) display_charmap);
2041 if(!already_tried){
2042 already_tried++;
2043 goto try_again2;
2047 else{
2048 if(err && !*err)
2049 *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2052 #undef cpstr
2054 *input_cs_arg = (void *) input_cs;
2056 return(0);
2061 input_charset_is_supported(char *input_charset)
2063 const CHARSET *cs;
2065 if(!(input_charset && *input_charset))
2066 return 0;
2068 if(!strucmp(input_charset, "utf-8"))
2069 return 1;
2071 if((cs = utf8_charset(input_charset)) != NULL){
2074 * This was true 2006-09-25.
2076 switch(cs->type){
2077 case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2078 case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2079 case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2080 case CT_UCS4: case CT_UTF16:
2081 return 1;
2082 break;
2084 default:
2085 break;
2089 return 0;
2094 output_charset_is_supported(char *output_charset)
2096 const CHARSET *cs;
2098 if(!(output_charset && *output_charset))
2099 return 0;
2101 if(!strucmp(output_charset, "utf-8"))
2102 return 1;
2104 if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2105 return 1;
2107 return 0;
/*
 * Returns 1 if posting_charset can be used for posting, 0 otherwise.
 *
 * ISO-2022-JP is accepted here by name; everything else is accepted
 * exactly when output_charset_is_supported() accepts it. A NULL or empty
 * name is not supported.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator.
 * It starts from nl_langinfo(CODESET). If that name is not accepted by
 * output_charset_is_supported(), a few known spellings are mapped to the
 * names used here, and anything containing "8859" followed (optionally
 * after one separator character) by one or two digits is turned into
 * "ISO-8859-N".
 *
 * Returns NULL if the result still is not a supported charset, leaving
 * the caller to decide what to do. The returned string is either owned
 * by nl_langinfo(), a string constant, or a static buffer in this
 * function, so it must not be freed.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;
    char *p;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
	if(!strcmp("ANSI_X3.4-1968", ret)
	   || !strcmp("646", ret)
	   || !strcmp("ASCII", ret)
	   || !strcmp("C", ret)
	   || !strcmp("POSIX", ret))
	  ret = "US-ASCII";
	else if(!strucmp(ret, "UTF8"))
	  ret = "UTF-8";
	else if(!strucmp(ret, "EUCJP"))
	  ret = "EUC-JP";
	/* NOTE(review): possibly meant EUCKR / EUC-KR -- confirm */
	else if(!strucmp(ret, "EUCKP"))
	  ret = "EUC-KP";
	else if(!strucmp(ret, "SJIS"))
	  ret = "SHIFT-JIS";
	else if((p = strstr(ret, "8859")) != NULL){

	    /* check for digits after 8859 */
	    p += 4;

	    /*
	     * Skip a single separator, but never step over the
	     * terminating null when the name ends right after "8859".
	     */
	    if(*p && !isdigit((unsigned char) *p))
	      p++;

	    /* ctype functions need an unsigned char value */
	    if(isdigit((unsigned char) *p)){
		static char buf[12];

		memset(buf, 0, sizeof(buf));
		strncpy(buf, "ISO-8859-", sizeof(buf));
		buf[9] = *p++;
		if(isdigit((unsigned char) *p))
		  buf[10] = *p;

		ret = buf;
	    }
	}
    }

    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2188 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2189 * needed the return value will point to orig. If a conversion is done,
2190 * the return string should be freed by the caller.
2191 * If not possible, returns NULL.
2193 char *
2194 utf8_to_charset(char *orig, char *charset, int report_err)
2196 SIZEDTEXT src, dst;
2197 char *ret = orig;
2199 if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2200 return ret;
2202 src.size = strlen(orig);
2203 src.data = (unsigned char *) orig;
2205 if(!strucmp(charset, "us-ascii")){
2206 size_t i;
2208 for(i = 0; i < src.size; i++)
2209 if(src.data[i] & 0x80)
2210 return NULL;
2212 return ret;
2216 * This works for ISO-2022-JP because of special code in utf8_cstext
2217 * but not for other 2022 charsets.
2219 memset(&dst, 0, sizeof(dst));
2220 if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2221 ret = (char *) dst.data; /* c-client already null terminates it */
2222 else
2223 ret = NULL;
2225 if((unsigned char *) ret != dst.data && dst.data)
2226 fs_give((void **) &dst.data);
2228 return ret;
/*
 * Turn a number into a string with a comma between each group of
 * three digits.
 *
 * Args: number -- The long to be turned into a string.
 *
 * Negative numbers get a single leading '-' and numbers of any magnitude
 * a long can hold are grouped all the way through.
 *
 * Result: pointer to static string representing number with commas
 *         Can use up to 3 comatose results at once, the fourth call
 *         reuses the buffer of the first.
 */
char *
comatose(long int number)
{
    static char buf[3][50];
    static int  whichbuf = 0;
    char        digits[32];
    char       *b, *end;
    unsigned long magnitude;
    int         ndigits, i;

    whichbuf = (whichbuf + 1) % 3;
    b   = buf[whichbuf];
    end = b + sizeof(buf[0]) - 1;	/* keep one byte for the null */

    /* negate in unsigned arithmetic so the most negative long works too */
    magnitude = (unsigned long) number;
    if(number < 0){
	magnitude = 0UL - magnitude;
	*b++ = '-';
    }

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0)
      ndigits = 0;
    else if(ndigits >= (int) sizeof(digits))
      ndigits = (int) sizeof(digits) - 1;

    for(i = 0; i < ndigits && b < end; i++){
	/* a comma goes wherever a multiple of three digits remains */
	if(i > 0 && (ndigits - i) % 3 == 0)
	  *b++ = ',';

	if(b < end)
	  *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
/*
 * Like comatose() but leave out the commas: format number in decimal.
 *
 * Result: pointer to one of three static buffers used in rotation, so
 *         up to 3 results can be in use at once.
 */
char *
tose(long int number)
{
    static char buf[3][50];
    static int whichbuf = 0;
    char *slot;

    whichbuf = (whichbuf + 1) % 3;
    slot = buf[whichbuf];

    snprintf(slot, sizeof(buf[0]), "%ld", number);

    return(slot);
}
2294 * line_paint - where the real work of managing what is displayed gets done.
2296 void
2297 line_paint(int offset, /* current dot offset into vl */
2298 struct display_line *displ,
2299 int *passwd) /* flag to hide display of chars */
2301 int i, w, w2, already_got_one = 0;
2302 int vfirst, vlast, dfirst, dlast, vi, di;
2303 int new_vbase;
2304 unsigned (*width_a_to_b)(UCS *, int, int);
2307 * Set passwd to 10 in caller if you want to conceal the
2308 * password but not print asterisks for feedback.
2310 * Set passwd to 1 in caller to conceal by printing asterisks.
2312 if(passwd && *passwd >= 10){ /* don't show asterisks */
2313 if(*passwd > 10)
2314 return;
2315 else
2316 *passwd = 11; /* only blat once */
2318 i = 0;
2319 (*displ->movecursor)(displ->row, displ->col);
2320 while(i++ <= displ->dwid)
2321 (*displ->writechar)(' ');
2323 (*displ->movecursor)(displ->row, displ->col);
2324 return;
2327 if(passwd && *passwd)
2328 width_a_to_b = single_width_chars_a_to_b;
2329 else
2330 width_a_to_b = ucs4_str_width_a_to_b;
2333 * vl is the virtual line (the actual data). We operate on it by typing
2334 * characters to be added and deleting and so forth. In this routine we
2335 * copy a subset of those UCS-4 characters in vl into dl, the display
2336 * array, and show that subset on the screen.
2338 * Offset is the location of the cursor in vl.
2340 * We will display the string starting from vbase.
2341 * We have dwid screen cells to work in.
2342 * We may have to adjust vbase in order to display the
2343 * part of the string that contains the cursor.
2345 * We'll make the display look like
2346 * vl a b c d e f g h i j k l m
2347 * xxxxxxxxxxxxx <- width dwid window
2348 * < d e f g h >
2350 * vbase
2351 * The < will be there if vbase > 0.
2352 * The > will be there if the string from vbase to the
2353 * end can't all fit in the window.
2356 memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2359 * Adjust vbase so offset is not out of the window to the right.
2360 * (The +2 in w + 2 is for a possible " >" if the string goes past
2361 * the right hand edge of the window and if the last visible character
2362 * is double wide. We don't want the offset to be under that > character.)
2364 for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2365 w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2366 w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2368 * offset is off the window to the right
2369 * It looks like a b c d e f g h
2370 * | |
2371 * vbase offset
2372 * and offset is either past the right edge,
2373 * or right at the right edge (and maybe under >),
2374 * or one before right at the edge (and maybe on space
2375 * for half a character).
2377 * Since the characters may be double width it is slightly
2378 * complicated to figure out how far to increase vbase.
2379 * We're going to scoot over past width w/2 characters and
2380 * then see if that's sufficient.
2382 new_vbase = displ->vbase + 1;
2383 for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2384 w2 < displ->dwid/2;
2385 w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2386 new_vbase++;
2388 displ->vbase = new_vbase;
2391 /* adjust so offset is not out of the window to the left */
2392 while(displ->vbase > 0 && displ->vbase >= offset){
2393 /* add about dwid/2 more width */
2394 new_vbase = displ->vbase - 1;
2395 for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2396 w2 < (displ->dwid+1)/2 && new_vbase > 0;
2397 w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2398 new_vbase--;
2400 /* but don't let it get too small, recheck off right end */
2401 for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2402 w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2403 w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2404 new_vbase++;
2406 displ->vbase = MAX(new_vbase, 0);
2409 if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2410 displ->vbase = 0;
2412 vfirst = displ->vbase;
2413 dfirst = 0;
2414 if(displ->vbase > 0){ /* off screen cue left */
2415 dfirst = 1; /* index which matches vfirst */
2416 displ->dl[0] = '<';
2419 vlast = displ->vused-1; /* end */
2420 w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2422 if(w + dfirst > displ->dwid){ /* off window right */
2424 /* find last ucs character to be printed */
2425 while(w + dfirst > displ->dwid - 1) /* -1 for > */
2426 w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2428 /* worry about double-width characters */
2429 if(w + dfirst == displ->dwid - 1){ /* no prob, hit it exactly */
2430 dlast = dfirst + vlast - vfirst + 1; /* +1 for > */
2431 displ->dl[dlast] = '>';
2433 else{
2434 dlast = dfirst + vlast - vfirst + 1;
2435 displ->dl[dlast++] = ' ';
2436 displ->dl[dlast] = '>';
2439 else
2440 dlast = dfirst + vlast - vfirst;
2443 * Copy the relevant part of the virtual line into the display line.
2445 for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2446 if(passwd && *passwd)
2447 displ->dl[di] = '*'; /* to conceal password */
2448 else
2449 displ->dl[di] = displ->vl[vi];
2452 * Add spaces to clear the rest of the line.
2453 * We have dwid total space to fill.
2455 w = (*width_a_to_b)(displ->dl, 0, dlast); /* width through dlast */
2456 for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2457 displ->dl[di++] = ' ';
2460 * Draw from left to right, skipping until we get to
2461 * something that is different. Characters may be different
2462 * widths than they were initially so paint from there the
2463 * rest of the way.
2465 for(di = 0; displ->dl[di]; di++){
2466 if(already_got_one || displ->dl[di] != displ->olddl[di]){
2467 /* move cursor first time */
2468 if(!already_got_one++){
2469 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2470 (*displ->movecursor)(displ->row, displ->col + w);
2473 (*displ->writechar)(displ->dl[di]);
2474 displ->olddl[di] = displ->dl[di];
2478 memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2481 * Move the cursor to the offset.
2483 * The offset is relative to the start of the virtual array. We need
2484 * to find the location on the screen. The offset into the display array
2485 * will be offset-vbase+dfirst. We want to be at the start of that
2486 * character, so we need to find the width of all the characters up
2487 * to that point.
2489 w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2491 (*displ->movecursor)(displ->row, displ->col + w);
2496 * This is just like ucs4_str_width_a_to_b() except all of the characters
2497 * are assumed to be of width 1. This is for printing out *'s when user
2498 * enters a password, while still managing to use the same code to do the
2499 * display.
2501 unsigned
2502 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2504 unsigned width = 0;
2505 int i;
2507 if(ucsstr)
2508 for(i = a; i <= b && ucsstr[i]; i++)
2509 width++;
2511 return width;