Add support for tab-completion when selecting by rule
[alpine.git] / pith / charconv / utf8.c
blob6a1034fda126a4424e1cc5b86c41ad17caa6ee03
1 /*
2 * ========================================================================
3 * Copyright 2013-2022 Eduardo Chappa
4 * Copyright 2006-2008 University of Washington
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * ========================================================================
16 /* includable WITHOUT dependency on c-client */
17 #include "../../c-client/mail.h"
18 #include "../../c-client/utf8.h"
20 #ifdef _WINDOWS
21 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
22 #undef ERROR
23 #else
24 #define _XOPEN_SOURCE
25 #endif
27 #include <system.h>
29 #include "../../c-client/fs.h"
31 /* includable WITHOUT dependency on pico */
32 #include "../../pico/keydefs.h"
34 #include "../osdep/collate.h"
35 #include "../filttype.h"
37 #include "utf8.h"
39 #include <stdarg.h>
/* Prototype only; no definition appears in the visible part of this file. */
42 unsigned single_width_chars_a_to_b(UCS *, int, int);
/* Charmap name stored by set_locale_charmap() and tried as a source charset by convert_to_utf8(). */
45 static char locale_charmap[50];
/* Set by init_utf8_display(); when non-zero, wcellwidth() and wtomb() treat the display as UTF-8 capable. */
47 static int native_utf8;
/* Set by init_utf8_display(); wtomb() casts it to (unsigned short *) for ucs4_rmaplen()/ucs4_rmapbuf(). */
48 static void *display_data;
50 void
51 init_utf8_display(int utf8, void *rmap)
53 native_utf8 = utf8;
54 display_data = rmap;
59 * Argument is a UCS-4 wide character.
60 * Returns the environment dependent cell width of the
61 * character when printed to the screen.
62 * This will be -1 if the character is not printable.
63 * It will be >= zero if it is printable.
65 * Note that in the case it is not printable but it is still sent to
66 * Writechar, Writechar will print a '?' with width 1.
68 int
69 wcellwidth(UCS ucs)
71 char dummy[32];
72 long w;
75 * We believe that on modern unix systems wchar_t is a UCS-4 character.
76 * That's the assumption here.
79 if(native_utf8){ /* display is UTF-8 capable */
80 w = ucs4_width((unsigned long) ucs);
81 return((w & U4W_ERROR) ? -1 : w);
83 else if(display_data){
84 if(wtomb(dummy, ucs) < 0)
85 return(-1);
86 else{
87 w = ucs4_width((unsigned long) ucs);
88 return((w & U4W_ERROR) ? -1 : w);
91 #if !defined(_WINDOWS) && HAVE_WCWIDTH
92 else
93 return(wcwidth((wchar_t) ucs));
94 #else
95 return(0);
96 #endif
99 /* ambiguous width zone character function. We use the Windows code until
100 * we find a better way to do it in general.
103 pith_ucs4width(UCS ucs)
105 return (ucs >= 0x2100) ? 2 : 1;
106 #if !defined(_WINDOWS) && HAVE_WCWIDTH
107 return wcwidth((wchar_t) ucs);
108 #else
109 return (ucs >= 0x2100) ? 2 : 1;
110 #endif /* _WINDOWS */
114 * Argument is a UCS-4 wide character.
115 * It is converted to the multibyte version (for example UTF8 or EUC-JP).
116 * Dest is a buffer at least xx chars wide where the multi-byte version
117 * of the wide character will be written.
118 * The returned value is the number of bytes written to dest or -1
119 * if the conversion can't be done.
122 wtomb(char *dest, UCS ucs)
124 int rv;
126 * We believe that on modern unix systems wchar_t is a UCS-4 character.
127 * That's the assumption here.
130 if(native_utf8){
131 unsigned char *newdptr;
133 newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
134 return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
136 else if(display_data){
137 unsigned long ucs4;
138 int ret;
140 ucs4 = (unsigned long) ucs;
141 ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
142 if(ret >= 0)
143 ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
144 else
145 ret = -1;
147 return(ret);
149 else
150 #if defined(HAVE_WCRTOMB)
151 rv = wcrtomb(dest, (wchar_t) ucs, NULL);
152 #elif defined(HAVE_WCTOMB)
153 rv = wctomb(dest, (wchar_t) ucs);
154 #else
155 rv = -1;
156 #endif
157 return rv;
162 * This function does not necessarily update inputp and remaining_octets, so
163 * don't rely on that. The c-client version does but the other doesn't.
166 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
168 UCS ucs;
170 if(input_cs){
171 CHARSET *cast_input_cs;
173 cast_input_cs = (CHARSET *) input_cs;
175 switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
176 case U8G_ENDSTRG:
177 case U8G_ENDSTRI:
178 return(CCONV_NEEDMORE);
180 default:
181 if(ucs & U8G_ERROR || ucs == UBOGON)
182 return(CCONV_BADCHAR);
184 return(ucs);
187 else{
188 size_t ret;
189 wchar_t w;
192 * Warning: input_cs and remaining_octets are unused in this
193 * half of the if/else.
195 * Unfortunately, we can't tell the difference between a source string
196 * that is just not long enough and one that has characters that can't
197 * be converted even though it is long enough. We return NEEDMORE in both cases.
199 ret = mbstowcs(&w, (char *) (*inputp), 1);
200 if(ret == (size_t)(-1))
201 return(CCONV_NEEDMORE);
202 else{
203 ucs = (UCS) w;
204 return(ucs);
210 void
211 set_locale_charmap(char *charmap)
213 if(charmap){
214 strncpy(locale_charmap, charmap, sizeof(locale_charmap));
215 locale_charmap[sizeof(locale_charmap)-1] = '\0';
217 else
218 locale_charmap[0] = '\0';
223 * This ensures that the string is UTF-8. If str is already a UTF-8 string,
224 * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
225 * The caller is responsible for freeing the returned value.
227 * Args str -- the string to convert
229 char *
230 convert_to_utf8(char *str, char *fromcharset, int flags)
232 char *ret = NULL;
233 char *fcharset;
234 SIZEDTEXT src, result;
235 const CHARSET *cs = NULL;
236 int try;
238 src.data = (unsigned char *) str;
239 src.size = strlen(str);
241 /* already UTF-8, return NULL */
242 if(!(flags & CU8_NOINFER)
243 && (cs = utf8_infercharset(&src))
244 && (cs->type == CT_ASCII || cs->type == CT_UTF8))
245 return(ret);
247 try = 1;
248 while(try < 5){
249 switch(try){
250 case 1:
251 fcharset = fromcharset;
252 if(fcharset && strucmp("UTF-8", fcharset) != 0)
253 break; /* give it a try */
254 else
255 try++; /* fall through */
257 case 2:
258 if(!(flags & CU8_NOINFER)){
259 fcharset = cs ? cs->name : NULL;
260 if(fcharset && strucmp("UTF-8", fcharset) != 0)
261 break;
262 else
263 try++; /* fall through */
265 else
266 try++; /* fall through */
268 case 3:
269 fcharset = locale_charmap;
270 if(fcharset && strucmp("UTF-8", fcharset) != 0)
271 break;
272 else
273 try++; /* fall through */
275 default:
276 fcharset = "ISO-8859-1"; /* this will "work" */
277 break;
280 memset(&result, 0, sizeof(result));
282 if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
283 if(!(result.size == src.size && result.data == src.data)){
284 ret = (char *) fs_get((result.size+1) * sizeof(char));
285 strncpy(ret, (char *) result.data, result.size);
286 ret[result.size] = '\0';
288 /* else no conversion necessary */
290 if(result.data && result.data != src.data)
291 fs_give((void **) &result.data);
292 result.size = 0;
294 return(ret);
297 try++;
300 /* won't make it to here */
301 return(ret);
306 * Convert from UTF-8 to user's locale charset.
307 * This actually uses the wtomb routine to do the conversion, and that
308 * relies on setup_for_input_output having been called.
309 * If no conversion is necessary, NULL is returned, otherwise an allocated
310 * string in the locale charset is returned and the caller is responsible
311 * for freeing it.
313 char *
314 convert_to_locale(char *utf8str)
316 #define CHNK 500
317 char *inp, *ret = NULL;
318 CBUF_S cb;
319 int alloced;
320 size_t i = 0;
322 if(native_utf8 || !utf8str || !utf8str[0])
323 return(NULL);
325 cb.cbuf[0] = '\0';
326 cb.cbufp = cb.cbufend = cb.cbuf;
327 inp = utf8str;
329 alloced = CHNK;
330 ret = (char *) fs_get(alloced * sizeof(char));
333 * There's gotta be a better way to do this but utf8_to_locale was
334 * available and everything looks like a nail when all you have
335 * is a hammer.
337 while(*inp){
339 * We're placing the outgoing stream of characters in ret, a multi-byte
340 * array of characters in the user's locale charset. See if there is
341 * enough room for the next wide characters worth of output chars
342 * and allocate more space if not.
344 if((alloced - i) < MAX(MB_LEN_MAX,32)){
345 alloced += CHNK;
346 fs_resize((void **) &ret, alloced * sizeof(char));
349 i += utf8_to_locale((int) *inp++, &cb,
350 (unsigned char *) &ret[i], alloced - i);
353 fs_resize((void **) &ret, i + 1);
355 ret[i] = '\0';
357 return(ret);
362 * Pass in a stream of UTF-8 characters in 'c' and return obuf
363 * filled in with multi-byte characters. The return value is the
364 * number of valid characters in obuf to be used.
367 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
369 int outchars = 0;
371 if(!(cb && cb->cbufp))
372 return(0);
374 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
375 unsigned char *inputp;
376 unsigned long remaining_octets;
377 UCS ucs;
379 *(cb->cbufp)++ = (unsigned char) c;
380 inputp = cb->cbuf;
381 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
382 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
384 switch(ucs){
385 case U8G_ENDSTRG: /* incomplete character, wait */
386 case U8G_ENDSTRI: /* incomplete character, wait */
387 break;
389 default:
390 if(ucs & U8G_ERROR || ucs == UBOGON){
392 * None of these cases is supposed to happen. If it
393 * does happen then the input stream isn't UTF-8
394 * so something is wrong. Treat each character in the
395 * input buffer as a separate error character and
396 * print a '?' for each.
398 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
399 obuf[outchars++] = '?';
401 cb->cbufp = cb->cbuf;
403 else{
404 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
406 * This happens when we have a UTF-8 character that
407 * we aren't able to print in our locale. For example,
408 * if the locale is setup with the terminal
409 * expecting ISO-8859-1 characters then there are
410 * lots of UTF-8 characters that can't be printed.
411 * Print a '?' instead.
413 obuf[outchars++] = '?';
415 else{
417 * Convert the ucs into the multibyte
418 * character that corresponds to the
419 * ucs in the users locale.
421 outchars = wtomb((char *) obuf, ucs);
422 if(outchars < 0){
423 obuf[0] = '?';
424 outchars = 1;
428 /* update the input buffer */
429 if(inputp >= cb->cbufp) /* this should be the case */
430 cb->cbufp = cb->cbuf;
431 else{ /* extra chars for some reason? */
432 unsigned char *q, *newcbufp;
434 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
435 q = cb->cbuf;
436 while(inputp < cb->cbufp)
437 *q++ = *inputp++;
439 cb->cbufp = newcbufp;
443 break;
446 else{ /* error */
447 obuf[0] = '?';
448 outchars = 1;
449 cb->cbufp = cb->cbuf; /* start over */
452 return(outchars);
457 * Returns the screen cells width of the UCS-4 string argument.
458 * The source string is zero terminated.
460 unsigned
461 ucs4_str_width(UCS *ucsstr)
463 unsigned width = 0;
464 int w;
466 if(ucsstr)
467 while(*ucsstr){
468 w = wcellwidth(*ucsstr++);
469 if(w != U4W_CTLSRGT)
470 width += (w < 0 ? 1 : w);
473 return width;
478 * Returns the screen cells width of the UCS-4 string argument
479 * from ucsstr[a] through (inclusive) ucsstr[b].
480 * No checking is done to make sure a starts in the middle
481 * of a UCS-4 array.
483 unsigned
484 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
486 unsigned width = 0;
487 int i, w;
489 if(ucsstr)
490 for(i = a; i <= b && ucsstr[i]; i++){
491 w = wcellwidth(ucsstr[i]);
492 if(w != U4W_CTLSRGT)
493 width += (w < 0 ? 1 : w);
496 return width;
501 * Returns the screen cells width of the UCS-4 string argument
502 * from ustart through (exclusive) uend.
503 * No checking is done to make sure it starts in the middle
504 * of a UCS-4 array.
506 unsigned
507 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
509 UCS *u;
510 unsigned width = 0;
511 int w;
513 if(!ustart)
514 return width;
516 if(ustart)
517 for(u = ustart; u < uend; u++){
518 w = wcellwidth(*u);
519 if(w != U4W_CTLSRGT)
520 width += (w < 0 ? 1 : w);
523 return(width);
528 * Return the largest possible pointer into ucs4str so that the width
529 * of the string from ucs4str to the pointer (exclusive)
530 * is maxwidth or less. Also stops at a null character.
532 UCS *
533 ucs4_particular_width(UCS *ucs4str, int maxwidth)
535 UCS *u;
536 int w_consumed = 0, w, done = 0;
538 u = ucs4str;
540 if(u)
541 while(!done && *u && w_consumed <= maxwidth){
542 w = wcellwidth(*u);
543 w = (w >= 0 ? w : 1);
544 if(w_consumed + w <= maxwidth){
545 w_consumed += w;
546 ++u;
548 else
549 ++done;
552 return(u);
557 * Convert and copy a UTF-8 string into a UCS-4 NULL
558 * terminated array. Just like cpystr only it converts
559 * from UTF-8 to UCS-4.
561 * Returned UCS-4 string needs to be freed by caller.
563 UCS *
564 utf8_to_ucs4_cpystr(char *utf8src)
566 size_t retsize;
567 UCS *ret = NULL;
568 UCS ucs;
569 unsigned long remaining_octets;
570 unsigned char *readptr;
571 size_t arrayindex;
574 * We don't know how big to allocate the return array
575 * because variable numbers of octets in the src array
576 * will combine to make UCS-4 characters. The number of
577 * UCS-4 characters is less than or equal to the number
578 * of src characters, though.
581 if(!utf8src)
582 return NULL;
584 retsize = strlen(utf8src) + 1;
586 ret = (UCS *) fs_get(retsize * sizeof(*ret));
587 memset(ret, 0, retsize * sizeof(*ret));
589 readptr = (unsigned char *) utf8src;
590 remaining_octets = retsize-1;
591 arrayindex = 0;
593 while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
594 ucs = (UCS) utf8_get(&readptr, &remaining_octets);
596 if(ucs & U8G_ERROR || ucs == UBOGON)
597 remaining_octets = 0;
598 else
599 ret[arrayindex++] = ucs;
602 ret[arrayindex] = '\0';
604 /* get rid of excess size */
605 if(arrayindex+1 < retsize)
606 fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
608 return ret;
613 * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
614 * terminated string. Just like cpystr only it converts
615 * from UCS-4 to UTF-8.
617 * Returned UTF-8 string needs to be freed by caller.
619 char *
620 ucs4_to_utf8_cpystr(UCS *ucs4src)
622 unsigned char *ret = NULL;
623 unsigned char *writeptr;
624 int i;
626 if(!ucs4src)
627 return NULL;
630 * Over-allocate and then resize at the end.
633 /* count characters in source */
634 for(i = 0; ucs4src[i]; i++)
637 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
638 memset(ret, 0, (6*i + 1) * sizeof(*ret));
640 writeptr = ret;
641 for(i = 0; ucs4src[i]; i++)
642 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
644 /* get rid of excess size */
645 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
647 return ((char *) ret);
652 * Similar to above but copy a fixed number of source
653 * characters instead of going until null terminator.
655 char *
656 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
658 unsigned char *ret = NULL;
659 unsigned char *writeptr;
660 int i;
662 if(!ucs4src)
663 return NULL;
666 * Over-allocate and then resize at the end.
669 ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
670 memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
672 writeptr = ret;
673 for(i = 0; i < ucs4src_len; i++)
674 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
676 /* get rid of excess size */
677 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
679 return ((char *) ret);
683 * Similar to above but copy what is possible to a
684 * string of a size at most the given retlen.
686 char *
687 ucs4_to_utf8_n_cpystr(UCS *ucs4src, int retlen)
689 unsigned char *ret = NULL;
690 unsigned char *writeptr;
691 int i, oldlen, len;
693 if(!ucs4src)
694 return NULL;
697 * Over-allocate and then resize at the end.
700 /* count characters in source */
701 for(i = 0; ucs4src[i]; i++)
704 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(unsigned char));
705 memset(ret, 0, (6*i + 1) * sizeof(unsigned char));
707 writeptr = ret;
708 oldlen = len = 0;
709 for(i = 0; ucs4src[i] && (len < retlen); i++){
710 oldlen = len;
711 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
712 len = strlen((char *)ret);
714 if(len > retlen){
715 ret[oldlen] = '\0';
716 len = oldlen;
719 /* get rid of excess size */
720 fs_resize((void **) &ret, (len + 1) * sizeof(unsigned char));
722 return ((char *) ret);
726 #ifdef _WINDOWS
728 * Convert a UTF-8 argument into an LPTSTR version
729 * of that argument. The result is allocated here
730 * and should be freed by the caller.
732 LPTSTR
733 utf8_to_lptstr(LPSTR arg_utf8)
735 int lptstr_len;
736 LPTSTR lptstr_ret = NULL;
738 lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
739 if(lptstr_len > 0)
741 lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
742 lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
743 arg_utf8, -1, lptstr_ret, lptstr_len );
746 if(!lptstr_len)
748 /* check GetLastError()? */
749 lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
750 lptstr_ret[0] = 0;
753 return lptstr_ret;
758 * Convert an LPTSTR argument into a UTF-8 version
759 * of that argument. The result is allocated here
760 * and should be freed by the caller.
762 LPSTR
763 lptstr_to_utf8(LPTSTR arg_lptstr)
765 int utf8str_len;
766 LPSTR utf8str_ret = NULL;
768 utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
769 if(utf8str_len > 0)
771 utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
772 utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
773 arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
776 if(!utf8str_len)
778 /* check GetLastError()? */
779 utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
780 utf8str_ret[0] = 0;
783 return utf8str_ret;
788 * Convert a UCS4 argument into an LPTSTR version
789 * of that argument. The result is allocated here
790 * and should be freed by the caller.
792 LPTSTR
793 ucs4_to_lptstr(UCS *arg_ucs4)
795 LPTSTR ret_lptstr = NULL;
796 size_t len;
797 size_t i;
799 if(arg_ucs4){
800 len = ucs4_strlen(arg_ucs4);
801 ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
802 /* bogus conversion ignores UTF-16 */
803 for(i = 0; i < len; i++)
804 ret_lptstr[i] = arg_ucs4[i];
806 ret_lptstr[len] = '\0';
809 return(ret_lptstr);
814 * Convert an LPTSTR argument into a UCS4 version
815 * of that argument. The result is MemAlloc'd here
816 * and should be freed by the caller.
818 UCS *
819 lptstr_to_ucs4(LPTSTR arg_lptstr)
821 UCS *ret_ucs4 = NULL;
822 size_t len;
823 size_t i;
825 if(arg_lptstr){
826 len = _tcslen(arg_lptstr);
827 ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
828 /* bogus conversion ignores UTF-16 */
829 for(i = 0; i < len; i++)
830 ret_ucs4[i] = arg_lptstr[i];
832 ret_ucs4[len] = '\0';
835 return(ret_ucs4);
838 #endif /* _WINDOWS */
842 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
843 * 1-at-a-time filled in with UCS characters. The return value is the
844 * number of valid characters in obuf to be used. It can only
845 * be 1 or 0 characters since we're only getting one UTF-8 character
846 * at a time.
849 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
851 int width = 0, outchars = 0;
853 if(!(cb && cb->cbufp))
854 return(0);
856 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
857 unsigned char *inputp;
858 unsigned long remaining_octets;
859 UCS ucs;
861 *cb->cbufp++ = (unsigned char) c;
862 inputp = cb->cbuf;
863 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
864 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
866 switch(ucs){
867 case U8G_ENDSTRG: /* incomplete character, wait */
868 case U8G_ENDSTRI: /* incomplete character, wait */
869 break;
871 default:
872 if(ucs & U8G_ERROR || ucs == UBOGON){
874 * None of these cases is supposed to happen. If it
875 * does happen then the input stream isn't UTF-8
876 * so something is wrong.
878 outchars++;
879 *obuf = '?';
880 cb->cbufp = cb->cbuf;
881 width = 1;
883 else{
884 outchars++;
885 if(ucs < 0x80 && ucs >= 0x20)
886 width = 1;
888 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
890 * This happens when we have a UTF-8 character that
891 * we aren't able to print in our locale. For example,
892 * if the locale is setup with the terminal
893 * expecting ISO-8859-1 characters then there are
894 * lots of UTF-8 characters that can't be printed.
895 * Print a '?' instead.
896 * Don't think this should happen in Windows.
898 *obuf = '?';
900 else{
901 *obuf = ucs;
904 /* update the input buffer */
905 if(inputp >= cb->cbufp) /* this should be the case */
906 cb->cbufp = cb->cbuf;
907 else{ /* extra chars for some reason? */
908 unsigned char *q, *newcbufp;
910 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
911 q = cb->cbuf;
912 while(inputp < cb->cbufp)
913 *q++ = *inputp++;
915 cb->cbufp = newcbufp;
919 break;
922 else{ /* error */
923 *obuf = '?';
924 outchars = 1;
925 width = 1;
926 cb->cbufp = cb->cbuf; /* start over */
929 if(obufwidth)
930 *obufwidth = width;
932 return(outchars);
937 * Return an allocated copy of a zero-terminated UCS-4 string.
939 UCS *
940 ucs4_cpystr(UCS *ucs4src)
942 size_t arraysize;
943 UCS *ret = NULL;
944 size_t i;
946 if(!ucs4src)
947 return NULL;
949 arraysize = ucs4_strlen(ucs4src);
951 ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
952 memset(ret, 0, (arraysize+1) * sizeof(*ret));
954 for(i = 0; i < arraysize; i++)
955 ret[i] = ucs4src[i];
957 return ret;
961 UCS *
962 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
964 size_t i;
966 if(ucs4src && ucs4dst){
967 for(i = 0; i < n; i++){
968 ucs4dst[i] = ucs4src[i];
969 if(ucs4dst[i] == '\0')
970 break;
974 return ucs4dst;
978 UCS *
979 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
981 size_t i;
982 UCS *u;
984 if(ucs4src && ucs4dst){
985 for(u = ucs4dst; *u; u++)
988 for(i = 0; i < n; i++){
989 u[i] = ucs4src[i];
990 if(u[i] == '\0')
991 break;
994 if(i == n)
995 u[i] = '\0';
998 return ucs4dst;
1003 * Like strlen only this returns the number of non-zero characters
1004 * in a zero-terminated UCS-4 array.
1006 size_t
1007 ucs4_strlen(UCS *ucs4str)
1009 size_t i = 0;
1011 if(ucs4str)
1012 while(ucs4str[i])
1013 i++;
1015 return(i);
1020 ucs4_strcmp(UCS *s1, UCS *s2)
1022 for(; *s1 == *s2; s1++, s2++)
1023 if(*s1 == '\0')
1024 return 0;
1026 return((*s1 < *s2) ? -1 : 1);
1030 UCS *
1031 ucs4_strchr(UCS *s, UCS c)
1033 if(!s)
1034 return NULL;
1036 while(*s && *s != c)
1037 s++;
1039 if(*s || !c)
1040 return s;
1041 else
1042 return NULL;
1046 UCS *
1047 ucs4_strrchr(UCS *s, UCS c)
1049 UCS *ret = NULL;
1051 if(!s)
1052 return ret;
1054 while(*s){
1055 if(*s == c)
1056 ret = s;
1058 s++;
1061 return ret;
1066 * Returns the screen cells width of the UTF-8 string argument.
1068 unsigned
1069 utf8_width(char *str)
1071 unsigned width = 0;
1072 int this_width;
1073 UCS ucs;
1074 unsigned long remaining_octets;
1075 char *readptr;
1077 if(!(str && *str))
1078 return(width);
1080 readptr = str;
1081 remaining_octets = readptr ? strlen(readptr) : 0;
1083 while(remaining_octets > 0 && *readptr){
1085 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1087 if(ucs & U8G_ERROR || ucs == UBOGON){
1089 * This should not happen, but do something to handle it anyway.
1090 * Treat each character as a single width character, which is what should
1091 * probably happen when we actually go to write it out.
1093 remaining_octets--;
1094 readptr++;
1095 this_width = 1;
1097 else{
1098 this_width = wcellwidth(ucs);
1101 * If this_width is -1 that means we can't print this character
1102 * with our current locale. Writechar will print a '?'.
1104 if(this_width < 0)
1105 this_width = 1;
1108 width += (unsigned) this_width;
1111 return(width);
1116 * Copy UTF-8 characters from src into dst.
1117 * This is intended to be used if you want to truncate a string at
1118 * the start instead of the end. For example, you have a long string
1119 * like
1120 * this_is_a_long_string
1121 * but not enough space to fit it into a particular field. You want to
1122 * end up with
1123 * s_a_long_string
1124 * where that fits in a particular width. Perhaps you'd use this with ...
1125 * to get
1126 * ...s_a_long_string
1127 * This right adjusts the end of the string in the width space and
1128 * cuts it off at the start. If there is enough width for the whole
1129 * string it will copy the string into dst with no padding.
1131 * Copy enough characters so that the result will have screen width of
1132 * want_width screen cells in current locale.
1134 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1135 * to dst. This is just for protection, it shouldn't be relied on to
1136 * do anything useful. Dstlen should be large enough. Otherwise you'll get
1137 * characters truncated in the middle or something like that.
1139 * Returned value is the number of bytes written to dst, not including
1140 * the possible terminating null.
1142 * If we can't hit want_width exactly because of double width characters
1143 * then we will pad the end of the string with space in order to make
1144 * the width exact.
1146 size_t
1147 utf8_to_width_rhs(char *dst, /* destination buffer */
1148 char *src, /* source string */
1149 size_t dstlen, /* space in dest */
1150 unsigned want_width) /* desired screen width */
1152 int this_width;
1153 unsigned width_consumed = 0;
1154 UCS ucs;
1155 unsigned long remaining_octets;
1156 char *readptr, *goodreadptr, *savereadptr, *endptr;
1157 size_t nb = 0;
1159 if(!src){
1160 if(dstlen > 0)
1161 dst[0] = '\0';
1163 return nb;
1167 * Start at the end of the source string and go backwards until we
1168 * get to the desired width, but not more than the width.
1170 readptr = src + strlen(src);
1171 endptr = readptr;
1172 goodreadptr = readptr;
1173 width_consumed = 0;
1174 savereadptr = readptr;
1176 for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1177 readptr = savereadptr-1){
1179 savereadptr = readptr;
1180 remaining_octets = goodreadptr - readptr;
1181 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1184 * Handling the error case is tough because an error will be the normal thing that
1185 * happens as we back through the string. So we're just going to punt on the
1186 * error for now.
1188 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1189 if(remaining_octets > 0){
1191 * This means there are some bad octets after this good
1192 * character so things are not going to work out well.
1193 * Bail out.
1195 savereadptr = src; /* we're done */
1197 else{
1198 this_width = wcellwidth(ucs);
1200 if(this_width < 0)
1201 this_width = 1;
1203 if(width_consumed + (unsigned) this_width <= want_width){ /* ok */
1204 width_consumed += (unsigned) this_width;
1205 goodreadptr = savereadptr;
1207 else
1208 savereadptr = src; /* we're done */
1214 * Copy characters from goodreadptr to endptr into dst.
1216 nb = MIN(endptr-goodreadptr, dstlen-1);
1217 strncpy(dst, goodreadptr, nb);
1218 dst[nb] = '\0';
1221 * Pad out with spaces in order to hit width exactly.
1223 while(width_consumed < want_width && nb < dstlen-1){
1224 dst[nb++] = ' ';
1225 dst[nb] = '\0';
1226 width_consumed++;
1229 return nb;
1234 * The arguments being converted are UTF-8 strings.
1235 * This routine attempts to make it possible to use screen cell
1236 * widths in a format specifier. In a one-byte per screen cell
1237 * world we might have used %10.10s to cause a string to occupy
1238 * 10 screen positions. Since the width and precision are really
1239 * referring to numbers of bytes instead of screen positions that
1240 * won't work with UTF-8 input. We emulate that behavior with
1241 * the format string %w. %m.nw means to use the m and n as
1242 * screen width indicators instead of bytes indicators.
1244 * There is no reason to use this routine unless you want to use
1245 * min field width or precision with the specifier. A plain %w without
1246 * widths is equivalent exactly to a plain %s in a regular printf.
1248 * Double-width characters complicate things. It may not be possible
1249 * to satisfy the request exactly. For example, %3w for an input
1250 * string that is made up of two double-width characters.
1251 * This routine will arbitrarily use a trailing space character if
1252 * needed to make the width come out correctly where a half of a
1253 * double-width character would have been needed. We'll see how
1254 * that works for us.
1256 * %w only works for strings (it's a %s replacement).
1258 * Buffer overflow is handled by the size argument. %.30s will work
1259 * to limit a particular string to 30 bytes, but you lose that
1260 * ability with %w, since it may write more than precision bytes
1261 * in order to get to the desired width. It is best to choose
1262 * size large enough so that it doesn't come into play, otherwise
1263 * it may be possible to get partial UTF-8 characters because of
1264 * the truncation.
1266 * The return value isn't quite the same as the return value
1267 * of snprintf. It is the number of bytes written, not counting
1268 * the trailing null, just like snprintf. However, if it is
1269 * truncated due to size then the output is size, not the
1270 * number of characters that would have been written.
1273 utf8_snprintf(char *dest, size_t size, char *fmt, ...)
1275 char newfmt[100], buf[20], *q, *pdest, *width_str, *end;
1276 char *start_of_specifier;
1277 char *input_str;
1278 int int_arg;
1279 double double_arg;
1280 void *ptr_arg;
1281 unsigned got_width;
1282 int more_flags, ret, w;
1283 int min_field_width, field_precision, modifier;
1284 int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
1285 va_list args;
1287 newfmt[0] = '\0';
1288 q = newfmt;
1290 pdest = dest;
1292 #define IS_ROOM_IN_DEST(n_more_chars) \
1293 ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)
1296 * Strategy: Look through the fmt string for %w's. Replace the
1297 * %w's in the format string with %s's but with possibly different
1298 * width and precision arguments which will make it come out right.
1299 * Then call the regular system vsnprintf with the altered format
1300 * string but same arguments.
1302 * That would be nice but it doesn't quite work. Why? Because a
1303 * %*w will need to have the value in the integer argument the *
1304 * refers to modified. Can't do it as far as I can tell. Or we could
1305 * remove the integer argument somehow before calling printf. Can't
1306 * do it. Or we could somehow add an additional conversion specifier
1307 * that caused nothing to be printed but ate up the integer arg.
1308 * Can't figure out how to do that either.
1310 * Since we can't figure out how to do it, the alternative is to
1311 * construct the result one piece at a time, pasting together the
1312 * pieces from the different conversions.
1314 va_start(args, fmt);
1316 while(*fmt && IS_ROOM_IN_DEST(1)){
1317 if(*fmt == '%'){
1318 start_of_specifier = fmt++;
1320 min_field_width = field_precision = -1;
1321 flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
1323 /* flags */
1324 more_flags = 1;
1325 while(more_flags){
1326 switch(*fmt){
1327 case '-':
1328 flags_minus++;
1329 fmt++;
1330 break;
1332 case '+':
1333 flags_plus++;
1334 fmt++;
1335 break;
1337 case ' ':
1338 flags_space++;
1339 fmt++;
1340 break;
1342 case '0':
1343 flags_zero++;
1344 fmt++;
1345 break;
1347 case '#':
1348 flags_pound++;
1349 fmt++;
1350 break;
1352 default:
1353 more_flags = 0;
1354 break;
1358 /* minimum field width */
1359 if(*fmt == '*'){
1360 min_field_width = va_arg(args, int);
1361 fmt++;
1363 else if(*fmt >= '0' && *fmt <= '9'){
1364 width_str = fmt;
1365 while (*fmt >= '0' && *fmt <= '9')
1366 fmt++;
1368 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1369 if(sizeof(buf) > fmt-width_str)
1370 buf[fmt-width_str] = '\0';
1372 buf[sizeof(buf)-1] = '\0';
1374 min_field_width = atoi(width_str);
1377 /* field precision */
1378 if(*fmt == '.'){
1379 fmt++;
1380 if(*fmt == '*'){
1381 field_precision = va_arg(args, int);
1382 fmt++;
1384 else if(*fmt >= '0' && *fmt <= '9'){
1385 width_str = fmt;
1386 while (*fmt >= '0' && *fmt <= '9')
1387 fmt++;
1389 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1390 if(sizeof(buf) > fmt-width_str)
1391 buf[fmt-width_str] = '\0';
1393 buf[sizeof(buf)-1] = '\0';
1395 field_precision = atoi(width_str);
1399 /* length modifier */
1400 if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
1401 modifier = *fmt++;
1403 /* conversion character */
1404 switch(*fmt){
1405 case 'w':
1407 * work with va_arg(char *) to figure out width
1408 * and precision needed to produce the screen width
1409 * and precision asked for in %w using some of the
1410 * utf8 width routines we have.
1413 input_str = va_arg(args, char *);
1414 if(field_precision >=0 || min_field_width >= 0)
1415 w = utf8_width(input_str);
1417 if(field_precision >= 0){
1418 if(w <= field_precision)
1419 field_precision = -1; /* print it all */
1420 else{
1422 * We need to cut off some of the input_str
1423 * in this case.
1425 end = utf8_count_forw_width(input_str, field_precision, &got_width);
1426 field_precision = (int) (end - input_str);
1427 /* new w with this field_precision */
1428 w = got_width;
1432 /* need some padding */
1433 if(min_field_width >= 0)
1434 min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
1435 MAX(0, min_field_width - w);
1438 * Now we just need to get the new format string
1439 * set correctly in newfmt.
1441 q = newfmt;
1442 if(q-newfmt < sizeof(newfmt))
1443 *q++ = '%';
1445 if(flags_minus && q-newfmt < sizeof(newfmt))
1446 *q++ = '-';
1447 if(flags_plus && q-newfmt < sizeof(newfmt))
1448 *q++ = '+';
1449 if(flags_space && q-newfmt < sizeof(newfmt))
1450 *q++ = ' ';
1451 if(flags_zero && q-newfmt < sizeof(newfmt))
1452 *q++ = '0';
1453 if(flags_pound && q-newfmt < sizeof(newfmt))
1454 *q++ = '#';
1456 if(min_field_width >= 0){
1457 snprintf(buf, sizeof(buf), "%d", min_field_width);
1458 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1461 if(field_precision >= 0){
1462 if(q-newfmt < sizeof(newfmt))
1463 *q++ = '.';
1465 snprintf(buf, sizeof(buf), "%d", field_precision);
1466 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1469 if(q-newfmt < sizeof(newfmt))
1470 *q++ = 's';
1472 if(q-newfmt < sizeof(newfmt))
1473 *q++ = '\0';
1475 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1476 pdest += strlen(pdest);
1478 break;
1480 case '\0':
1481 fmt--;
1482 break;
1484 default:
1485 /* make a new format which leaves out the dynamic '*' arguments */
1486 q = newfmt;
1487 if(q-newfmt < sizeof(newfmt))
1488 *q++ = '%';
1490 if(flags_minus && q-newfmt < sizeof(newfmt))
1491 *q++ = '-';
1492 if(flags_plus && q-newfmt < sizeof(newfmt))
1493 *q++ = '+';
1494 if(flags_space && q-newfmt < sizeof(newfmt))
1495 *q++ = ' ';
1496 if(flags_zero && q-newfmt < sizeof(newfmt))
1497 *q++ = '0';
1498 if(flags_pound && q-newfmt < sizeof(newfmt))
1499 *q++ = '#';
1501 if(min_field_width >= 0){
1502 snprintf(buf, sizeof(buf), "%d", min_field_width);
1503 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1506 if(field_precision >= 0){
1507 if(q-newfmt < sizeof(newfmt))
1508 *q++ = '.';
1510 snprintf(buf, sizeof(buf), "%d", field_precision);
1511 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1514 if(q-newfmt < sizeof(newfmt))
1515 *q++ = *fmt;
1517 if(q-newfmt < sizeof(newfmt))
1518 *q++ = '\0';
1520 switch(*fmt){
1521 case 'd': case 'i': case 'o':
1522 case 'x': case 'X': case 'u': case 'c':
1523 int_arg = va_arg(args, int);
1524 snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
1525 pdest += strlen(pdest);
1526 break;
1528 case 's':
1529 input_str = va_arg(args, char *);
1530 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1531 pdest += strlen(pdest);
1532 break;
1534 case 'f': case 'e': case 'E':
1535 case 'g': case 'G':
1536 double_arg = va_arg(args, double);
1537 snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
1538 pdest += strlen(pdest);
1539 break;
1541 case 'p':
1542 ptr_arg = va_arg(args, void *);
1543 snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
1544 pdest += strlen(pdest);
1545 break;
1547 case '%':
1548 if(IS_ROOM_IN_DEST(1))
1549 *pdest++ = '%';
1551 break;
1553 default:
1554 /* didn't think of this type */
1555 assert(0);
1556 break;
1559 break;
1562 fmt++;
1564 else{
1565 if(IS_ROOM_IN_DEST(1))
1566 *pdest++ = *fmt++;
1570 ret = pdest - dest;
1572 if(IS_ROOM_IN_DEST(1))
1573 *pdest++ = '\0';
1575 va_end(args);
1577 return ret;
1582 * Copy UTF-8 characters from src into dst.
1583 * Copy enough characters so that the result will have (<=) screen width of
1584 * want_width screen cells in current locale.
1586 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1587 * to dst.
1589 * Returned value is the number of bytes written to dst, not including
1590 * the possible terminating null.
1591 * Got_width is another returned value. It is the width in screen cells of
1592 * the string placed in dst. It will be the same as want_width if there
1593 * are enough characters in the src to do that and if the character widths
1594 * hit the width exactly. It will be less than want_width if we run out
1595 * of src characters or if the next character width would skip over the
1596 * width we want, because it is double width.
1598 * Zero width characters are collected and included at the end of the string.
1599 * That is, if we make it to want_width but there is still a zero length
1600 * character sitting in src, we add that to dst. This might be an accent
1601 * or something like that.
1603 size_t
1604 utf8_to_width(char *dst, /* destination buffer */
1605 char *src, /* source string */
1606 size_t dstlen, /* space in dst */
1607 unsigned want_width, /* desired screen width */
1608 unsigned *got_width) /* returned screen width in dst */
1610 int this_width;
1611 unsigned width_consumed = 0;
1612 UCS ucs;
1613 unsigned long remaining_octets;
1614 char *writeptr, *readptr, *savereadptr, *endptr;
1615 int ran_out_of_space = 0;
1617 readptr = src;
1619 remaining_octets = readptr ? strlen(readptr) : 0;
1621 writeptr = dst;
1622 endptr = writeptr + dstlen;
1624 if(readptr && writeptr){
1625 while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1626 savereadptr = readptr;
1627 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1629 if(ucs & U8G_ERROR || ucs == UBOGON)
1630 remaining_octets = 0;
1631 else{
1632 this_width = wcellwidth(ucs);
1635 * If this_width is -1 that means we can't print this character
1636 * with our current locale. Writechar will print a '?'.
1638 if(this_width < 0)
1639 this_width = 1;
1641 if(width_consumed + (unsigned) this_width <= want_width){
1642 /* append this utf8 character to dst if it will fit */
1643 if(writeptr + (readptr - savereadptr) < endptr){
1644 width_consumed += this_width;
1645 while(savereadptr < readptr)
1646 *writeptr++ = *savereadptr++;
1648 else
1649 ran_out_of_space++; /* no more utf8 to dst */
1651 else
1652 remaining_octets = 0; /* we're done */
1656 if(writeptr < endptr)
1657 *writeptr = '\0';
1660 if(got_width)
1661 *got_width = width_consumed;
1663 return(writeptr ? (writeptr - dst) : 0);
1668 * Str is a UTF-8 string.
1669 * Count forward width screencell positions and return a pointer to the
1670 * end of the string that is width wide.
1671 * The returned pointer points at the next character (where the null would
1672 * be placed).
1674 * Got_width is another returned value. It is the width in screen cells of
1675 * the string from str to the returned pointer. It will be the same as
1676 * want_width if there are enough characters in the str to do that
1677 * and if the character widths hit the width exactly. It will be less
1678 * than want_width if we run out of characters or if the next character
1679 * width would skip over the width we want, because it is double width.
1681 char *
1682 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1684 int this_width;
1685 unsigned width_consumed = 0;
1686 UCS ucs;
1687 unsigned long remaining_octets;
1688 char *readptr;
1689 char *retptr;
1691 retptr = readptr = str;
1693 remaining_octets = readptr ? strlen(readptr) : 0;
1695 while(width_consumed <= want_width && remaining_octets > 0){
1697 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1699 if(ucs & U8G_ERROR || ucs == UBOGON){
1701 * This should not happen, but do something to handle it anyway.
1702 * Treat each character as a single width character, which is what should
1703 * probably happen when we actually go to write it out.
1705 remaining_octets--;
1706 readptr++;
1707 this_width = 1;
1709 else{
1710 this_width = wcellwidth(ucs);
1713 * If this_width is -1 that means we can't print this character
1714 * with our current locale. Writechar will print a '?'.
1716 if(this_width < 0)
1717 this_width = 1;
1720 if(width_consumed + (unsigned) this_width <= want_width){
1721 width_consumed += (unsigned) this_width;
1722 retptr = readptr;
1724 else
1725 remaining_octets = 0; /* we're done */
1728 if(got_width)
1729 *got_width = width_consumed;
1731 return(retptr);
1736 * Copy a null terminator into a UTF-8 string in place so that the string is
1737 * no more than a certain screen width wide. If the string is already less
1738 * than or equal in width to the requested width, no change is made.
1740 * The actual width accomplished is returned. Note that it may be less than
1741 * max_width due to double width characters as well as due to the fact that
1742 * it fits wholly in the max_width.
1744 * Returned value is the actual screen width of str when done.
1746 * A side effect is that a terminating null may have been written into
1747 * the passed in string.
1749 unsigned
1750 utf8_truncate(char *str, unsigned max_width)
1752 int this_width;
1753 unsigned width_consumed = 0;
1754 UCS ucs;
1755 unsigned long remaining_octets;
1756 char *readptr, *savereadptr;
1758 readptr = str;
1760 remaining_octets = readptr ? strlen(readptr) : 0;
1762 if(readptr){
1763 while(width_consumed <= max_width && remaining_octets > 0){
1765 savereadptr = readptr;
1766 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1768 if(ucs & U8G_ERROR || ucs == UBOGON){
1770 * This should not happen, but do something to handle it anyway.
1771 * Treat each character as a single width character, which is what should
1772 * probably happen when we actually go to write it out.
1774 remaining_octets--;
1775 readptr++;
1776 this_width = 1;
1778 else{
1779 this_width = wcellwidth(ucs);
1782 * If this_width is -1 that means we can't print this character
1783 * with our current locale. Writechar will print a '?'.
1785 if(this_width < 0)
1786 this_width = 1;
1789 if(width_consumed + (unsigned) this_width <= max_width){
1790 width_consumed += (unsigned) this_width;
1792 else{
1793 remaining_octets = 0; /* we're done */
1794 *savereadptr = '\0';
1799 return(width_consumed);
/*
 * Copy UTF-8 characters from src into dst so that the result is
 * want_width screen cells wide in the current locale. If src is too
 * narrow, the result is padded with spaces: on the right when left_adjust
 * is non-zero, on the left otherwise.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes are
 * written. Dst is null terminated if there is room for the null, but not
 * if that would overflow dstlen; in that case the padding is also cut
 * short to whatever fits.
 *
 * A NULL dst is tolerated (nothing is written and 0 is returned), which
 * matches what utf8_to_width() does with a NULL destination.
 *
 * Returns the number of bytes written to dst, not including the possible
 * terminating null.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t bytes_used, room, pad;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    room = dstlen - bytes_used;

    /* utf8_to_width never reports more than want_width, but be defensive */
    pad = (got_width < want_width) ? (size_t) (want_width - got_width) : 0;
    if(pad > room)
      pad = room;

    if(pad > 0){
	if(left_adjust){
	    /* padding goes after the text */
	    memset(dst + bytes_used, ' ', pad);
	}
	else{
	    /* slide the text right (regions overlap) and pad in front */
	    memmove(dst + pad, dst, bytes_used);
	    memset(dst, ' ', pad);
	}

	bytes_used += pad;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1873 * Str is a UTF-8 string.
1874 * Start_here is a pointer into the string. It points one position past
1875 * the last byte that should be considered a part of the length string.
1876 * Count back want_width screencell positions and return a pointer to the
1877 * start of the string that is want_width wide and ends with start_here.
1879 * Since characters may be more than one cell width wide we may end up
1880 * skipping over the exact width. That is, if we need to we'll go back
1881 * too far (by one cell width). Account for that in the call by looking
1882 * at got_width.
1884 * Note that this call gives a possible got_width == want_width+1 as
1885 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1886 * That was just what was needed at the time, maybe it needs to be
1887 * optional.
1889 char *
1890 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1892 unsigned width_consumed = 0;
1893 int this_width;
1894 UCS ucs;
1895 unsigned long remaining_octets;
1896 char *ptr, *savereadptr, *goodreadptr;
1898 savereadptr = start_here;
1899 goodreadptr = start_here;
1901 for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1903 savereadptr = ptr;
1904 remaining_octets = goodreadptr - ptr;
1905 ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1907 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1908 if(remaining_octets > 0){
1910 * This means there are some bad octets after this good
1911 * character so things are not going to work out well.
1912 * Bail out.
1914 savereadptr = str; /* we're done */
1916 else{
1917 this_width = wcellwidth(ucs);
1920 * If this_width is -1 that means we can't print this character
1921 * with our current locale. Writechar will print a '?'.
1923 if(this_width < 0)
1924 this_width = 1;
1926 width_consumed += (unsigned) this_width;
1927 goodreadptr = savereadptr;
1932 if(got_width)
1933 *got_width = width_consumed;
1935 return(savereadptr);
/*----------------------------------------------------------------------
    Append-style bounded copy. Copies characters from s to *d and leaves
    *d pointing at the end of the copied text, so that a caller building
    up a string does not need to rescan it (as strcpy(t, x) followed by
    t += strlen(t) would).

    At most n characters are stored. If the source's null is reached it
    is stored too and *d is left pointing at it. If n runs out first,
    no null is stored.

    This doesn't really belong here but it is used here.
  ----*/
void
sstrncpy(char **d, char *s, int n)
{
    int left;

    for(left = n; left > 0; left--){
	**d = *s++;
	if(**d == '\0')
	  return;		/* terminator copied, stay on it */

	(*d)++;
    }
}
1957 * If use_system_routines is set then NULL is the return value and it is
1958 * not an error. Display_charmap and keyboard_charmap should come over as
1959 * malloced strings and will be filled in with the result.
1961 * Returns a void pointer to the input_cs CHARSET which is
1962 * passed to mbtow via kbseq().
1963 * If !use_system_routines && NULL is returned, that is an error and err should
1964 * have a message.
1965 * display_charmap and keyboard_charmap should be malloced data and may be
1966 * realloced and changed here.
1969 setup_for_input_output(int use_system_routines, char **display_charmap,
1970 char **keyboard_charmap, void **input_cs_arg, char **err)
1972 const CHARSET *cs;
1973 const CHARSET *input_cs = NULL;
1974 int already_tried = 0;
1975 int supported = 0;
1976 char buf[1000];
1978 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1980 if(err)
1981 *err = NULL;
1983 if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1984 *err = cpstr("Bad call to setup_for_input_output");
1985 return(-1);
1988 if(use_system_routines){
1989 #if PREREQ_FOR_SYS_TRANSLATION
1990 char *dcm;
1992 dcm = nl_langinfo_codeset_wrapper();
1993 dcm = dcm ? dcm : "US-ASCII";
1995 init_utf8_display(0, NULL);
1996 if(*display_charmap){
1997 if(dcm && strucmp(*display_charmap, dcm)){
1998 snprintf(buf, sizeof(buf),
1999 _("Display character set \"%s\" is ignored when using system translation"),
2000 *display_charmap);
2002 *err = cpstr(buf);
2005 fs_give((void **) display_charmap);
2008 if(*keyboard_charmap){
2009 if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
2010 snprintf(buf, sizeof(buf),
2011 _("Keyboard character set \"%s\" is ignored when using system translation"),
2012 *keyboard_charmap);
2014 *err = cpstr(buf);
2017 fs_give((void **) keyboard_charmap);
2020 *display_charmap = cpstr(dcm);
2021 *keyboard_charmap = cpstr(dcm);
2022 #else
2023 *err = cpstr("Bad call to setup_for_input_output");
2024 #endif
2026 *input_cs_arg = NULL;
2027 return(0);
2031 try_again1:
2032 if(!(*display_charmap))
2033 *display_charmap = cpstr("US-ASCII");
2035 if(!(*keyboard_charmap))
2036 *keyboard_charmap = cpstr(*display_charmap);
2038 if(*keyboard_charmap){
2039 supported = input_charset_is_supported(*keyboard_charmap);
2041 if(supported){
2042 if(!strucmp(*keyboard_charmap, "utf-8"))
2043 input_cs = utf8_charset(*keyboard_charmap);
2044 else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2045 input_cs = cs;
2047 else{
2048 if(err && !*err){
2049 int iso2022jp = 0;
2051 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2052 iso2022jp = 1;
2054 snprintf(buf, sizeof(buf),
2055 /* TRANSLATORS: The first argument is the name of the character
2056 set the user is trying to use (which is unsupported by alpine).
2057 The second argument is " (except for posting)" if they are
2058 trying to use ISO-2022-JP for something other than posting. */
2059 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2060 *keyboard_charmap,
2061 iso2022jp ? _(" (except for posting)") : "");
2063 *err = cpstr(buf);
2066 input_cs = NULL;
2067 fs_give((void **) keyboard_charmap);
2068 *keyboard_charmap = cpstr("US-ASCII");
2069 if(!already_tried){
2070 already_tried++;
2071 goto try_again1;
2077 try_again2:
2078 if(!(*display_charmap))
2079 *display_charmap = cpstr("US-ASCII");
2081 if(*display_charmap){
2082 supported = output_charset_is_supported(*display_charmap);
2083 if(supported){
2084 if(!strucmp(*display_charmap, "utf-8"))
2085 init_utf8_display(1, NULL);
2086 else if((cs = utf8_charset(*display_charmap)) != NULL)
2087 init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2089 else{
2090 if(err && !*err){
2091 int iso2022jp = 0;
2093 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2094 iso2022jp = 1;
2096 snprintf(buf, sizeof(buf),
2097 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2098 *display_charmap,
2099 iso2022jp ? _(" (except for posting)") : "");
2101 *err = cpstr(buf);
2104 fs_give((void **) display_charmap);
2105 if(!already_tried){
2106 already_tried++;
2107 goto try_again2;
2111 else{
2112 if(err && !*err)
2113 *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2116 #undef cpstr
2118 *input_cs_arg = (void *) input_cs;
2120 return(0);
2125 input_charset_is_supported(char *input_charset)
2127 const CHARSET *cs;
2129 if(!(input_charset && *input_charset))
2130 return 0;
2132 if(!strucmp(input_charset, "utf-8"))
2133 return 1;
2135 if((cs = utf8_charset(input_charset)) != NULL){
2138 * This was true 2006-09-25.
2140 switch(cs->type){
2141 case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2142 case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2143 case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2144 case CT_UCS4: case CT_UTF16:
2145 return 1;
2146 break;
2148 default:
2149 break;
2153 return 0;
2158 output_charset_is_supported(char *output_charset)
2160 const CHARSET *cs;
2162 if(!(output_charset && *output_charset))
2163 return 0;
2165 if(!strucmp(output_charset, "utf-8"))
2166 return 1;
2168 if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2169 return 1;
2171 return 0;
/*
 * Returns non-zero if posting_charset may be used for posting: it has to
 * be a non-empty name that is either "ISO-2022-JP" (in any case) or
 * acceptable to output_charset_is_supported().
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    /* ISO-2022-JP is allowed for posting even though it isn't for display */
    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator.
 * It is what nl_langinfo(CODESET) should return, but nl_langinfo is
 * wrapped because we know of strange behaving implementations.
 *
 * The result is either a name accepted by output_charset_is_supported()
 * or NULL. It may point at storage owned by nl_langinfo(), at a string
 * literal, or at a static buffer inside this function, so the caller
 * must not free it and must copy it to keep it across calls.
 */
#if	!defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
	char *p;

	if(!strcmp("ANSI_X3.4-1968", ret)
	   || !strcmp("646", ret)
	   || !strcmp("ASCII", ret)
	   || !strcmp("C", ret)
	   || !strcmp("POSIX", ret))
	  ret = "US-ASCII";
	else if(!strucmp(ret, "UTF8"))
	  ret = "UTF-8";
	else if(!strucmp(ret, "EUCJP"))
	  ret = "EUC-JP";
	else if(!strucmp(ret, "EUCKP"))
	  ret = "EUC-KP";
	else if(!strucmp(ret, "SJIS"))
	  ret = "SHIFT-JIS";
	else if((p = strstr(ret, "8859")) != NULL){
	    /* check for digits after 8859 */
	    p += 4;

	    /*
	     * Allow one separator character (as in "8859-1"), but never
	     * step past the end of the string when nothing follows "8859".
	     * The casts keep isdigit() away from negative char values.
	     */
	    if(*p && !isdigit((unsigned char) *p))
	      p++;

	    if(isdigit((unsigned char) *p)){
		static char buf[12];

		memset(buf, 0, sizeof(buf));
		strncpy(buf, "ISO-8859-", sizeof(buf));
		buf[9] = *p++;
		if(isdigit((unsigned char) *p))
		  buf[10] = *p;

		ret = buf;
	    }
	}
    }

    /* whatever we ended up with still has to be usable */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2252 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2253 * needed the return value will point to orig. If a conversion is done,
2254 * the return string should be freed by the caller.
2255 * If not possible, returns NULL.
2257 char *
2258 utf8_to_charset(char *orig, char *charset, int report_err)
2260 SIZEDTEXT src, dst;
2261 char *ret = orig;
2263 if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2264 return ret;
2266 src.size = strlen(orig);
2267 src.data = (unsigned char *) orig;
2269 if(!strucmp(charset, "us-ascii")){
2270 size_t i;
2272 for(i = 0; i < src.size; i++)
2273 if(src.data[i] & 0x80)
2274 return NULL;
2276 return ret;
2280 * This works for ISO-2022-JP because of special code in utf8_cstext
2281 * but not for other 2022 charsets.
2283 memset(&dst, 0, sizeof(dst));
2284 if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2285 ret = (char *) dst.data; /* c-client already null terminates it */
2286 else
2287 ret = NULL;
2289 if((unsigned char *) ret != dst.data && dst.data)
2290 fs_give((void **) &dst.data);
2292 return ret;
/*
 * Turn a number into a string with commas separating groups of three
 * digits, e.g. 1234567 becomes "1,234,567".
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas.
 *         Can use up to 3 comatose results at once, because the result
 *         rotates through three static buffers.
 *
 * Negative numbers come out as a leading '-' followed by the grouped
 * magnitude, and every group is separated however large the value is.
 */
char *
comatose(long int number)
{
    static char buf[3][50];
    static int  whichbuf = 0;
    char	digits[3 * sizeof(unsigned long) + 2];
    unsigned long magnitude;
    char       *out, *last;
    int		ndigits, i;

    whichbuf = (whichbuf + 1) % 3;
    out  = buf[whichbuf];
    last = out + sizeof(buf[0]) - 1;	/* kept free for the terminator */

    /* negate in unsigned arithmetic so that LONG_MIN is handled too */
    magnitude = (unsigned long) number;
    if(number < 0)
      magnitude = 0UL - magnitude;

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0)
      ndigits = 0;
    else if((size_t) ndigits >= sizeof(digits))
      ndigits = (int) sizeof(digits) - 1;

    if(number < 0 && out < last)
      *out++ = '-';

    for(i = 0; i < ndigits && out < last; i++){
	/* a comma goes wherever a multiple of three digits remains */
	if(i > 0 && (ndigits - i) % 3 == 0){
	    *out++ = ',';
	    if(out >= last)
	      break;
	}

	*out++ = digits[i];
    }

    *out = '\0';

    return(buf[whichbuf]);
}
/*
 * Like comatose() but leaves out the commas: formats number as plain
 * decimal text. The result lives in one of three static buffers that are
 * used in rotation, so up to three results can be in use at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    slot = (slot + 1) % 3;
    out = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
/* line_paint: redraws one editable line. It picks which part of the */
/* virtual line vl is visible (displ->vbase), builds the display array */
/* dl from it, writes the cells that differ from olddl through */
/* displ->writechar, and finally moves the cursor to the column that */
/* corresponds to offset. */
2358 * line_paint - where the real work of managing what is displayed gets done.
2360 void
2361 line_paint(int offset, /* current dot offset into vl */
2362 struct display_line *displ,
2363 int *passwd) /* flag to hide display of chars */
2365 int i, w, w2, already_got_one = 0;
2366 int vfirst, vlast, dfirst, dlast, vi, di;
2367 int new_vbase;
2368 unsigned (*width_a_to_b)(UCS *, int, int);
2371 * Set passwd to 10 in caller if you want to conceal the
2372 * password but not print asterisks for feedback.
2374 * Set passwd to 1 in caller to conceal by printing asterisks.
/* With *passwd == 10 the field is blanked once and *passwd becomes 11; */
/* any later call with *passwd > 10 returns without drawing anything. */
2376 if(passwd && *passwd >= 10){ /* don't show asterisks */
2377 if(*passwd > 10)
2378 return;
2379 else
2380 *passwd = 11; /* only blat once */
2382 i = 0;
2383 (*displ->movecursor)(displ->row, displ->col);
2384 while(i++ <= displ->dwid)
2385 (*displ->writechar)(' ');
2387 (*displ->movecursor)(displ->row, displ->col);
2388 return;
/* In password mode every character is shown as one '*', so widths are */
/* counted one cell per character instead of by real character width. */
2391 if(passwd && *passwd)
2392 width_a_to_b = single_width_chars_a_to_b;
2393 else
2394 width_a_to_b = ucs4_str_width_a_to_b;
2397 * vl is the virtual line (the actual data). We operate on it by typing
2398 * characters to be added and deleting and so forth. In this routine we
2399 * copy a subset of those UCS-4 characters in vl into dl, the display
2400 * array, and show that subset on the screen.
2402 * Offset is the location of the cursor in vl.
2404 * We will display the string starting from vbase.
2405 * We have dwid screen cells to work in.
2406 * We may have to adjust vbase in order to display the
2407 * part of the string that contains the cursor.
2409 * We'll make the display look like
2410 * vl a b c d e f g h i j k l m
2411 * xxxxxxxxxxxxx <- width dwid window
2412 * < d e f g h >
2414 * vbase
2415 * The < will be there if vbase > 0.
2416 * The > will be there if the string from vbase to the
2417 * end can't all fit in the window.
2420 memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2423 * Adjust vbase so offset is not out of the window to the right.
2424 * (The +2 in w + 2 is for a possible " >" if the string goes past
2425 * the right hand edge of the window and if the last visible character
2426 * is double wide. We don't want the offset to be under that > character.)
2428 for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2429 displ->dwid > 1 &&
2430 w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2431 w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2433 * offset is off the window to the right
2434 * It looks like a b c d e f g h
2435 * | |
2436 * vbase offset
2437 * and offset is either past the right edge,
2438 * or right at the right edge (and maybe under >),
2439 * or one before right at the edge (and maybe on space
2440 * for half a character).
2442 * Since the characters may be double width it is slightly
2443 * complicated to figure out how far to increase vbase.
2444 * We're going to scoot over past width w/2 characters and
2445 * then see if that's sufficient.
2447 new_vbase = displ->vbase + 1;
2448 for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2449 w2 < displ->dwid/2;
2450 w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2451 new_vbase++;
2453 displ->vbase = new_vbase;
2456 /* adjust so offset is not out of the window to the left */
2457 while(displ->vbase > 0 && displ->vbase >= offset){
2458 /* add about dwid/2 more width */
2459 new_vbase = displ->vbase - 1;
2460 for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2461 w2 < (displ->dwid+1)/2 && new_vbase > 0;
2462 w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2463 new_vbase--;
/* NOTE(review): the loop below starts from new_vbase but its update */
/* expression measures from displ->vbase, which does not change while */
/* new_vbase is incremented. That looks unintended; confirm whether */
/* new_vbase was meant there before changing anything. */
2465 /* but don't let it get too small, recheck off right end */
2466 for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2467 w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2468 w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2469 new_vbase++;
2471 displ->vbase = MAX(new_vbase, 0);
/* A vbase of 1 would hide one single-width character behind the '<' */
/* cue, which takes the same room, so show the character instead. */
2474 if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2475 displ->vbase = 0;
2477 vfirst = displ->vbase;
2478 dfirst = 0;
2479 if(displ->vbase > 0){ /* off screen cue left */
2480 dfirst = 1; /* index which matches vfirst */
2481 displ->dl[0] = '<';
2484 vlast = displ->vused-1; /* end */
2485 w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2487 if(displ->dwid > 0 && w + dfirst > displ->dwid){ /* off window right */
2489 /* find last ucs character to be printed */
2490 while(w + dfirst > displ->dwid - 1) /* -1 for > */
2491 w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2493 /* worry about double-width characters */
2494 if(w + dfirst == displ->dwid - 1){ /* no prob, hit it exactly */
2495 dlast = dfirst + vlast - vfirst + 1; /* +1 for > */
2496 displ->dl[dlast] = '>';
2498 else{
2499 dlast = dfirst + vlast - vfirst + 1;
2500 displ->dl[dlast++] = ' ';
2501 displ->dl[dlast] = '>';
2504 else
2505 dlast = dfirst + vlast - vfirst;
2508 * Copy the relevant part of the virtual line into the display line.
2510 for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2511 if(passwd && *passwd)
2512 displ->dl[di] = '*'; /* to conceal password */
2513 else
2514 displ->dl[di] = displ->vl[vi];
2517 * Add spaces to clear the rest of the line.
2518 * We have dwid total space to fill.
2520 w = (*width_a_to_b)(displ->dl, 0, dlast); /* width through dlast */
2521 for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2522 displ->dl[di++] = ' ';
2525 * Draw from left to right, skipping until we get to
2526 * something that is different. Characters may be different
2527 * widths than they were initially so paint from there the
2528 * rest of the way.
2530 for(di = 0; displ->dl[di]; di++){
2531 if(already_got_one || displ->dl[di] != displ->olddl[di]){
2532 /* move cursor first time */
2533 if(!already_got_one++){
2534 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2535 (*displ->movecursor)(displ->row, displ->col + w);
2538 (*displ->writechar)(displ->dl[di]);
2539 displ->olddl[di] = displ->dl[di];
/* Zero the rest of olddl so that it holds nothing past what dl has now. */
2543 memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2546 * Move the cursor to the offset.
2548 * The offset is relative to the start of the virtual array. We need
2549 * to find the location on the screen. The offset into the display array
2550 * will be offset-vbase+dfirst. We want to be at the start of that
2551 * character, so we need to find the width of all the characters up
2552 * to that point.
2554 w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2556 (*displ->movecursor)(displ->row, displ->col + w);
2561 * This is just like ucs4_str_width_a_to_b() except all of the characters
2562 * are assumed to be of width 1. This is for printing out *'s when user
2563 * enters a password, while still managing to use the same code to do the
2564 * display.
2566 unsigned
2567 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2569 unsigned width = 0;
2570 int i;
2572 if(ucsstr)
2573 for(i = a; i <= b && ucsstr[i]; i++)
2574 width++;
2576 return width;