* NTLM authentication support with the ntlm library, in Unix systems.
[alpine.git] / pith / charconv / utf8.c
blob7e1ae45660806a259d44410f85b95ee248ea5138
1 #if !defined(lint) && !defined(DOS)
2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
3 #endif
5 /*
6 * ========================================================================
7 * Copyright 2013-2017 Eduardo Chappa
8 * Copyright 2006-2008 University of Washington
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
14 * http://www.apache.org/licenses/LICENSE-2.0
16 * ========================================================================
20 /* includable WITHOUT dependency on c-client */
21 #include "../../c-client/mail.h"
22 #include "../../c-client/utf8.h"
24 #ifdef _WINDOWS
25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
26 #undef ERROR
27 #else
28 #define _XOPEN_SOURCE
29 #endif
31 #include <system.h>
33 #include "../../c-client/fs.h"
35 /* includable WITHOUT dependency on pico */
36 #include "../../pico/keydefs.h"
38 #include "../osdep/collate.h"
39 #include "../filttype.h"
41 #include "utf8.h"
43 #include <stdarg.h>
46 unsigned single_width_chars_a_to_b(UCS *, int, int);
49 static char locale_charmap[50];
51 static int native_utf8;
52 static void *display_data;
54 void
55 init_utf8_display(int utf8, void *rmap)
57 native_utf8 = utf8;
58 display_data = rmap;
63 * Argument is a UCS-4 wide character.
64 * Returns the environment dependent cell width of the
65 * character when printed to the screen.
66 * This will be -1 if the character is not printable.
67 * It will be >= zero if it is printable.
69 * Note that in the case it is not printable but it is still sent to
70 * Writechar, Writechar will print a '?' with width 1.
72 int
73 wcellwidth(UCS ucs)
75 char dummy[32];
76 long w;
79 * We believe that on modern unix systems wchar_t is a UCS-4 character.
80 * That's the assumption here.
83 if(native_utf8){ /* display is UTF-8 capable */
84 w = ucs4_width((unsigned long) ucs);
85 return((w & U4W_ERROR) ? -1 : w);
87 else if(display_data){
88 if(wtomb(dummy, ucs) < 0)
89 return(-1);
90 else{
91 w = ucs4_width((unsigned long) ucs);
92 return((w & U4W_ERROR) ? -1 : w);
95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
96 else
97 return(wcwidth((wchar_t) ucs));
98 #else
99 return(0);
100 #endif
103 /* ambiguous width zone character function */
105 pith_ucs4width(UCS ucs)
107 #if !defined(_WINDOWS) && HAVE_WCWIDTH
108 return wcwidth((wchar_t) ucs);
109 #else
110 return (ucs >= 0x2100) ? 2 : 1;
111 #endif /* _WINDOWS */
115 * Argument is a UCS-4 wide character.
116 * It is converted to the multibyte version (for example UTF8 or EUC-JP).
117 * Dest is a buffer at least xx chars wide where the multi-byte version
118 * of the wide character will be written.
119 * The returned value is the number of bytes written to dest or -1
120 * if the conversion can't be done.
123 wtomb(char *dest, UCS ucs)
126 * We believe that on modern unix systems wchar_t is a UCS-4 character.
127 * That's the assumption here.
130 if(native_utf8){
131 unsigned char *newdptr;
133 newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
134 return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
136 else if(display_data){
137 unsigned long ucs4;
138 int ret;
140 ucs4 = (unsigned long) ucs;
141 ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
142 if(ret >= 0)
143 ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
144 else
145 ret = -1;
147 return(ret);
149 else
150 return(wcrtomb(dest, (wchar_t) ucs, NULL));
155 * This function does not necessarily update inputp and remaining_octets, so
156 * don't rely on that. The c-client version does but the other doesn't.
159 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
161 UCS ucs;
163 if(input_cs){
164 CHARSET *cast_input_cs;
166 cast_input_cs = (CHARSET *) input_cs;
168 switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
169 case U8G_ENDSTRG:
170 case U8G_ENDSTRI:
171 return(CCONV_NEEDMORE);
173 default:
174 if(ucs & U8G_ERROR || ucs == UBOGON)
175 return(CCONV_BADCHAR);
177 return(ucs);
180 else{
181 size_t ret;
182 wchar_t w;
185 * Warning: input_cs and remaining_octets are unused in this
186 * half of the if/else.
188 * Unfortunately, we can't tell the difference between a source string
189 * that is just not long enough and one that has characters that can't
190 * be converted even though it is long enough. We return NEEDMORE in both cases.
192 ret = mbstowcs(&w, (char *) (*inputp), 1);
193 if(ret == (size_t)(-1))
194 return(CCONV_NEEDMORE);
195 else{
196 ucs = (UCS) w;
197 return(ucs);
203 void
204 set_locale_charmap(char *charmap)
206 if(charmap){
207 strncpy(locale_charmap, charmap, sizeof(locale_charmap));
208 locale_charmap[sizeof(locale_charmap)-1] = '\0';
210 else
211 locale_charmap[0] = '\0';
216 * This ensures that the string is UTF-8. If str is already a UTF-8 string,
217 * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
218 * The caller is responsible for freeing the returned value.
220 * Args str -- the string to convert
222 char *
223 convert_to_utf8(char *str, char *fromcharset, int flags)
225 char *ret = NULL;
226 char *fcharset;
227 SIZEDTEXT src, result;
228 const CHARSET *cs;
229 int try;
231 src.data = (unsigned char *) str;
232 src.size = strlen(str);
234 /* already UTF-8, return NULL */
235 if(!(flags & CU8_NOINFER)
236 && (cs = utf8_infercharset(&src))
237 && (cs->type == CT_ASCII || cs->type == CT_UTF8))
238 return(ret);
240 try = 1;
241 while(try < 5){
242 switch(try){
243 case 1:
244 fcharset = fromcharset;
245 if(fcharset && strucmp("UTF-8", fcharset) != 0)
246 break; /* give it a try */
247 else
248 try++; /* fall through */
250 case 2:
251 if(!(flags & CU8_NOINFER)){
252 fcharset = cs ? cs->name : NULL;
253 if(fcharset && strucmp("UTF-8", fcharset) != 0)
254 break;
255 else
256 try++; /* fall through */
258 else
259 try++; /* fall through */
261 case 3:
262 fcharset = locale_charmap;
263 if(fcharset && strucmp("UTF-8", fcharset) != 0)
264 break;
265 else
266 try++; /* fall through */
268 default:
269 fcharset = "ISO-8859-1"; /* this will "work" */
270 break;
273 memset(&result, 0, sizeof(result));
275 if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
276 if(!(result.size == src.size && result.data == src.data)){
277 ret = (char *) fs_get((result.size+1) * sizeof(char));
278 strncpy(ret, (char *) result.data, result.size);
279 ret[result.size] = '\0';
281 /* else no conversion necessary */
283 return(ret);
286 try++;
289 /* won't make it to here */
290 return(ret);
295 * Convert from UTF-8 to user's locale charset.
296 * This actually uses the wtomb routine to do the conversion, and that
297 * relies on setup_for_input_output having been called.
298 * If no conversion is necessary, NULL is returned, otherwise an allocated
299 * string in the locale charset is returned and the caller is responsible
300 * for freeing it.
302 char *
303 convert_to_locale(char *utf8str)
305 #define CHNK 500
306 char *inp, *retp, *ret = NULL;
307 CBUF_S cb;
308 int r, alloced;
310 if(native_utf8 || !utf8str || !utf8str[0])
311 return(NULL);
313 cb.cbuf[0] = '\0';
314 cb.cbufp = cb.cbufend = cb.cbuf;
315 inp = utf8str;
317 alloced = CHNK;
318 ret = (char *) fs_get(alloced * sizeof(char));
319 retp = ret;
322 * There's gotta be a better way to do this but utf8_to_locale was
323 * available and everything looks like a nail when all you have
324 * is a hammer.
326 while(*inp){
328 * We're placing the outgoing stream of characters in ret, a multi-byte
329 * array of characters in the user's locale charset. See if there is
330 * enough room for the next wide characters worth of output chars
331 * and allocate more space if not.
333 if((alloced - (retp-ret)) < MAX(MB_LEN_MAX,32)){
334 alloced += CHNK;
335 fs_resize((void **) &ret, alloced * sizeof(char));
338 r = utf8_to_locale((int) *inp++, &cb,
339 (unsigned char *) retp, alloced-(retp-ret));
341 retp += r;
344 *retp = '\0';
346 fs_resize((void **) &ret, strlen(ret)+1);
348 return(ret);
353 * Pass in a stream of UTF-8 characters in 'c' and return obuf
354 * filled in with multi-byte characters. The return value is the
355 * number of valid characters in obuf to be used.
358 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
360 int outchars = 0;
362 if(!(cb && cb->cbufp))
363 return(0);
365 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
366 unsigned char *inputp;
367 unsigned long remaining_octets;
368 UCS ucs;
370 *(cb->cbufp)++ = (unsigned char) c;
371 inputp = cb->cbuf;
372 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
373 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
375 switch(ucs){
376 case U8G_ENDSTRG: /* incomplete character, wait */
377 case U8G_ENDSTRI: /* incomplete character, wait */
378 break;
380 default:
381 if(ucs & U8G_ERROR || ucs == UBOGON){
383 * None of these cases is supposed to happen. If it
384 * does happen then the input stream isn't UTF-8
385 * so something is wrong. Treat each character in the
386 * input buffer as a separate error character and
387 * print a '?' for each.
389 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
390 obuf[outchars++] = '?';
392 cb->cbufp = cb->cbuf;
394 else{
395 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
397 * This happens when we have a UTF-8 character that
398 * we aren't able to print in our locale. For example,
399 * if the locale is setup with the terminal
400 * expecting ISO-8859-1 characters then there are
401 * lots of UTF-8 characters that can't be printed.
402 * Print a '?' instead.
404 obuf[outchars++] = '?';
406 else{
408 * Convert the ucs into the multibyte
409 * character that corresponds to the
410 * ucs in the users locale.
412 outchars = wtomb((char *) obuf, ucs);
413 if(outchars < 0){
414 obuf[0] = '?';
415 outchars = 1;
419 /* update the input buffer */
420 if(inputp >= cb->cbufp) /* this should be the case */
421 cb->cbufp = cb->cbuf;
422 else{ /* extra chars for some reason? */
423 unsigned char *q, *newcbufp;
425 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
426 q = cb->cbuf;
427 while(inputp < cb->cbufp)
428 *q++ = *inputp++;
430 cb->cbufp = newcbufp;
434 break;
437 else{ /* error */
438 obuf[0] = '?';
439 outchars = 1;
440 cb->cbufp = cb->cbuf; /* start over */
443 return(outchars);
448 * Returns the screen cells width of the UCS-4 string argument.
449 * The source string is zero terminated.
451 unsigned
452 ucs4_str_width(UCS *ucsstr)
454 unsigned width = 0;
455 int w;
457 if(ucsstr)
458 while(*ucsstr){
459 w = wcellwidth(*ucsstr++);
460 if(w != U4W_CTLSRGT)
461 width += (w < 0 ? 1 : w);
464 return width;
469 * Returns the screen cells width of the UCS-4 string argument
470 * from ucsstr[a] through (inclusive) ucsstr[b].
471 * No checking is done to make sure a starts in the middle
472 * of a UCS-4 array.
474 unsigned
475 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
477 unsigned width = 0;
478 int i, w;
480 if(ucsstr)
481 for(i = a; i <= b && ucsstr[i]; i++){
482 w = wcellwidth(ucsstr[i]);
483 if(w != U4W_CTLSRGT)
484 width += (w < 0 ? 1 : w);
487 return width;
492 * Returns the screen cells width of the UCS-4 string argument
493 * from ustart through (exclusive) uend.
494 * No checking is done to make sure it starts in the middle
495 * of a UCS-4 array.
497 unsigned
498 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
500 UCS *u;
501 unsigned width = 0;
502 int w;
504 if(!ustart)
505 return width;
507 if(ustart)
508 for(u = ustart; u < uend; u++){
509 w = wcellwidth(*u);
510 if(w != U4W_CTLSRGT)
511 width += (w < 0 ? 1 : w);
514 return(width);
519 * Return the largest possible pointer into ucs4str so that the width
520 * of the string from ucs4str to the pointer (exclusive)
521 * is maxwidth or less. Also stops at a null character.
523 UCS *
524 ucs4_particular_width(UCS *ucs4str, int maxwidth)
526 UCS *u;
527 int w_consumed = 0, w, done = 0;
529 u = ucs4str;
531 if(u)
532 while(!done && *u && w_consumed <= maxwidth){
533 w = wcellwidth(*u);
534 w = (w >= 0 ? w : 1);
535 if(w_consumed + w <= maxwidth){
536 w_consumed += w;
537 ++u;
539 else
540 ++done;
543 return(u);
548 * Convert and copy a UTF-8 string into a UCS-4 NULL
549 * terminated array. Just like cpystr only it converts
550 * from UTF-8 to UCS-4.
552 * Returned UCS-4 string needs to be freed by caller.
554 UCS *
555 utf8_to_ucs4_cpystr(char *utf8src)
557 size_t retsize;
558 UCS *ret = NULL;
559 UCS ucs;
560 unsigned long remaining_octets;
561 unsigned char *readptr;
562 size_t arrayindex;
565 * We don't know how big to allocate the return array
566 * because variable numbers of octets in the src array
567 * will combine to make UCS-4 characters. The number of
568 * UCS-4 characters is less than or equal to the number
569 * of src characters, though.
572 if(!utf8src)
573 return NULL;
575 retsize = strlen(utf8src) + 1;
577 ret = (UCS *) fs_get(retsize * sizeof(*ret));
578 memset(ret, 0, retsize * sizeof(*ret));
580 readptr = (unsigned char *) utf8src;
581 remaining_octets = retsize-1;
582 arrayindex = 0;
584 while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
585 ucs = (UCS) utf8_get(&readptr, &remaining_octets);
587 if(ucs & U8G_ERROR || ucs == UBOGON)
588 remaining_octets = 0;
589 else
590 ret[arrayindex++] = ucs;
593 ret[arrayindex] = '\0';
595 /* get rid of excess size */
596 if(arrayindex+1 < retsize)
597 fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
599 return ret;
604 * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
605 * terminated string. Just like cpystr only it converts
606 * from UCS-4 to UTF-8.
608 * Returned UTF-8 string needs to be freed by caller.
610 char *
611 ucs4_to_utf8_cpystr(UCS *ucs4src)
613 unsigned char *ret = NULL;
614 unsigned char *writeptr;
615 int i;
617 if(!ucs4src)
618 return NULL;
621 * Over-allocate and then resize at the end.
624 /* count characters in source */
625 for(i = 0; ucs4src[i]; i++)
628 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
629 memset(ret, 0, (6*i + 1) * sizeof(*ret));
631 writeptr = ret;
632 for(i = 0; ucs4src[i]; i++)
633 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
635 /* get rid of excess size */
636 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
638 return ((char *) ret);
643 * Similar to above but copy a fixed number of source
644 * characters instead of going until null terminator.
646 char *
647 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
649 unsigned char *ret = NULL;
650 unsigned char *writeptr;
651 int i;
653 if(!ucs4src)
654 return NULL;
657 * Over-allocate and then resize at the end.
660 ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
661 memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
663 writeptr = ret;
664 for(i = 0; i < ucs4src_len; i++)
665 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
667 /* get rid of excess size */
668 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
670 return ((char *) ret);
674 #ifdef _WINDOWS
676 * Convert a UTF-8 argument into an LPTSTR version
677 * of that argument. The result is allocated here
678 * and should be freed by the caller.
680 LPTSTR
681 utf8_to_lptstr(LPSTR arg_utf8)
683 int lptstr_len;
684 LPTSTR lptstr_ret = NULL;
686 lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
687 if(lptstr_len > 0)
689 lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
690 lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
691 arg_utf8, -1, lptstr_ret, lptstr_len );
694 if(!lptstr_len)
696 /* check GetLastError()? */
697 lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
698 lptstr_ret[0] = 0;
701 return lptstr_ret;
706 * Convert an LPTSTR argument into a UTF-8 version
707 * of that argument. The result is allocated here
708 * and should be freed by the caller.
710 LPSTR
711 lptstr_to_utf8(LPTSTR arg_lptstr)
713 int utf8str_len;
714 LPSTR utf8str_ret = NULL;
716 utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
717 if(utf8str_len > 0)
719 utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
720 utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
721 arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
724 if(!utf8str_len)
726 /* check GetLastError()? */
727 utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
728 utf8str_ret[0] = 0;
731 return utf8str_ret;
736 * Convert a UCS4 argument into an LPTSTR version
737 * of that argument. The result is allocated here
738 * and should be freed by the caller.
740 LPTSTR
741 ucs4_to_lptstr(UCS *arg_ucs4)
743 LPTSTR ret_lptstr = NULL;
744 size_t len;
745 size_t i;
747 if(arg_ucs4){
748 len = ucs4_strlen(arg_ucs4);
749 ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
750 /* bogus conversion ignores UTF-16 */
751 for(i = 0; i < len; i++)
752 ret_lptstr[i] = arg_ucs4[i];
754 ret_lptstr[len] = '\0';
757 return(ret_lptstr);
762 * Convert an LPTSTR argument into a UCS4 version
763 * of that argument. The result is MemAlloc'd here
764 * and should be freed by the caller.
766 UCS *
767 lptstr_to_ucs4(LPTSTR arg_lptstr)
769 UCS *ret_ucs4 = NULL;
770 size_t len;
771 size_t i;
773 if(arg_lptstr){
774 len = _tcslen(arg_lptstr);
775 ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
776 /* bogus conversion ignores UTF-16 */
777 for(i = 0; i < len; i++)
778 ret_ucs4[i] = arg_lptstr[i];
780 ret_ucs4[len] = '\0';
783 return(ret_ucs4);
786 #endif /* _WINDOWS */
790 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
791 * 1-at-a-time filled in with UCS characters. The return value is the
792 * number of valid characters in obuf to be used. It can only
793 * be 1 or 0 characters since we're only getting one UTF-8 character
794 * at a time.
797 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
799 int width = 0, outchars = 0;
801 if(!(cb && cb->cbufp))
802 return(0);
804 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
805 unsigned char *inputp;
806 unsigned long remaining_octets;
807 UCS ucs;
809 *cb->cbufp++ = (unsigned char) c;
810 inputp = cb->cbuf;
811 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
812 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
814 switch(ucs){
815 case U8G_ENDSTRG: /* incomplete character, wait */
816 case U8G_ENDSTRI: /* incomplete character, wait */
817 break;
819 default:
820 if(ucs & U8G_ERROR || ucs == UBOGON){
822 * None of these cases is supposed to happen. If it
823 * does happen then the input stream isn't UTF-8
824 * so something is wrong.
826 outchars++;
827 *obuf = '?';
828 cb->cbufp = cb->cbuf;
829 width = 1;
831 else{
832 outchars++;
833 if(ucs < 0x80 && ucs >= 0x20)
834 width = 1;
836 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
838 * This happens when we have a UTF-8 character that
839 * we aren't able to print in our locale. For example,
840 * if the locale is setup with the terminal
841 * expecting ISO-8859-1 characters then there are
842 * lots of UTF-8 characters that can't be printed.
843 * Print a '?' instead.
844 * Don't think this should happen in Windows.
846 *obuf = '?';
848 else{
849 *obuf = ucs;
852 /* update the input buffer */
853 if(inputp >= cb->cbufp) /* this should be the case */
854 cb->cbufp = cb->cbuf;
855 else{ /* extra chars for some reason? */
856 unsigned char *q, *newcbufp;
858 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
859 q = cb->cbuf;
860 while(inputp < cb->cbufp)
861 *q++ = *inputp++;
863 cb->cbufp = newcbufp;
867 break;
870 else{ /* error */
871 *obuf = '?';
872 outchars = 1;
873 width = 1;
874 cb->cbufp = cb->cbuf; /* start over */
877 if(obufwidth)
878 *obufwidth = width;
880 return(outchars);
885 * Return an allocated copy of a zero-terminated UCS-4 string.
887 UCS *
888 ucs4_cpystr(UCS *ucs4src)
890 size_t arraysize;
891 UCS *ret = NULL;
892 size_t i;
894 if(!ucs4src)
895 return NULL;
897 arraysize = ucs4_strlen(ucs4src);
899 ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
900 memset(ret, 0, (arraysize+1) * sizeof(*ret));
902 for(i = 0; i < arraysize; i++)
903 ret[i] = ucs4src[i];
905 return ret;
909 UCS *
910 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
912 size_t i;
914 if(ucs4src && ucs4dst){
915 for(i = 0; i < n; i++){
916 ucs4dst[i] = ucs4src[i];
917 if(ucs4dst[i] == '\0')
918 break;
922 return ucs4dst;
926 UCS *
927 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
929 size_t i;
930 UCS *u;
932 if(ucs4src && ucs4dst){
933 for(u = ucs4dst; *u; u++)
936 for(i = 0; i < n; i++){
937 u[i] = ucs4src[i];
938 if(u[i] == '\0')
939 break;
942 if(i == n)
943 u[i] = '\0';
946 return ucs4dst;
951 * Like strlen only this returns the number of non-zero characters
952 * in a zero-terminated UCS-4 array.
954 size_t
955 ucs4_strlen(UCS *ucs4str)
957 size_t i = 0;
959 if(ucs4str)
960 while(ucs4str[i])
961 i++;
963 return(i);
968 ucs4_strcmp(UCS *s1, UCS *s2)
970 for(; *s1 == *s2; s1++, s2++)
971 if(*s1 == '\0')
972 return 0;
974 return((*s1 < *s2) ? -1 : 1);
978 UCS *
979 ucs4_strchr(UCS *s, UCS c)
981 if(!s)
982 return NULL;
984 while(*s && *s != c)
985 s++;
987 if(*s || !c)
988 return s;
989 else
990 return NULL;
994 UCS *
995 ucs4_strrchr(UCS *s, UCS c)
997 UCS *ret = NULL;
999 if(!s)
1000 return ret;
1002 while(*s){
1003 if(*s == c)
1004 ret = s;
1006 s++;
1009 return ret;
1014 * Returns the screen cells width of the UTF-8 string argument.
1016 unsigned
1017 utf8_width(char *str)
1019 unsigned width = 0;
1020 int this_width;
1021 UCS ucs;
1022 unsigned long remaining_octets;
1023 char *readptr;
1025 if(!(str && *str))
1026 return(width);
1028 readptr = str;
1029 remaining_octets = readptr ? strlen(readptr) : 0;
1031 while(remaining_octets > 0 && *readptr){
1033 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1035 if(ucs & U8G_ERROR || ucs == UBOGON){
1037 * This should not happen, but do something to handle it anyway.
1038 * Treat each character as a single width character, which is what should
1039 * probably happen when we actually go to write it out.
1041 remaining_octets--;
1042 readptr++;
1043 this_width = 1;
1045 else{
1046 this_width = wcellwidth(ucs);
1049 * If this_width is -1 that means we can't print this character
1050 * with our current locale. Writechar will print a '?'.
1052 if(this_width < 0)
1053 this_width = 1;
1056 width += (unsigned) this_width;
1059 return(width);
1064 * Copy UTF-8 characters from src into dst.
1065 * This is intended to be used if you want to truncate a string at
1066 * the start instead of the end. For example, you have a long string
1067 * like
1068 * this_is_a_long_string
1069 * but not enough space to fit it into a particular field. You want to
1070 * end up with
1071 * s_a_long_string
1072 * where that fits in a particular width. Perhaps you'd use this with ...
1073 * to get
1074 * ...s_a_long_string
1075 * This right adjusts the end of the string in the width space and
1076 * cuts it off at the start. If there is enough width for the whole
1077 * string it will copy the string into dst with no padding.
1079 * Copy enough characters so that the result will have screen width of
1080 * want_width screen cells in current locale.
1082 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1083 * to dst. This is just for protection, it shouldn't be relied on to
1084 * do anything useful. Dstlen should be large enough. Otherwise you'll get
1085 * characters truncated in the middle or something like that.
1087 * Returned value is the number of bytes written to dst, not including
1088 * the possible terminating null.
1090 * If we can't hit want_width exactly because of double width characters
1091 * then we will pad the end of the string with space in order to make
1092 * the width exact.
1094 size_t
1095 utf8_to_width_rhs(char *dst, /* destination buffer */
1096 char *src, /* source string */
1097 size_t dstlen, /* space in dest */
1098 unsigned want_width) /* desired screen width */
1100 int this_width;
1101 unsigned width_consumed = 0;
1102 UCS ucs;
1103 unsigned long remaining_octets;
1104 char *readptr, *goodreadptr, *savereadptr, *endptr;
1105 size_t nb = 0;
1107 if(!src){
1108 if(dstlen > 0)
1109 dst[0] = '\0';
1111 return nb;
1115 * Start at the end of the source string and go backwards until we
1116 * get to the desired width, but not more than the width.
1118 readptr = src + strlen(src);
1119 endptr = readptr;
1120 goodreadptr = readptr;
1121 width_consumed = 0;
1122 savereadptr = readptr;
1124 for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1125 readptr = savereadptr-1){
1127 savereadptr = readptr;
1128 remaining_octets = goodreadptr - readptr;
1129 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1132 * Handling the error case is tough because an error will be the normal thing that
1133 * happens as we back through the string. So we're just going to punt on the
1134 * error for now.
1136 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1137 if(remaining_octets > 0){
1139 * This means there are some bad octets after this good
1140 * character so things are not going to work out well.
1141 * Bail out.
1143 savereadptr = src; /* we're done */
1145 else{
1146 this_width = wcellwidth(ucs);
1148 if(this_width < 0)
1149 this_width = 1;
1151 if(width_consumed + (unsigned) this_width <= want_width){ /* ok */
1152 width_consumed += (unsigned) this_width;
1153 goodreadptr = savereadptr;
1155 else
1156 savereadptr = src; /* we're done */
1162 * Copy characters from goodreadptr to endptr into dst.
1164 nb = MIN(endptr-goodreadptr, dstlen-1);
1165 strncpy(dst, goodreadptr, nb);
1166 dst[nb] = '\0';
1169 * Pad out with spaces in order to hit width exactly.
1171 while(width_consumed < want_width && nb < dstlen-1){
1172 dst[nb++] = ' ';
1173 dst[nb] = '\0';
1174 width_consumed++;
1177 return nb;
1182 * The arguments being converted are UTF-8 strings.
1183 * This routine attempts to make it possible to use screen cell
1184 * widths in a format specifier. In a one-byte per screen cell
1185 * world we might have used %10.10s to cause a string to occupy
1186 * 10 screen positions. Since the width and precision are really
1187 * referring to numbers of bytes instead of screen positions that
1188 * won't work with UTF-8 input. We emulate that behavior with
1189 * the format string %w. %m.nw means to use the m and n as
1190 * screen width indicators instead of bytes indicators.
1192 * There is no reason to use this routine unless you want to use
1193 * min field with or precision with the specifier. A plain %w without
1194 * widths is equivalent exactly to a plain %s in a regular printf.
1196 * Double-width characters complicate things. It may not be possible
1197 * to satisfy the request exactly. For example, %3w for an input
1198 * string that is made up of two double-width characters.
1199 * This routine will arbitrarily use a trailing space character if
1200 * needed to make the width come out correctly where a half of a
1201 * double-width character would have been needed. We'll see how
1202 * that works for us.
1204 * %w only works for strings (it's a %s replacement).
1206 * Buffer overflow is handled by the size argument. %.30s will work
1207 * to limit a particular string to 30 bytes, but you lose that
1208 * ability with %w, since it may write more than precision bytes
1209 * in order to get to the desired width. It is best to choose
1210 * size large enough so that it doesn't come into play, otherwise
1211 * it may be possible to get partial UTF-8 characters because of
1212 * the truncation.
1214 * The return value isn't quite the same as the return value
1215 * of snprintf. It is the number of bytes written, not counting
1216 * the trailing null, just like snprintf. However, if it is
1217 * truncated due to size then the output is size, not the
1218 * number of characters that would have been written.
1221 utf8_snprintf(char *dest, size_t size, char *fmt, ...)
1223 char newfmt[100], buf[20], *q, *pdest, *width_str, *end;
1224 char *start_of_specifier;
1225 char *input_str;
1226 int int_arg;
1227 double double_arg;
1228 void *ptr_arg;
1229 unsigned got_width;
1230 int more_flags, ret, w;
1231 int min_field_width, field_precision, modifier;
1232 int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
1233 va_list args;
1235 newfmt[0] = '\0';
1236 q = newfmt;
1238 pdest = dest;
1240 #define IS_ROOM_IN_DEST(n_more_chars) \
1241 ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)
1244 * Strategy: Look through the fmt string for %w's. Replace the
1245 * %w's in the format string with %s's but with possibly different
1246 * width and precision arguments which will make it come out right.
1247 * Then call the regular system vsnprintf with the altered format
1248 * string but same arguments.
1250 * That would be nice but it doesn't quite work. Why? Because a
1251 * %*w will need to have the value in the integer argument the *
1252 * refers to modified. Can't do it as far as I can tell. Or we could
1253 * remove the integer argument somehow before calling printf. Can't
1254 * do it. Or we could somehow add an additional conversion specifier
1255 * that caused nothing to be printed but ate up the integer arg.
1256 * Can't figure out how to do that either.
1258 * Since we can't figure out how to do it, the alternative is to
1259 * construct the result one piece at a time, pasting together the
1260 * pieces from the different conversions.
1262 va_start(args, fmt);
1264 while(*fmt && IS_ROOM_IN_DEST(1)){
1265 if(*fmt == '%'){
1266 start_of_specifier = fmt++;
1268 min_field_width = field_precision = -1;
1269 flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
1271 /* flags */
1272 more_flags = 1;
1273 while(more_flags){
1274 switch(*fmt){
1275 case '-':
1276 flags_minus++;
1277 fmt++;
1278 break;
1280 case '+':
1281 flags_plus++;
1282 fmt++;
1283 break;
1285 case ' ':
1286 flags_space++;
1287 fmt++;
1288 break;
1290 case '0':
1291 flags_zero++;
1292 fmt++;
1293 break;
1295 case '#':
1296 flags_pound++;
1297 fmt++;
1298 break;
1300 default:
1301 more_flags = 0;
1302 break;
1306 /* minimum field width */
1307 if(*fmt == '*'){
1308 min_field_width = va_arg(args, int);
1309 fmt++;
1311 else if(*fmt >= '0' && *fmt <= '9'){
1312 width_str = fmt;
1313 while (*fmt >= '0' && *fmt <= '9')
1314 fmt++;
1316 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1317 if(sizeof(buf) > fmt-width_str)
1318 buf[fmt-width_str] = '\0';
1320 buf[sizeof(buf)-1] = '\0';
1322 min_field_width = atoi(width_str);
1325 /* field precision */
1326 if(*fmt == '.'){
1327 fmt++;
1328 if(*fmt == '*'){
1329 field_precision = va_arg(args, int);
1330 fmt++;
1332 else if(*fmt >= '0' && *fmt <= '9'){
1333 width_str = fmt;
1334 while (*fmt >= '0' && *fmt <= '9')
1335 fmt++;
1337 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1338 if(sizeof(buf) > fmt-width_str)
1339 buf[fmt-width_str] = '\0';
1341 buf[sizeof(buf)-1] = '\0';
1343 field_precision = atoi(width_str);
1347 /* length modifier */
1348 if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
1349 modifier = *fmt++;
1351 /* conversion character */
1352 switch(*fmt){
1353 case 'w':
1355 * work with va_arg(char *) to figure out width
1356 * and precision needed to produce the screen width
1357 * and precision asked for in %w using some of the
1358 * utf8 width routines we have.
1361 input_str = va_arg(args, char *);
1362 if(field_precision >=0 || min_field_width >= 0)
1363 w = utf8_width(input_str);
1365 if(field_precision >= 0){
1366 if(w <= field_precision)
1367 field_precision = -1; /* print it all */
1368 else{
1370 * We need to cut off some of the input_str
1371 * in this case.
1373 end = utf8_count_forw_width(input_str, field_precision, &got_width);
1374 field_precision = (int) (end - input_str);
1375 /* new w with this field_precision */
1376 w = got_width;
1380 /* need some padding */
1381 if(min_field_width >= 0)
1382 min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
1383 MAX(0, min_field_width - w);
1386 * Now we just need to get the new format string
1387 * set correctly in newfmt.
1389 q = newfmt;
1390 if(q-newfmt < sizeof(newfmt))
1391 *q++ = '%';
1393 if(flags_minus && q-newfmt < sizeof(newfmt))
1394 *q++ = '-';
1395 if(flags_plus && q-newfmt < sizeof(newfmt))
1396 *q++ = '+';
1397 if(flags_space && q-newfmt < sizeof(newfmt))
1398 *q++ = ' ';
1399 if(flags_zero && q-newfmt < sizeof(newfmt))
1400 *q++ = '0';
1401 if(flags_pound && q-newfmt < sizeof(newfmt))
1402 *q++ = '#';
1404 if(min_field_width >= 0){
1405 snprintf(buf, sizeof(buf), "%d", min_field_width);
1406 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1409 if(field_precision >= 0){
1410 if(q-newfmt < sizeof(newfmt))
1411 *q++ = '.';
1413 snprintf(buf, sizeof(buf), "%d", field_precision);
1414 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1417 if(q-newfmt < sizeof(newfmt))
1418 *q++ = 's';
1420 if(q-newfmt < sizeof(newfmt))
1421 *q++ = '\0';
1423 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1424 pdest += strlen(pdest);
1426 break;
1428 case '\0':
1429 fmt--;
1430 break;
1432 default:
1433 /* make a new format which leaves out the dynamic '*' arguments */
1434 q = newfmt;
1435 if(q-newfmt < sizeof(newfmt))
1436 *q++ = '%';
1438 if(flags_minus && q-newfmt < sizeof(newfmt))
1439 *q++ = '-';
1440 if(flags_plus && q-newfmt < sizeof(newfmt))
1441 *q++ = '+';
1442 if(flags_space && q-newfmt < sizeof(newfmt))
1443 *q++ = ' ';
1444 if(flags_zero && q-newfmt < sizeof(newfmt))
1445 *q++ = '0';
1446 if(flags_pound && q-newfmt < sizeof(newfmt))
1447 *q++ = '#';
1449 if(min_field_width >= 0){
1450 snprintf(buf, sizeof(buf), "%d", min_field_width);
1451 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1454 if(field_precision >= 0){
1455 if(q-newfmt < sizeof(newfmt))
1456 *q++ = '.';
1458 snprintf(buf, sizeof(buf), "%d", field_precision);
1459 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1462 if(q-newfmt < sizeof(newfmt))
1463 *q++ = *fmt;
1465 if(q-newfmt < sizeof(newfmt))
1466 *q++ = '\0';
1468 switch(*fmt){
1469 case 'd': case 'i': case 'o':
1470 case 'x': case 'X': case 'u': case 'c':
1471 int_arg = va_arg(args, int);
1472 snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
1473 pdest += strlen(pdest);
1474 break;
1476 case 's':
1477 input_str = va_arg(args, char *);
1478 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1479 pdest += strlen(pdest);
1480 break;
1482 case 'f': case 'e': case 'E':
1483 case 'g': case 'G':
1484 double_arg = va_arg(args, double);
1485 snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
1486 pdest += strlen(pdest);
1487 break;
1489 case 'p':
1490 ptr_arg = va_arg(args, void *);
1491 snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
1492 pdest += strlen(pdest);
1493 break;
1495 case '%':
1496 if(IS_ROOM_IN_DEST(1))
1497 *pdest++ = '%';
1499 break;
1501 default:
1502 /* didn't think of this type */
1503 assert(0);
1504 break;
1507 break;
1510 fmt++;
1512 else{
1513 if(IS_ROOM_IN_DEST(1))
1514 *pdest++ = *fmt++;
1518 ret = pdest - dest;
1520 if(IS_ROOM_IN_DEST(1))
1521 *pdest++ = '\0';
1523 va_end(args);
1525 return ret;
1530 * Copy UTF-8 characters from src into dst.
1531 * Copy enough characters so that the result will have (<=) screen width of
1532 * want_width screen cells in current locale.
1534 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1535 * to dst.
1537 * Returned value is the number of bytes written to dst, not including
1538 * the possible terminating null.
1539 * Got_width is another returned value. It is the width in screen cells of
1540 * the string placed in dst. It will be the same as want_width if there
1541 * are enough characters in the src to do that and if the character widths
1542 * hit the width exactly. It will be less than want_width if we run out
1543 * of src characters or if the next character width would skip over the
1544 * width we want, because it is double width.
1546 * Zero width characters are collected and included at the end of the string.
1547 * That is, if we make it to want_width but there is still a zero length
1548 * character sitting in src, we add that to dst. This might be an accent
1549 * or something like that.
1551 size_t
1552 utf8_to_width(char *dst, /* destination buffer */
1553 char *src, /* source string */
1554 size_t dstlen, /* space in dst */
1555 unsigned want_width, /* desired screen width */
1556 unsigned *got_width) /* returned screen width in dst */
1558 int this_width;
1559 unsigned width_consumed = 0;
1560 UCS ucs;
1561 unsigned long remaining_octets;
1562 char *writeptr, *readptr, *savereadptr, *endptr;
1563 int ran_out_of_space = 0;
1565 readptr = src;
1567 remaining_octets = readptr ? strlen(readptr) : 0;
1569 writeptr = dst;
1570 endptr = writeptr + dstlen;
1572 if(readptr && writeptr){
1573 while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1574 savereadptr = readptr;
1575 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1577 if(ucs & U8G_ERROR || ucs == UBOGON)
1578 remaining_octets = 0;
1579 else{
1580 this_width = wcellwidth(ucs);
1583 * If this_width is -1 that means we can't print this character
1584 * with our current locale. Writechar will print a '?'.
1586 if(this_width < 0)
1587 this_width = 1;
1589 if(width_consumed + (unsigned) this_width <= want_width){
1590 /* append this utf8 character to dst if it will fit */
1591 if(writeptr + (readptr - savereadptr) < endptr){
1592 width_consumed += this_width;
1593 while(savereadptr < readptr)
1594 *writeptr++ = *savereadptr++;
1596 else
1597 ran_out_of_space++; /* no more utf8 to dst */
1599 else
1600 remaining_octets = 0; /* we're done */
1604 if(writeptr < endptr)
1605 *writeptr = '\0';
1608 if(got_width)
1609 *got_width = width_consumed;
1611 return(writeptr ? (writeptr - dst) : 0);
1616 * Str is a UTF-8 string.
1617 * Count forward width screencell positions and return a pointer to the
1618 * end of the string that is width wide.
1619 * The returned pointer points at the next character (where the null would
1620 * be placed).
1622 * Got_width is another returned value. It is the width in screen cells of
1623 * the string from str to the returned pointer. It will be the same as
1624 * want_width if there are enough characters in the str to do that
1625 * and if the character widths hit the width exactly. It will be less
1626 * than want_width if we run out of characters or if the next character
1627 * width would skip over the width we want, because it is double width.
1629 char *
1630 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1632 int this_width;
1633 unsigned width_consumed = 0;
1634 UCS ucs;
1635 unsigned long remaining_octets;
1636 char *readptr;
1637 char *retptr;
1639 retptr = readptr = str;
1641 remaining_octets = readptr ? strlen(readptr) : 0;
1643 while(width_consumed <= want_width && remaining_octets > 0){
1645 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1647 if(ucs & U8G_ERROR || ucs == UBOGON){
1649 * This should not happen, but do something to handle it anyway.
1650 * Treat each character as a single width character, which is what should
1651 * probably happen when we actually go to write it out.
1653 remaining_octets--;
1654 readptr++;
1655 this_width = 1;
1657 else{
1658 this_width = wcellwidth(ucs);
1661 * If this_width is -1 that means we can't print this character
1662 * with our current locale. Writechar will print a '?'.
1664 if(this_width < 0)
1665 this_width = 1;
1668 if(width_consumed + (unsigned) this_width <= want_width){
1669 width_consumed += (unsigned) this_width;
1670 retptr = readptr;
1672 else
1673 remaining_octets = 0; /* we're done */
1676 if(got_width)
1677 *got_width = width_consumed;
1679 return(retptr);
1684 * Copy a null terminator into a UTF-8 string in place so that the string is
1685 * no more than a certain screen width wide. If the string is already less
1686 * than or equal in width to the requested width, no change is made.
1688 * The actual width accomplished is returned. Note that it may be less than
1689 * max_width due to double width characters as well as due to the fact that
1690 * it fits wholly in the max_width.
1692 * Returned value is the actual screen width of str when done.
1694 * A side effect is that a terminating null may have been written into
1695 * the passed in string.
1697 unsigned
1698 utf8_truncate(char *str, unsigned max_width)
1700 int this_width;
1701 unsigned width_consumed = 0;
1702 UCS ucs;
1703 unsigned long remaining_octets;
1704 char *readptr, *savereadptr;
1706 readptr = str;
1708 remaining_octets = readptr ? strlen(readptr) : 0;
1710 if(readptr){
1711 while(width_consumed <= max_width && remaining_octets > 0){
1713 savereadptr = readptr;
1714 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1716 if(ucs & U8G_ERROR || ucs == UBOGON){
1718 * This should not happen, but do something to handle it anyway.
1719 * Treat each character as a single width character, which is what should
1720 * probably happen when we actually go to write it out.
1722 remaining_octets--;
1723 readptr++;
1724 this_width = 1;
1726 else{
1727 this_width = wcellwidth(ucs);
1730 * If this_width is -1 that means we can't print this character
1731 * with our current locale. Writechar will print a '?'.
1733 if(this_width < 0)
1734 this_width = 1;
1737 if(width_consumed + (unsigned) this_width <= max_width){
1738 width_consumed += (unsigned) this_width;
1740 else{
1741 remaining_octets = 0; /* we're done */
1742 *savereadptr = '\0';
1747 return(width_consumed);
/*
 * Copy UTF-8 characters from src into dst so that the result is
 * want_width screen cells wide in the current locale.
 * The text itself is copied by utf8_to_width(); if that yields fewer than
 * want_width cells the remainder is made up with spaces, appended when
 * left_adjust is set and inserted in front of the text otherwise.
 *
 * Dstlen is the available space in dst.  No more than dstlen bytes will
 * be written to dst, so fewer spaces than needed are added when they do
 * not all fit.  Dst will be null terminated if there is enough room, but
 * not if that would overflow dst's len.
 *
 * Returned value is the number of bytes written to dst, not including
 * the possible terminating null.  With a NULL dst nothing is written and
 * whatever utf8_to_width() returned is passed back.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, room, pad;

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);

    /* nowhere to put padding or a terminator */
    if(!dst)
      return(bytes_used);

    /* spaces still owed, limited to what fits in the rest of dst */
    room = (bytes_used < dstlen) ? dstlen - bytes_used : 0;
    pad  = (got_width < want_width) ? (size_t) (want_width - got_width) : 0;
    if(pad > room)
      pad = room;

    if(pad > 0){
        if(left_adjust){
            /* text stays put, spaces go after it */
            memset(dst + bytes_used, ' ', pad);
        }
        else{
            /*
             * Slide the text right and fill the gap in front of it.
             * The regions overlap, hence memmove rather than memcpy.
             */
            memmove(dst + pad, dst, bytes_used);
            memset(dst, ' ', pad);
        }

        bytes_used += pad;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1821 * Str is a UTF-8 string.
1822 * Start_here is a pointer into the string. It points one position past
1823 * the last byte that should be considered a part of the length string.
1824 * Count back want_width screencell positions and return a pointer to the
1825 * start of the string that is want_width wide and ends with start_here.
1827 * Since characters may be more than one cell width wide we may end up
1828 * skipping over the exact width. That is, if we need to we'll go back
1829 * too far (by one cell width). Account for that in the call by looking
1830 * at got_width.
1832 * Note that this call gives a possible got_width == want_width+1 as
1833 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1834 * That was just what was needed at the time, maybe it needs to be
1835 * optional.
1837 char *
1838 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1840 unsigned width_consumed = 0;
1841 int this_width;
1842 UCS ucs;
1843 unsigned long remaining_octets;
1844 char *ptr, *savereadptr, *goodreadptr;
1846 savereadptr = start_here;
1847 goodreadptr = start_here;
1849 for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1851 savereadptr = ptr;
1852 remaining_octets = goodreadptr - ptr;
1853 ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1855 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1856 if(remaining_octets > 0){
1858 * This means there are some bad octets after this good
1859 * character so things are not going to work out well.
1860 * Bail out.
1862 savereadptr = str; /* we're done */
1864 else{
1865 this_width = wcellwidth(ucs);
1868 * If this_width is -1 that means we can't print this character
1869 * with our current locale. Writechar will print a '?'.
1871 if(this_width < 0)
1872 this_width = 1;
1874 width_consumed += (unsigned) this_width;
1875 goodreadptr = savereadptr;
1880 if(got_width)
1881 *got_width = width_consumed;
1883 return(savereadptr);
/*----------------------------------------------------------------------
  Copy at most n characters of the source string s to the place *d points
  at, and leave *d pointing at the end of the copied text.

  If the source's terminating null is reached within the n characters it
  is copied too and *d is left pointing AT it, so that a following call
  appends.  If n runs out first, no null is stored and *d points just
  past the last character copied.

  Motivation for this is to avoid twice passing over a string that's
  being appended to twice (i.e., strcpy(t, x); t += strlen(t))

  This doesn't really belong here but it is used here.
  ----*/
void
sstrncpy(char **d, char *s, int n)
{
    char *out = *d;

    for(; n > 0; n--, s++, out++){
        *out = *s;
        if(*out == '\0')
          break;			/* stay on the terminator */
    }

    *d = out;
}
1905 * If use_system_routines is set then NULL is the return value and it is
1906 * not an error. Display_charmap and keyboard_charmap should come over as
1907 * malloced strings and will be filled in with the result.
1909 * Returns a void pointer to the input_cs CHARSET which is
1910 * passed to mbtow via kbseq().
1911 * If !use_system_routines && NULL is returned, that is an error and err should
1912 * have a message.
1913 * display_charmap and keyboard_charmap should be malloced data and may be
1914 * realloced and changed here.
1917 setup_for_input_output(int use_system_routines, char **display_charmap,
1918 char **keyboard_charmap, void **input_cs_arg, char **err)
1920 const CHARSET *cs;
1921 const CHARSET *input_cs = NULL;
1922 int already_tried = 0;
1923 int supported = 0;
1924 char buf[1000];
1926 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1928 if(err)
1929 *err = NULL;
1931 if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1932 *err = cpstr("Bad call to setup_for_input_output");
1933 return(-1);
1936 if(use_system_routines){
1937 #if PREREQ_FOR_SYS_TRANSLATION
1938 char *dcm;
1940 dcm = nl_langinfo_codeset_wrapper();
1941 dcm = dcm ? dcm : "US-ASCII";
1943 init_utf8_display(0, NULL);
1944 if(*display_charmap){
1945 if(dcm && strucmp(*display_charmap, dcm)){
1946 snprintf(buf, sizeof(buf),
1947 _("Display character set \"%s\" is ignored when using system translation"),
1948 *display_charmap);
1950 *err = cpstr(buf);
1953 fs_give((void **) display_charmap);
1956 if(*keyboard_charmap){
1957 if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1958 snprintf(buf, sizeof(buf),
1959 _("Keyboard character set \"%s\" is ignored when using system translation"),
1960 *keyboard_charmap);
1962 *err = cpstr(buf);
1965 fs_give((void **) keyboard_charmap);
1968 *display_charmap = cpstr(dcm);
1969 *keyboard_charmap = cpstr(dcm);
1970 #else
1971 *err = cpstr("Bad call to setup_for_input_output");
1972 #endif
1974 *input_cs_arg = NULL;
1975 return(0);
1979 try_again1:
1980 if(!(*display_charmap))
1981 *display_charmap = cpstr("US-ASCII");
1983 if(!(*keyboard_charmap))
1984 *keyboard_charmap = cpstr(*display_charmap);
1986 if(*keyboard_charmap){
1987 supported = input_charset_is_supported(*keyboard_charmap);
1989 if(supported){
1990 if(!strucmp(*keyboard_charmap, "utf-8"))
1991 input_cs = utf8_charset(*keyboard_charmap);
1992 else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
1993 input_cs = cs;
1995 else{
1996 if(err && !*err){
1997 int iso2022jp = 0;
1999 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2000 iso2022jp = 1;
2002 snprintf(buf, sizeof(buf),
2003 /* TRANSLATORS: The first argument is the name of the character
2004 set the user is trying to use (which is unsupported by alpine).
2005 The second argument is " (except for posting)" if they are
2006 trying to use ISO-2022-JP for something other than posting. */
2007 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2008 *keyboard_charmap,
2009 iso2022jp ? _(" (except for posting)") : "");
2011 *err = cpstr(buf);
2014 input_cs = NULL;
2015 fs_give((void **) keyboard_charmap);
2016 *keyboard_charmap = cpstr("US-ASCII");
2017 if(!already_tried){
2018 already_tried++;
2019 goto try_again1;
2025 try_again2:
2026 if(!(*display_charmap))
2027 *display_charmap = cpstr("US-ASCII");
2029 if(*display_charmap){
2030 supported = output_charset_is_supported(*display_charmap);
2031 if(supported){
2032 if(!strucmp(*display_charmap, "utf-8"))
2033 init_utf8_display(1, NULL);
2034 else if((cs = utf8_charset(*display_charmap)) != NULL)
2035 init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2037 else{
2038 if(err && !*err){
2039 int iso2022jp = 0;
2041 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2042 iso2022jp = 1;
2044 snprintf(buf, sizeof(buf),
2045 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2046 *display_charmap,
2047 iso2022jp ? _(" (except for posting)") : "");
2049 *err = cpstr(buf);
2052 fs_give((void **) display_charmap);
2053 if(!already_tried){
2054 already_tried++;
2055 goto try_again2;
2059 else{
2060 if(err && !*err)
2061 *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2064 #undef cpstr
2066 *input_cs_arg = (void *) input_cs;
2068 return(0);
2073 input_charset_is_supported(char *input_charset)
2075 const CHARSET *cs;
2077 if(!(input_charset && *input_charset))
2078 return 0;
2080 if(!strucmp(input_charset, "utf-8"))
2081 return 1;
2083 if((cs = utf8_charset(input_charset)) != NULL){
2086 * This was true 2006-09-25.
2088 switch(cs->type){
2089 case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2090 case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2091 case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2092 case CT_UCS4: case CT_UTF16:
2093 return 1;
2094 break;
2096 default:
2097 break;
2101 return 0;
2106 output_charset_is_supported(char *output_charset)
2108 const CHARSET *cs;
2110 if(!(output_charset && *output_charset))
2111 return 0;
2113 if(!strucmp(output_charset, "utf-8"))
2114 return 1;
2116 if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2117 return 1;
2119 return 0;
/*
 * Returns 1 if posting_charset names a character set that can be used
 * for posting, 0 if not (including when the name is NULL or empty).
 *
 * That is ISO-2022-JP (in any case) plus everything that
 * output_charset_is_supported() accepts.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * The result is NULL if no supported charset could be worked out.
 * Otherwise it points at the string nl_langinfo() returned, at a string
 * constant, or at a static buffer in this function that the next call
 * may overwrite.  In no case should the caller free or modify it.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        char *p;

        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        else if(!strucmp(ret, "EUCKP"))
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else if((p = strstr(ret, "8859")) != NULL){

            /* check for digits after 8859 */
            p += 4;

            /*
             * Step over one separator character (as in "8859-1"), but
             * never over the end of the string.
             */
            if(*p && !isdigit((unsigned char) *p))
              p++;

            if(isdigit((unsigned char) *p)){
                static char buf[12];

                /* zero fill so the one or two digit name is terminated */
                memset(buf, 0, sizeof(buf));
                strncpy(buf, "ISO-8859-", sizeof(buf));
                buf[9] = *p++;
                if(isdigit((unsigned char) *p))
                  buf[10] = *p;

                ret = buf;
            }
        }
    }

    /* whatever we came up with still has to be usable for display */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2200 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2201 * needed the return value will point to orig. If a conversion is done,
2202 * the return string should be freed by the caller.
2203 * If not possible, returns NULL.
2205 char *
2206 utf8_to_charset(char *orig, char *charset, int report_err)
2208 SIZEDTEXT src, dst;
2209 char *ret = orig;
2211 if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2212 return ret;
2214 src.size = strlen(orig);
2215 src.data = (unsigned char *) orig;
2217 if(!strucmp(charset, "us-ascii")){
2218 size_t i;
2220 for(i = 0; i < src.size; i++)
2221 if(src.data[i] & 0x80)
2222 return NULL;
2224 return ret;
2228 * This works for ISO-2022-JP because of special code in utf8_cstext
2229 * but not for other 2022 charsets.
2231 memset(&dst, 0, sizeof(dst));
2232 if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2233 ret = (char *) dst.data; /* c-client already null terminates it */
2234 else
2235 ret = NULL;
2237 if((unsigned char *) ret != dst.data && dst.data)
2238 fs_give((void **) &dst.data);
2240 return ret;
/*
 * Turn a number into a string with commas
 *
 *   Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas,
 *         every three digits counted from the right.  A negative number
 *         gets a single leading '-'.
 *         Can use up to 3 comatose results at once; the fourth call
 *         reuses the buffer of the first.
 */
char *
comatose(long int number)
{
    static char    buf[3][50];
    static int     whichbuf = 0;
    char           digits[32];
    unsigned long  magnitude;
    size_t         ndigits, i;
    char          *b, *last;

    whichbuf = (whichbuf + 1) % 3;
    b    = buf[whichbuf];
    last = b + sizeof(buf[0]) - 1;	/* reserved for the terminator */

    /* negate in unsigned arithmetic so the most negative long is safe */
    magnitude = (number < 0) ? 0UL - (unsigned long) number
                             : (unsigned long) number;

    snprintf(digits, sizeof(digits), "%lu", magnitude);
    ndigits = strlen(digits);

    if(number < 0)
      *b++ = '-';

    for(i = 0; i < ndigits && b < last; i++){
        /* comma whenever a multiple of three digits remains */
        if(i > 0 && (ndigits - i) % 3 == 0)
          *b++ = ',';

        if(b < last)
          *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
/*
 * Like comatose() but leave out the commas: the number is simply
 * printed in decimal into a static string.
 * Can use up to 3 results at once; the fourth call reuses the buffer of
 * the first.
 */
char *
tose(long int number)
{
    static char  buf[3][50];
    static int   whichbuf = 0;
    char        *slot;

    whichbuf = (whichbuf + 1) % 3;
    slot = buf[whichbuf];

    snprintf(slot, sizeof(buf[0]), "%ld", number);

    return(slot);
}
2306 * line_paint - where the real work of managing what is displayed gets done.
2308 void
2309 line_paint(int offset, /* current dot offset into vl */
2310 struct display_line *displ,
2311 int *passwd) /* flag to hide display of chars */
2313 int i, w, w2, already_got_one = 0;
2314 int vfirst, vlast, dfirst, dlast, vi, di;
2315 int new_vbase;
2316 unsigned (*width_a_to_b)(UCS *, int, int);
2319 * Set passwd to 10 in caller if you want to conceal the
2320 * password but not print asterisks for feedback.
2322 * Set passwd to 1 in caller to conceal by printing asterisks.
2324 if(passwd && *passwd >= 10){ /* don't show asterisks */
2325 if(*passwd > 10)
2326 return;
2327 else
2328 *passwd = 11; /* only blat once */
2330 i = 0;
2331 (*displ->movecursor)(displ->row, displ->col);
2332 while(i++ <= displ->dwid)
2333 (*displ->writechar)(' ');
2335 (*displ->movecursor)(displ->row, displ->col);
2336 return;
2339 if(passwd && *passwd)
2340 width_a_to_b = single_width_chars_a_to_b;
2341 else
2342 width_a_to_b = ucs4_str_width_a_to_b;
2345 * vl is the virtual line (the actual data). We operate on it by typing
2346 * characters to be added and deleting and so forth. In this routine we
2347 * copy a subset of those UCS-4 characters in vl into dl, the display
2348 * array, and show that subset on the screen.
2350 * Offset is the location of the cursor in vl.
2352 * We will display the string starting from vbase.
2353 * We have dwid screen cells to work in.
2354 * We may have to adjust vbase in order to display the
2355 * part of the string that contains the cursor.
2357 * We'll make the display look like
2358 * vl a b c d e f g h i j k l m
2359 * xxxxxxxxxxxxx <- width dwid window
2360 * < d e f g h >
2362 * vbase
2363 * The < will be there if vbase > 0.
2364 * The > will be there if the string from vbase to the
2365 * end can't all fit in the window.
2368 memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2371 * Adjust vbase so offset is not out of the window to the right.
2372 * (The +2 in w + 2 is for a possible " >" if the string goes past
2373 * the right hand edge of the window and if the last visible character
2374 * is double wide. We don't want the offset to be under that > character.)
2376 for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2377 displ->dwid > 1 &&
2378 w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2379 w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2381 * offset is off the window to the right
2382 * It looks like a b c d e f g h
2383 * | |
2384 * vbase offset
2385 * and offset is either past the right edge,
2386 * or right at the right edge (and maybe under >),
2387 * or one before right at the edge (and maybe on space
2388 * for half a character).
2390 * Since the characters may be double width it is slightly
2391 * complicated to figure out how far to increase vbase.
2392 * We're going to scoot over past width w/2 characters and
2393 * then see if that's sufficient.
2395 new_vbase = displ->vbase + 1;
2396 for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2397 w2 < displ->dwid/2;
2398 w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2399 new_vbase++;
2401 displ->vbase = new_vbase;
2404 /* adjust so offset is not out of the window to the left */
2405 while(displ->vbase > 0 && displ->vbase >= offset){
2406 /* add about dwid/2 more width */
2407 new_vbase = displ->vbase - 1;
2408 for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2409 w2 < (displ->dwid+1)/2 && new_vbase > 0;
2410 w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2411 new_vbase--;
2413 /* but don't let it get too small, recheck off right end */
2414 for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2415 w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2416 w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2417 new_vbase++;
2419 displ->vbase = MAX(new_vbase, 0);
2422 if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2423 displ->vbase = 0;
2425 vfirst = displ->vbase;
2426 dfirst = 0;
2427 if(displ->vbase > 0){ /* off screen cue left */
2428 dfirst = 1; /* index which matches vfirst */
2429 displ->dl[0] = '<';
2432 vlast = displ->vused-1; /* end */
2433 w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2435 if(displ->dwid > 0 && w + dfirst > displ->dwid){ /* off window right */
2437 /* find last ucs character to be printed */
2438 while(w + dfirst > displ->dwid - 1) /* -1 for > */
2439 w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2441 /* worry about double-width characters */
2442 if(w + dfirst == displ->dwid - 1){ /* no prob, hit it exactly */
2443 dlast = dfirst + vlast - vfirst + 1; /* +1 for > */
2444 displ->dl[dlast] = '>';
2446 else{
2447 dlast = dfirst + vlast - vfirst + 1;
2448 displ->dl[dlast++] = ' ';
2449 displ->dl[dlast] = '>';
2452 else
2453 dlast = dfirst + vlast - vfirst;
2456 * Copy the relevant part of the virtual line into the display line.
2458 for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2459 if(passwd && *passwd)
2460 displ->dl[di] = '*'; /* to conceal password */
2461 else
2462 displ->dl[di] = displ->vl[vi];
2465 * Add spaces to clear the rest of the line.
2466 * We have dwid total space to fill.
2468 w = (*width_a_to_b)(displ->dl, 0, dlast); /* width through dlast */
2469 for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2470 displ->dl[di++] = ' ';
2473 * Draw from left to right, skipping until we get to
2474 * something that is different. Characters may be different
2475 * widths than they were initially so paint from there the
2476 * rest of the way.
2478 for(di = 0; displ->dl[di]; di++){
2479 if(already_got_one || displ->dl[di] != displ->olddl[di]){
2480 /* move cursor first time */
2481 if(!already_got_one++){
2482 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2483 (*displ->movecursor)(displ->row, displ->col + w);
2486 (*displ->writechar)(displ->dl[di]);
2487 displ->olddl[di] = displ->dl[di];
2491 memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2494 * Move the cursor to the offset.
2496 * The offset is relative to the start of the virtual array. We need
2497 * to find the location on the screen. The offset into the display array
2498 * will be offset-vbase+dfirst. We want to be at the start of that
2499 * character, so we need to find the width of all the characters up
2500 * to that point.
2502 w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2504 (*displ->movecursor)(displ->row, displ->col + w);
2509 * This is just like ucs4_str_width_a_to_b() except all of the characters
2510 * are assumed to be of width 1. This is for printing out *'s when user
2511 * enters a password, while still managing to use the same code to do the
2512 * display.
2514 unsigned
2515 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2517 unsigned width = 0;
2518 int i;
2520 if(ucsstr)
2521 for(i = a; i <= b && ucsstr[i]; i++)
2522 width++;
2524 return width;