* New alpha version 2.24.1
[alpine.git] / pith / charconv / utf8.c
blob1651252ebbf676372db3f16b31925afee5d8109e
1 #if !defined(lint) && !defined(DOS)
2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
3 #endif
5 /*
6 * ========================================================================
7 * Copyright 2013-2021 Eduardo Chappa
8 * Copyright 2006-2008 University of Washington
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
14 * http://www.apache.org/licenses/LICENSE-2.0
16 * ========================================================================
20 /* includable WITHOUT dependency on c-client */
21 #include "../../c-client/mail.h"
22 #include "../../c-client/utf8.h"
24 #ifdef _WINDOWS
25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
26 #undef ERROR
27 #else
28 #define _XOPEN_SOURCE
29 #endif
31 #include <system.h>
33 #include "../../c-client/fs.h"
35 /* includable WITHOUT dependency on pico */
36 #include "../../pico/keydefs.h"
38 #include "../osdep/collate.h"
39 #include "../filttype.h"
41 #include "utf8.h"
43 #include <stdarg.h>
/* Forward declaration; the definition is not in this part of the file. */
46 unsigned single_width_chars_a_to_b(UCS *, int, int);
/* Charset name stored by set_locale_charmap(); convert_to_utf8() tries it as a source charset. */
49 static char locale_charmap[50];
/* Set by init_utf8_display(); when non-zero, wcellwidth() and wtomb() take their UTF-8 path. */
51 static int native_utf8;
/* Set by init_utf8_display(); wtomb() passes it to ucs4_rmaplen()/ucs4_rmapbuf() as an unsigned short map. */
52 static void *display_data;
54 void
55 init_utf8_display(int utf8, void *rmap)
57 native_utf8 = utf8;
58 display_data = rmap;
63 * Argument is a UCS-4 wide character.
64 * Returns the environment dependent cell width of the
65 * character when printed to the screen.
66 * This will be -1 if the character is not printable.
67 * It will be >= zero if it is printable.
69 * Note that in the case it is not printable but it is still sent to
70 * Writechar, Writechar will print a '?' with width 1.
72 int
73 wcellwidth(UCS ucs)
75 char dummy[32];
76 long w;
79 * We believe that on modern unix systems wchar_t is a UCS-4 character.
80 * That's the assumption here.
83 if(native_utf8){ /* display is UTF-8 capable */
84 w = ucs4_width((unsigned long) ucs);
85 return((w & U4W_ERROR) ? -1 : w);
87 else if(display_data){
88 if(wtomb(dummy, ucs) < 0)
89 return(-1);
90 else{
91 w = ucs4_width((unsigned long) ucs);
92 return((w & U4W_ERROR) ? -1 : w);
95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
96 else
97 return(wcwidth((wchar_t) ucs));
98 #else
99 return(0);
100 #endif
103 /* ambiguous width zone character function. We use the Windows code until
104 * we find a better way to do it in general.
107 pith_ucs4width(UCS ucs)
109 return (ucs >= 0x2100) ? 2 : 1;
110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
111 return wcwidth((wchar_t) ucs);
112 #else
113 return (ucs >= 0x2100) ? 2 : 1;
114 #endif /* _WINDOWS */
118 * Argument is a UCS-4 wide character.
119 * It is converted to the multibyte version (for example UTF8 or EUC-JP).
120 * Dest is a buffer at least xx chars wide where the multi-byte version
121 * of the wide character will be written.
122 * The returned value is the number of bytes written to dest or -1
123 * if the conversion can't be done.
126 wtomb(char *dest, UCS ucs)
129 * We believe that on modern unix systems wchar_t is a UCS-4 character.
130 * That's the assumption here.
133 if(native_utf8){
134 unsigned char *newdptr;
136 newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
137 return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
139 else if(display_data){
140 unsigned long ucs4;
141 int ret;
143 ucs4 = (unsigned long) ucs;
144 ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
145 if(ret >= 0)
146 ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
147 else
148 ret = -1;
150 return(ret);
152 else
153 return(wcrtomb(dest, (wchar_t) ucs, NULL));
158 * This function does not necessarily update inputp and remaining_octets, so
159 * don't rely on that. The c-client version does but the other doesn't.
162 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
164 UCS ucs;
166 if(input_cs){
167 CHARSET *cast_input_cs;
169 cast_input_cs = (CHARSET *) input_cs;
171 switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
172 case U8G_ENDSTRG:
173 case U8G_ENDSTRI:
174 return(CCONV_NEEDMORE);
176 default:
177 if(ucs & U8G_ERROR || ucs == UBOGON)
178 return(CCONV_BADCHAR);
180 return(ucs);
183 else{
184 size_t ret;
185 wchar_t w;
188 * Warning: input_cs and remaining_octets are unused in this
189 * half of the if/else.
191 * Unfortunately, we can't tell the difference between a source string
192 * that is just not long enough and one that has characters that can't
193 * be converted even though it is long enough. We return NEEDMORE in both cases.
195 ret = mbstowcs(&w, (char *) (*inputp), 1);
196 if(ret == (size_t)(-1))
197 return(CCONV_NEEDMORE);
198 else{
199 ucs = (UCS) w;
200 return(ucs);
206 void
207 set_locale_charmap(char *charmap)
209 if(charmap){
210 strncpy(locale_charmap, charmap, sizeof(locale_charmap));
211 locale_charmap[sizeof(locale_charmap)-1] = '\0';
213 else
214 locale_charmap[0] = '\0';
219 * This ensures that the string is UTF-8. If str is already a UTF-8 string,
220 * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
221 * The caller is responsible for freeing the returned value.
223 * Args str -- the string to convert
225 char *
226 convert_to_utf8(char *str, char *fromcharset, int flags)
228 char *ret = NULL;
229 char *fcharset;
230 SIZEDTEXT src, result;
231 const CHARSET *cs;
232 int try;
234 src.data = (unsigned char *) str;
235 src.size = strlen(str);
237 /* already UTF-8, return NULL */
238 if(!(flags & CU8_NOINFER)
239 && (cs = utf8_infercharset(&src))
240 && (cs->type == CT_ASCII || cs->type == CT_UTF8))
241 return(ret);
243 try = 1;
244 while(try < 5){
245 switch(try){
246 case 1:
247 fcharset = fromcharset;
248 if(fcharset && strucmp("UTF-8", fcharset) != 0)
249 break; /* give it a try */
250 else
251 try++; /* fall through */
253 case 2:
254 if(!(flags & CU8_NOINFER)){
255 fcharset = cs ? cs->name : NULL;
256 if(fcharset && strucmp("UTF-8", fcharset) != 0)
257 break;
258 else
259 try++; /* fall through */
261 else
262 try++; /* fall through */
264 case 3:
265 fcharset = locale_charmap;
266 if(fcharset && strucmp("UTF-8", fcharset) != 0)
267 break;
268 else
269 try++; /* fall through */
271 default:
272 fcharset = "ISO-8859-1"; /* this will "work" */
273 break;
276 memset(&result, 0, sizeof(result));
278 if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
279 if(!(result.size == src.size && result.data == src.data)){
280 ret = (char *) fs_get((result.size+1) * sizeof(char));
281 strncpy(ret, (char *) result.data, result.size);
282 ret[result.size] = '\0';
284 /* else no conversion necessary */
286 if(result.data && result.data != src.data)
287 fs_give((void **) &result.data);
288 result.size = 0;
290 return(ret);
293 try++;
296 /* won't make it to here */
297 return(ret);
302 * Convert from UTF-8 to user's locale charset.
303 * This actually uses the wtomb routine to do the conversion, and that
304 * relies on setup_for_input_output having been called.
305 * If no conversion is necessary, NULL is returned, otherwise an allocated
306 * string in the locale charset is returned and the caller is responsible
307 * for freeing it.
309 char *
310 convert_to_locale(char *utf8str)
312 #define CHNK 500
313 char *inp, *ret = NULL;
314 CBUF_S cb;
315 int alloced;
316 size_t i = 0;
318 if(native_utf8 || !utf8str || !utf8str[0])
319 return(NULL);
321 cb.cbuf[0] = '\0';
322 cb.cbufp = cb.cbufend = cb.cbuf;
323 inp = utf8str;
325 alloced = CHNK;
326 ret = (char *) fs_get(alloced * sizeof(char));
329 * There's gotta be a better way to do this but utf8_to_locale was
330 * available and everything looks like a nail when all you have
331 * is a hammer.
333 while(*inp){
335 * We're placing the outgoing stream of characters in ret, a multi-byte
336 * array of characters in the user's locale charset. See if there is
337 * enough room for the next wide characters worth of output chars
338 * and allocate more space if not.
340 if((alloced - i) < MAX(MB_LEN_MAX,32)){
341 alloced += CHNK;
342 fs_resize((void **) &ret, alloced * sizeof(char));
345 i += utf8_to_locale((int) *inp++, &cb,
346 (unsigned char *) &ret[i], alloced - i);
349 fs_resize((void **) &ret, i + 1);
351 ret[i] = '\0';
353 return(ret);
358 * Pass in a stream of UTF-8 characters in 'c' and return obuf
359 * filled in with multi-byte characters. The return value is the
360 * number of valid characters in obuf to be used.
363 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
365 int outchars = 0;
367 if(!(cb && cb->cbufp))
368 return(0);
370 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
371 unsigned char *inputp;
372 unsigned long remaining_octets;
373 UCS ucs;
375 *(cb->cbufp)++ = (unsigned char) c;
376 inputp = cb->cbuf;
377 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
378 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
380 switch(ucs){
381 case U8G_ENDSTRG: /* incomplete character, wait */
382 case U8G_ENDSTRI: /* incomplete character, wait */
383 break;
385 default:
386 if(ucs & U8G_ERROR || ucs == UBOGON){
388 * None of these cases is supposed to happen. If it
389 * does happen then the input stream isn't UTF-8
390 * so something is wrong. Treat each character in the
391 * input buffer as a separate error character and
392 * print a '?' for each.
394 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
395 obuf[outchars++] = '?';
397 cb->cbufp = cb->cbuf;
399 else{
400 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
402 * This happens when we have a UTF-8 character that
403 * we aren't able to print in our locale. For example,
404 * if the locale is setup with the terminal
405 * expecting ISO-8859-1 characters then there are
406 * lots of UTF-8 characters that can't be printed.
407 * Print a '?' instead.
409 obuf[outchars++] = '?';
411 else{
413 * Convert the ucs into the multibyte
414 * character that corresponds to the
415 * ucs in the users locale.
417 outchars = wtomb((char *) obuf, ucs);
418 if(outchars < 0){
419 obuf[0] = '?';
420 outchars = 1;
424 /* update the input buffer */
425 if(inputp >= cb->cbufp) /* this should be the case */
426 cb->cbufp = cb->cbuf;
427 else{ /* extra chars for some reason? */
428 unsigned char *q, *newcbufp;
430 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
431 q = cb->cbuf;
432 while(inputp < cb->cbufp)
433 *q++ = *inputp++;
435 cb->cbufp = newcbufp;
439 break;
442 else{ /* error */
443 obuf[0] = '?';
444 outchars = 1;
445 cb->cbufp = cb->cbuf; /* start over */
448 return(outchars);
453 * Returns the screen cells width of the UCS-4 string argument.
454 * The source string is zero terminated.
456 unsigned
457 ucs4_str_width(UCS *ucsstr)
459 unsigned width = 0;
460 int w;
462 if(ucsstr)
463 while(*ucsstr){
464 w = wcellwidth(*ucsstr++);
465 if(w != U4W_CTLSRGT)
466 width += (w < 0 ? 1 : w);
469 return width;
474 * Returns the screen cells width of the UCS-4 string argument
475 * from ucsstr[a] through (inclusive) ucsstr[b].
476 * No checking is done to make sure a starts in the middle
477 * of a UCS-4 array.
479 unsigned
480 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
482 unsigned width = 0;
483 int i, w;
485 if(ucsstr)
486 for(i = a; i <= b && ucsstr[i]; i++){
487 w = wcellwidth(ucsstr[i]);
488 if(w != U4W_CTLSRGT)
489 width += (w < 0 ? 1 : w);
492 return width;
497 * Returns the screen cells width of the UCS-4 string argument
498 * from ustart through (exclusive) uend.
499 * No checking is done to make sure it starts in the middle
500 * of a UCS-4 array.
502 unsigned
503 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
505 UCS *u;
506 unsigned width = 0;
507 int w;
509 if(!ustart)
510 return width;
512 if(ustart)
513 for(u = ustart; u < uend; u++){
514 w = wcellwidth(*u);
515 if(w != U4W_CTLSRGT)
516 width += (w < 0 ? 1 : w);
519 return(width);
524 * Return the largest possible pointer into ucs4str so that the width
525 * of the string from ucs4str to the pointer (exclusive)
526 * is maxwidth or less. Also stops at a null character.
528 UCS *
529 ucs4_particular_width(UCS *ucs4str, int maxwidth)
531 UCS *u;
532 int w_consumed = 0, w, done = 0;
534 u = ucs4str;
536 if(u)
537 while(!done && *u && w_consumed <= maxwidth){
538 w = wcellwidth(*u);
539 w = (w >= 0 ? w : 1);
540 if(w_consumed + w <= maxwidth){
541 w_consumed += w;
542 ++u;
544 else
545 ++done;
548 return(u);
553 * Convert and copy a UTF-8 string into a UCS-4 NULL
554 * terminated array. Just like cpystr only it converts
555 * from UTF-8 to UCS-4.
557 * Returned UCS-4 string needs to be freed by caller.
559 UCS *
560 utf8_to_ucs4_cpystr(char *utf8src)
562 size_t retsize;
563 UCS *ret = NULL;
564 UCS ucs;
565 unsigned long remaining_octets;
566 unsigned char *readptr;
567 size_t arrayindex;
570 * We don't know how big to allocate the return array
571 * because variable numbers of octets in the src array
572 * will combine to make UCS-4 characters. The number of
573 * UCS-4 characters is less than or equal to the number
574 * of src characters, though.
577 if(!utf8src)
578 return NULL;
580 retsize = strlen(utf8src) + 1;
582 ret = (UCS *) fs_get(retsize * sizeof(*ret));
583 memset(ret, 0, retsize * sizeof(*ret));
585 readptr = (unsigned char *) utf8src;
586 remaining_octets = retsize-1;
587 arrayindex = 0;
589 while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
590 ucs = (UCS) utf8_get(&readptr, &remaining_octets);
592 if(ucs & U8G_ERROR || ucs == UBOGON)
593 remaining_octets = 0;
594 else
595 ret[arrayindex++] = ucs;
598 ret[arrayindex] = '\0';
600 /* get rid of excess size */
601 if(arrayindex+1 < retsize)
602 fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
604 return ret;
609 * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
610 * terminated string. Just like cpystr only it converts
611 * from UCS-4 to UTF-8.
613 * Returned UTF-8 string needs to be freed by caller.
615 char *
616 ucs4_to_utf8_cpystr(UCS *ucs4src)
618 unsigned char *ret = NULL;
619 unsigned char *writeptr;
620 int i;
622 if(!ucs4src)
623 return NULL;
626 * Over-allocate and then resize at the end.
629 /* count characters in source */
630 for(i = 0; ucs4src[i]; i++)
633 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
634 memset(ret, 0, (6*i + 1) * sizeof(*ret));
636 writeptr = ret;
637 for(i = 0; ucs4src[i]; i++)
638 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
640 /* get rid of excess size */
641 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
643 return ((char *) ret);
648 * Similar to above but copy a fixed number of source
649 * characters instead of going until null terminator.
651 char *
652 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
654 unsigned char *ret = NULL;
655 unsigned char *writeptr;
656 int i;
658 if(!ucs4src)
659 return NULL;
662 * Over-allocate and then resize at the end.
665 ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
666 memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
668 writeptr = ret;
669 for(i = 0; i < ucs4src_len; i++)
670 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
672 /* get rid of excess size */
673 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
675 return ((char *) ret);
679 #ifdef _WINDOWS
681 * Convert a UTF-8 argument into an LPTSTR version
682 * of that argument. The result is allocated here
683 * and should be freed by the caller.
685 LPTSTR
686 utf8_to_lptstr(LPSTR arg_utf8)
688 int lptstr_len;
689 LPTSTR lptstr_ret = NULL;
691 lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
692 if(lptstr_len > 0)
694 lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
695 lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
696 arg_utf8, -1, lptstr_ret, lptstr_len );
699 if(!lptstr_len)
701 /* check GetLastError()? */
702 lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
703 lptstr_ret[0] = 0;
706 return lptstr_ret;
711 * Convert an LPTSTR argument into a UTF-8 version
712 * of that argument. The result is allocated here
713 * and should be freed by the caller.
715 LPSTR
716 lptstr_to_utf8(LPTSTR arg_lptstr)
718 int utf8str_len;
719 LPSTR utf8str_ret = NULL;
721 utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
722 if(utf8str_len > 0)
724 utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
725 utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
726 arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
729 if(!utf8str_len)
731 /* check GetLastError()? */
732 utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
733 utf8str_ret[0] = 0;
736 return utf8str_ret;
741 * Convert a UCS4 argument into an LPTSTR version
742 * of that argument. The result is allocated here
743 * and should be freed by the caller.
745 LPTSTR
746 ucs4_to_lptstr(UCS *arg_ucs4)
748 LPTSTR ret_lptstr = NULL;
749 size_t len;
750 size_t i;
752 if(arg_ucs4){
753 len = ucs4_strlen(arg_ucs4);
754 ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
755 /* bogus conversion ignores UTF-16 */
756 for(i = 0; i < len; i++)
757 ret_lptstr[i] = arg_ucs4[i];
759 ret_lptstr[len] = '\0';
762 return(ret_lptstr);
767 * Convert an LPTSTR argument into a UCS4 version
768 * of that argument. The result is MemAlloc'd here
769 * and should be freed by the caller.
771 UCS *
772 lptstr_to_ucs4(LPTSTR arg_lptstr)
774 UCS *ret_ucs4 = NULL;
775 size_t len;
776 size_t i;
778 if(arg_lptstr){
779 len = _tcslen(arg_lptstr);
780 ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
781 /* bogus conversion ignores UTF-16 */
782 for(i = 0; i < len; i++)
783 ret_ucs4[i] = arg_lptstr[i];
785 ret_ucs4[len] = '\0';
788 return(ret_ucs4);
791 #endif /* _WINDOWS */
795 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
796 * 1-at-a-time filled in with UCS characters. The return value is the
797 * number of valid characters in obuf to be used. It can only
798 * be 1 or 0 characters since we're only getting one UTF-8 character
799 * at a time.
802 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
804 int width = 0, outchars = 0;
806 if(!(cb && cb->cbufp))
807 return(0);
809 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
810 unsigned char *inputp;
811 unsigned long remaining_octets;
812 UCS ucs;
814 *cb->cbufp++ = (unsigned char) c;
815 inputp = cb->cbuf;
816 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
817 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
819 switch(ucs){
820 case U8G_ENDSTRG: /* incomplete character, wait */
821 case U8G_ENDSTRI: /* incomplete character, wait */
822 break;
824 default:
825 if(ucs & U8G_ERROR || ucs == UBOGON){
827 * None of these cases is supposed to happen. If it
828 * does happen then the input stream isn't UTF-8
829 * so something is wrong.
831 outchars++;
832 *obuf = '?';
833 cb->cbufp = cb->cbuf;
834 width = 1;
836 else{
837 outchars++;
838 if(ucs < 0x80 && ucs >= 0x20)
839 width = 1;
841 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
843 * This happens when we have a UTF-8 character that
844 * we aren't able to print in our locale. For example,
845 * if the locale is setup with the terminal
846 * expecting ISO-8859-1 characters then there are
847 * lots of UTF-8 characters that can't be printed.
848 * Print a '?' instead.
849 * Don't think this should happen in Windows.
851 *obuf = '?';
853 else{
854 *obuf = ucs;
857 /* update the input buffer */
858 if(inputp >= cb->cbufp) /* this should be the case */
859 cb->cbufp = cb->cbuf;
860 else{ /* extra chars for some reason? */
861 unsigned char *q, *newcbufp;
863 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
864 q = cb->cbuf;
865 while(inputp < cb->cbufp)
866 *q++ = *inputp++;
868 cb->cbufp = newcbufp;
872 break;
875 else{ /* error */
876 *obuf = '?';
877 outchars = 1;
878 width = 1;
879 cb->cbufp = cb->cbuf; /* start over */
882 if(obufwidth)
883 *obufwidth = width;
885 return(outchars);
890 * Return an allocated copy of a zero-terminated UCS-4 string.
892 UCS *
893 ucs4_cpystr(UCS *ucs4src)
895 size_t arraysize;
896 UCS *ret = NULL;
897 size_t i;
899 if(!ucs4src)
900 return NULL;
902 arraysize = ucs4_strlen(ucs4src);
904 ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
905 memset(ret, 0, (arraysize+1) * sizeof(*ret));
907 for(i = 0; i < arraysize; i++)
908 ret[i] = ucs4src[i];
910 return ret;
914 UCS *
915 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
917 size_t i;
919 if(ucs4src && ucs4dst){
920 for(i = 0; i < n; i++){
921 ucs4dst[i] = ucs4src[i];
922 if(ucs4dst[i] == '\0')
923 break;
927 return ucs4dst;
931 UCS *
932 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
934 size_t i;
935 UCS *u;
937 if(ucs4src && ucs4dst){
938 for(u = ucs4dst; *u; u++)
941 for(i = 0; i < n; i++){
942 u[i] = ucs4src[i];
943 if(u[i] == '\0')
944 break;
947 if(i == n)
948 u[i] = '\0';
951 return ucs4dst;
956 * Like strlen only this returns the number of non-zero characters
957 * in a zero-terminated UCS-4 array.
959 size_t
960 ucs4_strlen(UCS *ucs4str)
962 size_t i = 0;
964 if(ucs4str)
965 while(ucs4str[i])
966 i++;
968 return(i);
973 ucs4_strcmp(UCS *s1, UCS *s2)
975 for(; *s1 == *s2; s1++, s2++)
976 if(*s1 == '\0')
977 return 0;
979 return((*s1 < *s2) ? -1 : 1);
983 UCS *
984 ucs4_strchr(UCS *s, UCS c)
986 if(!s)
987 return NULL;
989 while(*s && *s != c)
990 s++;
992 if(*s || !c)
993 return s;
994 else
995 return NULL;
999 UCS *
1000 ucs4_strrchr(UCS *s, UCS c)
1002 UCS *ret = NULL;
1004 if(!s)
1005 return ret;
1007 while(*s){
1008 if(*s == c)
1009 ret = s;
1011 s++;
1014 return ret;
1019 * Returns the screen cells width of the UTF-8 string argument.
1021 unsigned
1022 utf8_width(char *str)
1024 unsigned width = 0;
1025 int this_width;
1026 UCS ucs;
1027 unsigned long remaining_octets;
1028 char *readptr;
1030 if(!(str && *str))
1031 return(width);
1033 readptr = str;
1034 remaining_octets = readptr ? strlen(readptr) : 0;
1036 while(remaining_octets > 0 && *readptr){
1038 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1040 if(ucs & U8G_ERROR || ucs == UBOGON){
1042 * This should not happen, but do something to handle it anyway.
1043 * Treat each character as a single width character, which is what should
1044 * probably happen when we actually go to write it out.
1046 remaining_octets--;
1047 readptr++;
1048 this_width = 1;
1050 else{
1051 this_width = wcellwidth(ucs);
1054 * If this_width is -1 that means we can't print this character
1055 * with our current locale. Writechar will print a '?'.
1057 if(this_width < 0)
1058 this_width = 1;
1061 width += (unsigned) this_width;
1064 return(width);
1069 * Copy UTF-8 characters from src into dst.
1070 * This is intended to be used if you want to truncate a string at
1071 * the start instead of the end. For example, you have a long string
1072 * like
1073 * this_is_a_long_string
1074 * but not enough space to fit it into a particular field. You want to
1075 * end up with
1076 * s_a_long_string
1077 * where that fits in a particular width. Perhaps you'd use this with ...
1078 * to get
1079 * ...s_a_long_string
1080 * This right adjusts the end of the string in the width space and
1081 * cuts it off at the start. If there is enough width for the whole
1082 * string it will copy the string into dst with no padding.
1084 * Copy enough characters so that the result will have screen width of
1085 * want_width screen cells in current locale.
1087 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1088 * to dst. This is just for protection, it shouldn't be relied on to
1089 * do anything useful. Dstlen should be large enough. Otherwise you'll get
1090 * characters truncated in the middle or something like that.
1092 * Returned value is the number of bytes written to dst, not including
1093 * the possible terminating null.
1095 * If we can't hit want_width exactly because of double width characters
1096 * then we will pad the end of the string with space in order to make
1097 * the width exact.
1099 size_t
1100 utf8_to_width_rhs(char *dst, /* destination buffer */
1101 char *src, /* source string */
1102 size_t dstlen, /* space in dest */
1103 unsigned want_width) /* desired screen width */
1105 int this_width;
1106 unsigned width_consumed = 0;
1107 UCS ucs;
1108 unsigned long remaining_octets;
1109 char *readptr, *goodreadptr, *savereadptr, *endptr;
1110 size_t nb = 0;
1112 if(!src){
1113 if(dstlen > 0)
1114 dst[0] = '\0';
1116 return nb;
1120 * Start at the end of the source string and go backwards until we
1121 * get to the desired width, but not more than the width.
1123 readptr = src + strlen(src);
1124 endptr = readptr;
1125 goodreadptr = readptr;
1126 width_consumed = 0;
1127 savereadptr = readptr;
1129 for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1130 readptr = savereadptr-1){
1132 savereadptr = readptr;
1133 remaining_octets = goodreadptr - readptr;
1134 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1137 * Handling the error case is tough because an error will be the normal thing that
1138 * happens as we back through the string. So we're just going to punt on the
1139 * error for now.
1141 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1142 if(remaining_octets > 0){
1144 * This means there are some bad octets after this good
1145 * character so things are not going to work out well.
1146 * Bail out.
1148 savereadptr = src; /* we're done */
1150 else{
1151 this_width = wcellwidth(ucs);
1153 if(this_width < 0)
1154 this_width = 1;
1156 if(width_consumed + (unsigned) this_width <= want_width){ /* ok */
1157 width_consumed += (unsigned) this_width;
1158 goodreadptr = savereadptr;
1160 else
1161 savereadptr = src; /* we're done */
1167 * Copy characters from goodreadptr to endptr into dst.
1169 nb = MIN(endptr-goodreadptr, dstlen-1);
1170 strncpy(dst, goodreadptr, nb);
1171 dst[nb] = '\0';
1174 * Pad out with spaces in order to hit width exactly.
1176 while(width_consumed < want_width && nb < dstlen-1){
1177 dst[nb++] = ' ';
1178 dst[nb] = '\0';
1179 width_consumed++;
1182 return nb;
1187 * The arguments being converted are UTF-8 strings.
1188 * This routine attempts to make it possible to use screen cell
1189 * widths in a format specifier. In a one-byte per screen cell
1190 * world we might have used %10.10s to cause a string to occupy
1191 * 10 screen positions. Since the width and precision are really
1192 * referring to numbers of bytes instead of screen positions that
1193 * won't work with UTF-8 input. We emulate that behavior with
1194 * the format string %w. %m.nw means to use the m and n as
1195 * screen width indicators instead of bytes indicators.
1197 * There is no reason to use this routine unless you want to use
1198 * min field width or precision with the specifier. A plain %w without
1199 * widths is equivalent exactly to a plain %s in a regular printf.
1201 * Double-width characters complicate things. It may not be possible
1202 * to satisfy the request exactly. For example, %3w for an input
1203 * string that is made up of two double-width characters.
1204 * This routine will arbitrarily use a trailing space character if
1205 * needed to make the width come out correctly where a half of a
1206 * double-width character would have been needed. We'll see how
1207 * that works for us.
1209 * %w only works for strings (it's a %s replacement).
1211 * Buffer overflow is handled by the size argument. %.30s will work
1212 * to limit a particular string to 30 bytes, but you lose that
1213 * ability with %w, since it may write more than precision bytes
1214 * in order to get to the desired width. It is best to choose
1215 * size large enough so that it doesn't come into play, otherwise
1216 * it may be possible to get partial UTF-8 characters because of
1217 * the truncation.
1219 * The return value isn't quite the same as the return value
1220 * of snprintf. It is the number of bytes written, not counting
1221 * the trailing null, just like snprintf. However, if it is
1222 * truncated due to size then the output is size, not the
1223 * number of characters that would have been written.
1226 utf8_snprintf(char *dest, size_t size, char *fmt, ...)
1228 char newfmt[100], buf[20], *q, *pdest, *width_str, *end;
1229 char *start_of_specifier;
1230 char *input_str;
1231 int int_arg;
1232 double double_arg;
1233 void *ptr_arg;
1234 unsigned got_width;
1235 int more_flags, ret, w;
1236 int min_field_width, field_precision, modifier;
1237 int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
1238 va_list args;
1240 newfmt[0] = '\0';
1241 q = newfmt;
1243 pdest = dest;
1245 #define IS_ROOM_IN_DEST(n_more_chars) \
1246 ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)
1249 * Strategy: Look through the fmt string for %w's. Replace the
1250 * %w's in the format string with %s's but with possibly different
1251 * width and precision arguments which will make it come out right.
1252 * Then call the regular system vsnprintf with the altered format
1253 * string but same arguments.
1255 * That would be nice but it doesn't quite work. Why? Because a
1256 * %*w will need to have the value in the integer argument the *
1257 * refers to modified. Can't do it as far as I can tell. Or we could
1258 * remove the integer argument somehow before calling printf. Can't
1259 * do it. Or we could somehow add an additional conversion specifier
1260 * that caused nothing to be printed but ate up the integer arg.
1261 * Can't figure out how to do that either.
1263 * Since we can't figure out how to do it, the alternative is to
1264 * construct the result one piece at a time, pasting together the
1265 * pieces from the different conversions.
1267 va_start(args, fmt);
1269 while(*fmt && IS_ROOM_IN_DEST(1)){
1270 if(*fmt == '%'){
1271 start_of_specifier = fmt++;
1273 min_field_width = field_precision = -1;
1274 flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
1276 /* flags */
1277 more_flags = 1;
1278 while(more_flags){
1279 switch(*fmt){
1280 case '-':
1281 flags_minus++;
1282 fmt++;
1283 break;
1285 case '+':
1286 flags_plus++;
1287 fmt++;
1288 break;
1290 case ' ':
1291 flags_space++;
1292 fmt++;
1293 break;
1295 case '0':
1296 flags_zero++;
1297 fmt++;
1298 break;
1300 case '#':
1301 flags_pound++;
1302 fmt++;
1303 break;
1305 default:
1306 more_flags = 0;
1307 break;
1311 /* minimum field width */
1312 if(*fmt == '*'){
1313 min_field_width = va_arg(args, int);
1314 fmt++;
1316 else if(*fmt >= '0' && *fmt <= '9'){
1317 width_str = fmt;
1318 while (*fmt >= '0' && *fmt <= '9')
1319 fmt++;
1321 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1322 if(sizeof(buf) > fmt-width_str)
1323 buf[fmt-width_str] = '\0';
1325 buf[sizeof(buf)-1] = '\0';
1327 min_field_width = atoi(width_str);
1330 /* field precision */
1331 if(*fmt == '.'){
1332 fmt++;
1333 if(*fmt == '*'){
1334 field_precision = va_arg(args, int);
1335 fmt++;
1337 else if(*fmt >= '0' && *fmt <= '9'){
1338 width_str = fmt;
1339 while (*fmt >= '0' && *fmt <= '9')
1340 fmt++;
1342 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1343 if(sizeof(buf) > fmt-width_str)
1344 buf[fmt-width_str] = '\0';
1346 buf[sizeof(buf)-1] = '\0';
1348 field_precision = atoi(width_str);
1352 /* length modifier */
1353 if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
1354 modifier = *fmt++;
1356 /* conversion character */
1357 switch(*fmt){
1358 case 'w':
1360 * work with va_arg(char *) to figure out width
1361 * and precision needed to produce the screen width
1362 * and precision asked for in %w using some of the
1363 * utf8 width routines we have.
1366 input_str = va_arg(args, char *);
1367 if(field_precision >=0 || min_field_width >= 0)
1368 w = utf8_width(input_str);
1370 if(field_precision >= 0){
1371 if(w <= field_precision)
1372 field_precision = -1; /* print it all */
1373 else{
1375 * We need to cut off some of the input_str
1376 * in this case.
1378 end = utf8_count_forw_width(input_str, field_precision, &got_width);
1379 field_precision = (int) (end - input_str);
1380 /* new w with this field_precision */
1381 w = got_width;
1385 /* need some padding */
1386 if(min_field_width >= 0)
1387 min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
1388 MAX(0, min_field_width - w);
1391 * Now we just need to get the new format string
1392 * set correctly in newfmt.
1394 q = newfmt;
1395 if(q-newfmt < sizeof(newfmt))
1396 *q++ = '%';
1398 if(flags_minus && q-newfmt < sizeof(newfmt))
1399 *q++ = '-';
1400 if(flags_plus && q-newfmt < sizeof(newfmt))
1401 *q++ = '+';
1402 if(flags_space && q-newfmt < sizeof(newfmt))
1403 *q++ = ' ';
1404 if(flags_zero && q-newfmt < sizeof(newfmt))
1405 *q++ = '0';
1406 if(flags_pound && q-newfmt < sizeof(newfmt))
1407 *q++ = '#';
1409 if(min_field_width >= 0){
1410 snprintf(buf, sizeof(buf), "%d", min_field_width);
1411 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1414 if(field_precision >= 0){
1415 if(q-newfmt < sizeof(newfmt))
1416 *q++ = '.';
1418 snprintf(buf, sizeof(buf), "%d", field_precision);
1419 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1422 if(q-newfmt < sizeof(newfmt))
1423 *q++ = 's';
1425 if(q-newfmt < sizeof(newfmt))
1426 *q++ = '\0';
1428 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1429 pdest += strlen(pdest);
1431 break;
1433 case '\0':
1434 fmt--;
1435 break;
1437 default:
1438 /* make a new format which leaves out the dynamic '*' arguments */
1439 q = newfmt;
1440 if(q-newfmt < sizeof(newfmt))
1441 *q++ = '%';
1443 if(flags_minus && q-newfmt < sizeof(newfmt))
1444 *q++ = '-';
1445 if(flags_plus && q-newfmt < sizeof(newfmt))
1446 *q++ = '+';
1447 if(flags_space && q-newfmt < sizeof(newfmt))
1448 *q++ = ' ';
1449 if(flags_zero && q-newfmt < sizeof(newfmt))
1450 *q++ = '0';
1451 if(flags_pound && q-newfmt < sizeof(newfmt))
1452 *q++ = '#';
1454 if(min_field_width >= 0){
1455 snprintf(buf, sizeof(buf), "%d", min_field_width);
1456 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1459 if(field_precision >= 0){
1460 if(q-newfmt < sizeof(newfmt))
1461 *q++ = '.';
1463 snprintf(buf, sizeof(buf), "%d", field_precision);
1464 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1467 if(q-newfmt < sizeof(newfmt))
1468 *q++ = *fmt;
1470 if(q-newfmt < sizeof(newfmt))
1471 *q++ = '\0';
1473 switch(*fmt){
1474 case 'd': case 'i': case 'o':
1475 case 'x': case 'X': case 'u': case 'c':
1476 int_arg = va_arg(args, int);
1477 snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
1478 pdest += strlen(pdest);
1479 break;
1481 case 's':
1482 input_str = va_arg(args, char *);
1483 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1484 pdest += strlen(pdest);
1485 break;
1487 case 'f': case 'e': case 'E':
1488 case 'g': case 'G':
1489 double_arg = va_arg(args, double);
1490 snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
1491 pdest += strlen(pdest);
1492 break;
1494 case 'p':
1495 ptr_arg = va_arg(args, void *);
1496 snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
1497 pdest += strlen(pdest);
1498 break;
1500 case '%':
1501 if(IS_ROOM_IN_DEST(1))
1502 *pdest++ = '%';
1504 break;
1506 default:
1507 /* didn't think of this type */
1508 assert(0);
1509 break;
1512 break;
1515 fmt++;
1517 else{
1518 if(IS_ROOM_IN_DEST(1))
1519 *pdest++ = *fmt++;
1523 ret = pdest - dest;
1525 if(IS_ROOM_IN_DEST(1))
1526 *pdest++ = '\0';
1528 va_end(args);
1530 return ret;
1535 * Copy UTF-8 characters from src into dst.
1536 * Copy enough characters so that the result will have (<=) screen width of
1537 * want_width screen cells in current locale.
1539 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1540 * to dst.
1542 * Returned value is the number of bytes written to dst, not including
1543 * the possible terminating null.
1544 * Got_width is another returned value. It is the width in screen cells of
1545 * the string placed in dst. It will be the same as want_width if there
1546 * are enough characters in the src to do that and if the character widths
1547 * hit the width exactly. It will be less than want_width if we run out
1548 * of src characters or if the next character width would skip over the
1549 * width we want, because it is double width.
1551 * Zero width characters are collected and included at the end of the string.
1552 * That is, if we make it to want_width but there is still a zero length
1553 * character sitting in src, we add that to dst. This might be an accent
1554 * or something like that.
1556 size_t
1557 utf8_to_width(char *dst, /* destination buffer */
1558 char *src, /* source string */
1559 size_t dstlen, /* space in dst */
1560 unsigned want_width, /* desired screen width */
1561 unsigned *got_width) /* returned screen width in dst */
1563 int this_width;
1564 unsigned width_consumed = 0;
1565 UCS ucs;
1566 unsigned long remaining_octets;
1567 char *writeptr, *readptr, *savereadptr, *endptr;
1568 int ran_out_of_space = 0;
1570 readptr = src;
1572 remaining_octets = readptr ? strlen(readptr) : 0;
1574 writeptr = dst;
1575 endptr = writeptr + dstlen;
1577 if(readptr && writeptr){
1578 while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1579 savereadptr = readptr;
1580 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1582 if(ucs & U8G_ERROR || ucs == UBOGON)
1583 remaining_octets = 0;
1584 else{
1585 this_width = wcellwidth(ucs);
1588 * If this_width is -1 that means we can't print this character
1589 * with our current locale. Writechar will print a '?'.
1591 if(this_width < 0)
1592 this_width = 1;
1594 if(width_consumed + (unsigned) this_width <= want_width){
1595 /* append this utf8 character to dst if it will fit */
1596 if(writeptr + (readptr - savereadptr) < endptr){
1597 width_consumed += this_width;
1598 while(savereadptr < readptr)
1599 *writeptr++ = *savereadptr++;
1601 else
1602 ran_out_of_space++; /* no more utf8 to dst */
1604 else
1605 remaining_octets = 0; /* we're done */
1609 if(writeptr < endptr)
1610 *writeptr = '\0';
1613 if(got_width)
1614 *got_width = width_consumed;
1616 return(writeptr ? (writeptr - dst) : 0);
1621 * Str is a UTF-8 string.
1622 * Count forward width screencell positions and return a pointer to the
1623 * end of the string that is width wide.
1624 * The returned pointer points at the next character (where the null would
1625 * be placed).
1627 * Got_width is another returned value. It is the width in screen cells of
1628 * the string from str to the returned pointer. It will be the same as
1629 * want_width if there are enough characters in the str to do that
1630 * and if the character widths hit the width exactly. It will be less
1631 * than want_width if we run out of characters or if the next character
1632 * width would skip over the width we want, because it is double width.
1634 char *
1635 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1637 int this_width;
1638 unsigned width_consumed = 0;
1639 UCS ucs;
1640 unsigned long remaining_octets;
1641 char *readptr;
1642 char *retptr;
1644 retptr = readptr = str;
1646 remaining_octets = readptr ? strlen(readptr) : 0;
1648 while(width_consumed <= want_width && remaining_octets > 0){
1650 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1652 if(ucs & U8G_ERROR || ucs == UBOGON){
1654 * This should not happen, but do something to handle it anyway.
1655 * Treat each character as a single width character, which is what should
1656 * probably happen when we actually go to write it out.
1658 remaining_octets--;
1659 readptr++;
1660 this_width = 1;
1662 else{
1663 this_width = wcellwidth(ucs);
1666 * If this_width is -1 that means we can't print this character
1667 * with our current locale. Writechar will print a '?'.
1669 if(this_width < 0)
1670 this_width = 1;
1673 if(width_consumed + (unsigned) this_width <= want_width){
1674 width_consumed += (unsigned) this_width;
1675 retptr = readptr;
1677 else
1678 remaining_octets = 0; /* we're done */
1681 if(got_width)
1682 *got_width = width_consumed;
1684 return(retptr);
1689 * Copy a null terminator into a UTF-8 string in place so that the string is
1690 * no more than a certain screen width wide. If the string is already less
1691 * than or equal in width to the requested width, no change is made.
1693 * The actual width accomplished is returned. Note that it may be less than
1694 * max_width due to double width characters as well as due to the fact that
1695 * it fits wholly in the max_width.
1697 * Returned value is the actual screen width of str when done.
1699 * A side effect is that a terminating null may have been written into
1700 * the passed in string.
1702 unsigned
1703 utf8_truncate(char *str, unsigned max_width)
1705 int this_width;
1706 unsigned width_consumed = 0;
1707 UCS ucs;
1708 unsigned long remaining_octets;
1709 char *readptr, *savereadptr;
1711 readptr = str;
1713 remaining_octets = readptr ? strlen(readptr) : 0;
1715 if(readptr){
1716 while(width_consumed <= max_width && remaining_octets > 0){
1718 savereadptr = readptr;
1719 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1721 if(ucs & U8G_ERROR || ucs == UBOGON){
1723 * This should not happen, but do something to handle it anyway.
1724 * Treat each character as a single width character, which is what should
1725 * probably happen when we actually go to write it out.
1727 remaining_octets--;
1728 readptr++;
1729 this_width = 1;
1731 else{
1732 this_width = wcellwidth(ucs);
1735 * If this_width is -1 that means we can't print this character
1736 * with our current locale. Writechar will print a '?'.
1738 if(this_width < 0)
1739 this_width = 1;
1742 if(width_consumed + (unsigned) this_width <= max_width){
1743 width_consumed += (unsigned) this_width;
1745 else{
1746 remaining_octets = 0; /* we're done */
1747 *savereadptr = '\0';
1752 return(width_consumed);
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result has a screen width of
 * want_width screen cells in the current locale.
 * If there aren't enough characters in src to get to want_width, pad with
 * spaces: on the right when left_adjust is non-zero, on the left otherwise.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes are
 * written to dst. Dst is null terminated if there is room, but not if
 * that would overflow dstlen. When space runs short, only as many padding
 * spaces as fit are added.
 *
 * Returned value is the number of bytes written to dst, not including
 * the possible terminating null. A NULL dst yields 0 and nothing is
 * written.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, room, pad;

    /* without a buffer there is nowhere to put text or padding */
    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    room = dstlen - bytes_used;

    /* cells still missing, limited to the bytes actually available */
    pad = (got_width < want_width) ? (size_t) (want_width - got_width) : 0;
    if(pad > room)
      pad = room;

    if(pad > 0){
	if(left_adjust){
	    /* text stays put, spaces are appended */
	    memset(dst + bytes_used, ' ', pad);
	}
	else{
	    /*
	     * Slide the text right to make room, then fill the gap at
	     * the front. memmove copes with the overlapping regions and
	     * never needs a pointer in front of dst.
	     */
	    memmove(dst + pad, dst, bytes_used);
	    memset(dst, ' ', pad);
	}

	bytes_used += pad;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1826 * Str is a UTF-8 string.
1827 * Start_here is a pointer into the string. It points one position past
1828 * the last byte that should be considered a part of the length string.
1829 * Count back want_width screencell positions and return a pointer to the
1830 * start of the string that is want_width wide and ends with start_here.
1832 * Since characters may be more than one cell width wide we may end up
1833 * skipping over the exact width. That is, if we need to we'll go back
1834 * too far (by one cell width). Account for that in the call by looking
1835 * at got_width.
1837 * Note that this call gives a possible got_width == want_width+1 as
1838 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1839 * That was just what was needed at the time, maybe it needs to be
1840 * optional.
1842 char *
1843 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1845 unsigned width_consumed = 0;
1846 int this_width;
1847 UCS ucs;
1848 unsigned long remaining_octets;
1849 char *ptr, *savereadptr, *goodreadptr;
1851 savereadptr = start_here;
1852 goodreadptr = start_here;
1854 for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1856 savereadptr = ptr;
1857 remaining_octets = goodreadptr - ptr;
1858 ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1860 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1861 if(remaining_octets > 0){
1863 * This means there are some bad octets after this good
1864 * character so things are not going to work out well.
1865 * Bail out.
1867 savereadptr = str; /* we're done */
1869 else{
1870 this_width = wcellwidth(ucs);
1873 * If this_width is -1 that means we can't print this character
1874 * with our current locale. Writechar will print a '?'.
1876 if(this_width < 0)
1877 this_width = 1;
1879 width_consumed += (unsigned) this_width;
1880 goodreadptr = savereadptr;
1885 if(got_width)
1886 *got_width = width_consumed;
1888 return(savereadptr);
/*----------------------------------------------------------------------
    Append source string s at *d, leaving *d pointing at the end of the
    copied text.

    At most n characters are stored. When s is shorter than n its
    terminating null is stored too and *d is left pointing at that null;
    when n runs out first nothing is terminated.

    The point of this is to avoid passing over a string twice when it
    is being appended to (i.e., strcpy(t, x); t += strlen(t)).

    This doesn't really belong here but it is used here.
  ----*/
void
sstrncpy(char **d, char *s, int n)
{
    char *out = *d;

    while(n > 0){
	char c = *s++;

	*out = c;
	if(c == '\0')
	  break;		/* terminator stored, pointer stays on it */

	out++;
	n--;
    }

    *d = out;
}
1910 * If use_system_routines is set then NULL is the return value and it is
1911 * not an error. Display_charmap and keyboard_charmap should come over as
1912 * malloced strings and will be filled in with the result.
1914 * Returns a void pointer to the input_cs CHARSET which is
1915 * passed to mbtow via kbseq().
1916 * If !use_system_routines && NULL is returned, that is an error and err should
1917 * have a message.
1918 * display_charmap and keyboard_charmap should be malloced data and may be
1919 * realloced and changed here.
1922 setup_for_input_output(int use_system_routines, char **display_charmap,
1923 char **keyboard_charmap, void **input_cs_arg, char **err)
1925 const CHARSET *cs;
1926 const CHARSET *input_cs = NULL;
1927 int already_tried = 0;
1928 int supported = 0;
1929 char buf[1000];
1931 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1933 if(err)
1934 *err = NULL;
1936 if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1937 *err = cpstr("Bad call to setup_for_input_output");
1938 return(-1);
1941 if(use_system_routines){
1942 #if PREREQ_FOR_SYS_TRANSLATION
1943 char *dcm;
1945 dcm = nl_langinfo_codeset_wrapper();
1946 dcm = dcm ? dcm : "US-ASCII";
1948 init_utf8_display(0, NULL);
1949 if(*display_charmap){
1950 if(dcm && strucmp(*display_charmap, dcm)){
1951 snprintf(buf, sizeof(buf),
1952 _("Display character set \"%s\" is ignored when using system translation"),
1953 *display_charmap);
1955 *err = cpstr(buf);
1958 fs_give((void **) display_charmap);
1961 if(*keyboard_charmap){
1962 if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1963 snprintf(buf, sizeof(buf),
1964 _("Keyboard character set \"%s\" is ignored when using system translation"),
1965 *keyboard_charmap);
1967 *err = cpstr(buf);
1970 fs_give((void **) keyboard_charmap);
1973 *display_charmap = cpstr(dcm);
1974 *keyboard_charmap = cpstr(dcm);
1975 #else
1976 *err = cpstr("Bad call to setup_for_input_output");
1977 #endif
1979 *input_cs_arg = NULL;
1980 return(0);
1984 try_again1:
1985 if(!(*display_charmap))
1986 *display_charmap = cpstr("US-ASCII");
1988 if(!(*keyboard_charmap))
1989 *keyboard_charmap = cpstr(*display_charmap);
1991 if(*keyboard_charmap){
1992 supported = input_charset_is_supported(*keyboard_charmap);
1994 if(supported){
1995 if(!strucmp(*keyboard_charmap, "utf-8"))
1996 input_cs = utf8_charset(*keyboard_charmap);
1997 else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
1998 input_cs = cs;
2000 else{
2001 if(err && !*err){
2002 int iso2022jp = 0;
2004 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2005 iso2022jp = 1;
2007 snprintf(buf, sizeof(buf),
2008 /* TRANSLATORS: The first argument is the name of the character
2009 set the user is trying to use (which is unsupported by alpine).
2010 The second argument is " (except for posting)" if they are
2011 trying to use ISO-2022-JP for something other than posting. */
2012 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2013 *keyboard_charmap,
2014 iso2022jp ? _(" (except for posting)") : "");
2016 *err = cpstr(buf);
2019 input_cs = NULL;
2020 fs_give((void **) keyboard_charmap);
2021 *keyboard_charmap = cpstr("US-ASCII");
2022 if(!already_tried){
2023 already_tried++;
2024 goto try_again1;
2030 try_again2:
2031 if(!(*display_charmap))
2032 *display_charmap = cpstr("US-ASCII");
2034 if(*display_charmap){
2035 supported = output_charset_is_supported(*display_charmap);
2036 if(supported){
2037 if(!strucmp(*display_charmap, "utf-8"))
2038 init_utf8_display(1, NULL);
2039 else if((cs = utf8_charset(*display_charmap)) != NULL)
2040 init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2042 else{
2043 if(err && !*err){
2044 int iso2022jp = 0;
2046 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2047 iso2022jp = 1;
2049 snprintf(buf, sizeof(buf),
2050 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2051 *display_charmap,
2052 iso2022jp ? _(" (except for posting)") : "");
2054 *err = cpstr(buf);
2057 fs_give((void **) display_charmap);
2058 if(!already_tried){
2059 already_tried++;
2060 goto try_again2;
2064 else{
2065 if(err && !*err)
2066 *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2069 #undef cpstr
2071 *input_cs_arg = (void *) input_cs;
2073 return(0);
2078 input_charset_is_supported(char *input_charset)
2080 const CHARSET *cs;
2082 if(!(input_charset && *input_charset))
2083 return 0;
2085 if(!strucmp(input_charset, "utf-8"))
2086 return 1;
2088 if((cs = utf8_charset(input_charset)) != NULL){
2091 * This was true 2006-09-25.
2093 switch(cs->type){
2094 case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2095 case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2096 case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2097 case CT_UCS4: case CT_UTF16:
2098 return 1;
2099 break;
2101 default:
2102 break;
2106 return 0;
2111 output_charset_is_supported(char *output_charset)
2113 const CHARSET *cs;
2115 if(!(output_charset && *output_charset))
2116 return 0;
2118 if(!strucmp(output_charset, "utf-8"))
2119 return 1;
2121 if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2122 return 1;
2124 return 0;
/*
 * Returns 1 if posting_charset can be used for posting, 0 if not
 * (including a NULL or empty name). ISO-2022-JP is accepted here in
 * addition to everything output_charset_is_supported() accepts.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(posting_charset == NULL || *posting_charset == '\0')
      return(0);

    if(strucmp(posting_charset, "ISO-2022-JP") == 0)
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator.
 * It is what nl_langinfo(CODESET) should return, but nl_langinfo is
 * wrapped because we know of strangely behaving implementations.
 *
 * The result is either the string from nl_langinfo(), a string constant,
 * or a static buffer that the next call overwrites; the caller must not
 * free it. NULL is returned when no supported character set can be worked
 * out, and the caller decides what to do.
 */
#if	!defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
	char *p;

	if(!strcmp("ANSI_X3.4-1968", ret)
	   || !strcmp("646", ret)
	   || !strcmp("ASCII", ret)
	   || !strcmp("C", ret)
	   || !strcmp("POSIX", ret))
	  ret = "US-ASCII";
	else if(!strucmp(ret, "UTF8"))
	  ret = "UTF-8";
	else if(!strucmp(ret, "EUCJP"))
	  ret = "EUC-JP";
	else if(!strucmp(ret, "EUCKP"))
	  ret = "EUC-KP";
	else if(!strucmp(ret, "SJIS"))
	  ret = "SHIFT-JIS";
	else if((p = strstr(ret, "8859")) != NULL){
	    /* check for digits after 8859 */
	    p += 4;

	    /*
	     * Allow one separator character between "8859" and the part
	     * number, but never step over the end of the string.
	     */
	    if(*p && !isdigit((unsigned char) *p))
	      p++;

	    if(isdigit((unsigned char) *p)){
		static char buf[12];

		/* "ISO-8859-" plus a one or two digit part number */
		memset(buf, 0, sizeof(buf));
		strncpy(buf, "ISO-8859-", sizeof(buf));
		buf[9] = *p++;
		if(isdigit((unsigned char) *p))
		  buf[10] = *p;

		ret = buf;
	    }
	}
    }

    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2205 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2206 * needed the return value will point to orig. If a conversion is done,
2207 * the return string should be freed by the caller.
2208 * If not possible, returns NULL.
2210 char *
2211 utf8_to_charset(char *orig, char *charset, int report_err)
2213 SIZEDTEXT src, dst;
2214 char *ret = orig;
2216 if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2217 return ret;
2219 src.size = strlen(orig);
2220 src.data = (unsigned char *) orig;
2222 if(!strucmp(charset, "us-ascii")){
2223 size_t i;
2225 for(i = 0; i < src.size; i++)
2226 if(src.data[i] & 0x80)
2227 return NULL;
2229 return ret;
2233 * This works for ISO-2022-JP because of special code in utf8_cstext
2234 * but not for other 2022 charsets.
2236 memset(&dst, 0, sizeof(dst));
2237 if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2238 ret = (char *) dst.data; /* c-client already null terminates it */
2239 else
2240 ret = NULL;
2242 if((unsigned char *) ret != dst.data && dst.data)
2243 fs_give((void **) &dst.data);
2245 return ret;
/*
 * Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with a comma
 *         between each group of three digits, e.g. 1234567 gives
 *         "1,234,567". A negative number gets one leading '-'.
 *         Can use up to 3 comatose results at once; the fourth call
 *         reuses the buffer of the first.
 */
char *
comatose(long int number)
{
    static char    buf[3][50];
    static int     whichbuf = 0;
    char           digits[3 * sizeof(long) + 2];  /* more than enough for any long */
    unsigned long  magnitude;
    char          *b, *end;
    int            ndigits, k;

    whichbuf = (whichbuf + 1) % 3;
    b   = buf[whichbuf];
    end = b + sizeof(buf[0]) - 1;		/* last byte is kept for the null */

    /* take the magnitude in unsigned arithmetic so LONG_MIN can't overflow */
    magnitude = (number < 0) ? 0UL - (unsigned long) number
			     : (unsigned long) number;

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0)
      ndigits = 0;
    else if((size_t) ndigits >= sizeof(digits))
      ndigits = (int) sizeof(digits) - 1;

    if(number < 0 && b < end)
      *b++ = '-';

    for(k = 0; k < ndigits; k++){
	/* a comma goes wherever a multiple of three digits remains */
	if(k > 0 && (ndigits - k) % 3 == 0 && b < end)
	  *b++ = ',';

	if(b < end)
	  *b++ = digits[k];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
/*
 * Like comatose() but leave out the commas: turn a long into its plain
 * decimal string. The result is in static storage; up to 3 results can be
 * in use at once before a buffer is reused.
 */
char *
tose(long int number)
{
    static char  ring[3][50];
    static int   slot = 0;
    char        *out;

    /* move on to the next buffer in the ring */
    slot = (slot + 1) % 3;
    out = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
2311 * line_paint - where the real work of managing what is displayed gets done.
/*
 * Repaint one editable line.
 *
 * Adjusts displ->vbase so that the cursor position `offset` falls inside
 * the displ->dwid cell window, rebuilds the display array displ->dl from
 * the data in displ->vl (with '<' and '>' cues for text that is off
 * screen), writes the cells that differ from displ->olddl through
 * displ->writechar, and finally puts the cursor at `offset` with
 * displ->movecursor.
 *
 * Args: offset -- index of the cursor in displ->vl
 *       displ  -- the line being edited and its display state
 *       passwd -- may be NULL. *passwd >= 10 blanks the field (once; the
 *                 value is then set to 11 and later calls return at
 *                 once). Any other non-zero value shows '*' for each
 *                 character and treats every character as one cell wide.
 */
2313 void
2314 line_paint(int offset, /* current dot offset into vl */
2315 struct display_line *displ,
2316 int *passwd) /* flag to hide display of chars */
2318 int i, w, w2, already_got_one = 0;
2319 int vfirst, vlast, dfirst, dlast, vi, di;
2320 int new_vbase;
2321 unsigned (*width_a_to_b)(UCS *, int, int);
2324 * Set passwd to 10 in caller if you want to conceal the
2325 * password but not print asterisks for feedback.
2327 * Set passwd to 1 in caller to conceal by printing asterisks.
2329 if(passwd && *passwd >= 10){ /* don't show asterisks */
2330 if(*passwd > 10)
2331 return;
2332 else
2333 *passwd = 11; /* only blat once */
/* overwrite the field with spaces, then put the cursor back at its start */
2335 i = 0;
2336 (*displ->movecursor)(displ->row, displ->col);
2337 while(i++ <= displ->dwid)
2338 (*displ->writechar)(' ');
2340 (*displ->movecursor)(displ->row, displ->col);
2341 return;
/* pick the width function: one cell per character when showing asterisks */
2344 if(passwd && *passwd)
2345 width_a_to_b = single_width_chars_a_to_b;
2346 else
2347 width_a_to_b = ucs4_str_width_a_to_b;
2350 * vl is the virtual line (the actual data). We operate on it by typing
2351 * characters to be added and deleting and so forth. In this routine we
2352 * copy a subset of those UCS-4 characters in vl into dl, the display
2353 * array, and show that subset on the screen.
2355 * Offset is the location of the cursor in vl.
2357 * We will display the string starting from vbase.
2358 * We have dwid screen cells to work in.
2359 * We may have to adjust vbase in order to display the
2360 * part of the string that contains the cursor.
2362 * We'll make the display look like
2363 * vl a b c d e f g h i j k l m
2364 * xxxxxxxxxxxxx <- width dwid window
2365 * < d e f g h >
2367 * vbase
2368 * The < will be there if vbase > 0.
2369 * The > will be there if the string from vbase to the
2370 * end can't all fit in the window.
2373 memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2376 * Adjust vbase so offset is not out of the window to the right.
2377 * (The +2 in w + 2 is for a possible " >" if the string goes past
2378 * the right hand edge of the window and if the last visible character
2379 * is double wide. We don't want the offset to be under that > character.)
2381 for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2382 displ->dwid > 1 &&
2383 w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2384 w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2386 * offset is off the window to the right
2387 * It looks like a b c d e f g h
2388 * | |
2389 * vbase offset
2390 * and offset is either past the right edge,
2391 * or right at the right edge (and maybe under >),
2392 * or one before right at the edge (and maybe on space
2393 * for half a character).
2395 * Since the characters may be double width it is slightly
2396 * complicated to figure out how far to increase vbase.
2397 * We're going to scoot over past width w/2 characters and
2398 * then see if that's sufficient.
2400 new_vbase = displ->vbase + 1;
2401 for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2402 w2 < displ->dwid/2;
2403 w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2404 new_vbase++;
2406 displ->vbase = new_vbase;
2409 /* adjust so offset is not out of the window to the left */
2410 while(displ->vbase > 0 && displ->vbase >= offset){
2411 /* add about dwid/2 more width */
2412 new_vbase = displ->vbase - 1;
2413 for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2414 w2 < (displ->dwid+1)/2 && new_vbase > 0;
2415 w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2416 new_vbase--;
/*
 * NOTE(review): the loop below starts with the width measured from
 * new_vbase, but its step expression measures from displ->vbase, so w
 * no longer follows new_vbase after the first pass. This looks
 * unintended -- confirm before relying on or changing it.
 */
2418 /* but don't let it get too small, recheck off right end */
2419 for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2420 w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2421 w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2422 new_vbase++;
2424 displ->vbase = MAX(new_vbase, 0);
/*
 * If only the first character would be scrolled off, and it is one cell
 * wide (or we are showing asterisks), start at 0 instead -- presumably
 * because the '<' cue would take that same single cell.
 */
2427 if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2428 displ->vbase = 0;
/* vfirst..vlast index vl, dfirst..dlast are the matching indexes in dl */
2430 vfirst = displ->vbase;
2431 dfirst = 0;
2432 if(displ->vbase > 0){ /* off screen cue left */
2433 dfirst = 1; /* index which matches vfirst */
2434 displ->dl[0] = '<';
2437 vlast = displ->vused-1; /* end */
2438 w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2440 if(displ->dwid > 0 && w + dfirst > displ->dwid){ /* off window right */
2442 /* find last ucs character to be printed */
2443 while(w + dfirst > displ->dwid - 1) /* -1 for > */
2444 w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2446 /* worry about double-width characters */
2447 if(w + dfirst == displ->dwid - 1){ /* no prob, hit it exactly */
2448 dlast = dfirst + vlast - vfirst + 1; /* +1 for > */
2449 displ->dl[dlast] = '>';
2451 else{
/* one cell short of the '>' column: fill the gap with a space */
2452 dlast = dfirst + vlast - vfirst + 1;
2453 displ->dl[dlast++] = ' ';
2454 displ->dl[dlast] = '>';
2457 else
2458 dlast = dfirst + vlast - vfirst;
2461 * Copy the relevant part of the virtual line into the display line.
2463 for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2464 if(passwd && *passwd)
2465 displ->dl[di] = '*'; /* to conceal password */
2466 else
2467 displ->dl[di] = displ->vl[vi];
2470 * Add spaces to clear the rest of the line.
2471 * We have dwid total space to fill.
2473 w = (*width_a_to_b)(displ->dl, 0, dlast); /* width through dlast */
2474 for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2475 displ->dl[di++] = ' ';
2478 * Draw from left to right, skipping until we get to
2479 * something that is different. Characters may be different
2480 * widths than they were initially so paint from there the
2481 * rest of the way.
2483 for(di = 0; displ->dl[di]; di++){
2484 if(already_got_one || displ->dl[di] != displ->olddl[di]){
2485 /* move cursor first time */
2486 if(!already_got_one++){
2487 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2488 (*displ->movecursor)(displ->row, displ->col + w);
2491 (*displ->writechar)(displ->dl[di]);
2492 displ->olddl[di] = displ->dl[di];
/* zero the part of olddl beyond the end of what dl now holds */
2496 memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2499 * Move the cursor to the offset.
2501 * The offset is relative to the start of the virtual array. We need
2502 * to find the location on the screen. The offset into the display array
2503 * will be offset-vbase+dfirst. We want to be at the start of that
2504 * character, so we need to find the width of all the characters up
2505 * to that point.
2507 w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2509 (*displ->movecursor)(displ->row, displ->col + w);
2514 * This is just like ucs4_str_width_a_to_b() except all of the characters
2515 * are assumed to be of width 1. This is for printing out *'s when user
2516 * enters a password, while still managing to use the same code to do the
2517 * display.
2519 unsigned
2520 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2522 unsigned width = 0;
2523 int i;
2525 if(ucsstr)
2526 for(i = a; i <= b && ucsstr[i]; i++)
2527 width++;
2529 return width;