* Width of characters is not always determined correctly when wcwidth
[alpine.git] / pith / charconv / utf8.c
blob6613f4db9f1d0107e40348d3950a8996a4f2c518
1 #if !defined(lint) && !defined(DOS)
2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
3 #endif
5 /*
6 * ========================================================================
7 * Copyright 2013-2018 Eduardo Chappa
8 * Copyright 2006-2008 University of Washington
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
14 * http://www.apache.org/licenses/LICENSE-2.0
16 * ========================================================================
20 /* includable WITHOUT dependency on c-client */
21 #include "../../c-client/mail.h"
22 #include "../../c-client/utf8.h"
24 #ifdef _WINDOWS
25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
26 #undef ERROR
27 #else
28 #define _XOPEN_SOURCE
29 #endif
31 #include <system.h>
33 #include "../../c-client/fs.h"
35 /* includable WITHOUT dependency on pico */
36 #include "../../pico/keydefs.h"
38 #include "../osdep/collate.h"
39 #include "../filttype.h"
41 #include "utf8.h"
43 #include <stdarg.h>
46 unsigned single_width_chars_a_to_b(UCS *, int, int);
49 static char locale_charmap[50];
51 static int native_utf8;
52 static void *display_data;
54 void
55 init_utf8_display(int utf8, void *rmap)
57 native_utf8 = utf8;
58 display_data = rmap;
63 * Argument is a UCS-4 wide character.
64 * Returns the environment dependent cell width of the
65 * character when printed to the screen.
66 * This will be -1 if the character is not printable.
67 * It will be >= zero if it is printable.
69 * Note that in the case it is not printable but it is still sent to
70 * Writechar, Writechar will print a '?' with width 1.
72 int
73 wcellwidth(UCS ucs)
75 char dummy[32];
76 long w;
79 * We believe that on modern unix systems wchar_t is a UCS-4 character.
80 * That's the assumption here.
83 if(native_utf8){ /* display is UTF-8 capable */
84 w = ucs4_width((unsigned long) ucs);
85 return((w & U4W_ERROR) ? -1 : w);
87 else if(display_data){
88 if(wtomb(dummy, ucs) < 0)
89 return(-1);
90 else{
91 w = ucs4_width((unsigned long) ucs);
92 return((w & U4W_ERROR) ? -1 : w);
95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
96 else
97 return(wcwidth((wchar_t) ucs));
98 #else
99 return(0);
100 #endif
103 /* ambiguous width zone character function. We use the Windows code until
104 * we find a better way to do it in general.
107 pith_ucs4width(UCS ucs)
109 return (ucs >= 0x2100) ? 2 : 1;
110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
111 return wcwidth((wchar_t) ucs);
112 #else
113 return (ucs >= 0x2100) ? 2 : 1;
114 #endif /* _WINDOWS */
118 * Argument is a UCS-4 wide character.
119 * It is converted to the multibyte version (for example UTF8 or EUC-JP).
120 * Dest is a buffer at least xx chars wide where the multi-byte version
121 * of the wide character will be written.
122 * The returned value is the number of bytes written to dest or -1
123 * if the conversion can't be done.
126 wtomb(char *dest, UCS ucs)
129 * We believe that on modern unix systems wchar_t is a UCS-4 character.
130 * That's the assumption here.
133 if(native_utf8){
134 unsigned char *newdptr;
136 newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
137 return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
139 else if(display_data){
140 unsigned long ucs4;
141 int ret;
143 ucs4 = (unsigned long) ucs;
144 ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
145 if(ret >= 0)
146 ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
147 else
148 ret = -1;
150 return(ret);
152 else
153 return(wcrtomb(dest, (wchar_t) ucs, NULL));
158 * This function does not necessarily update inputp and remaining_octets, so
159 * don't rely on that. The c-client version does but the other doesn't.
162 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
164 UCS ucs;
166 if(input_cs){
167 CHARSET *cast_input_cs;
169 cast_input_cs = (CHARSET *) input_cs;
171 switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
172 case U8G_ENDSTRG:
173 case U8G_ENDSTRI:
174 return(CCONV_NEEDMORE);
176 default:
177 if(ucs & U8G_ERROR || ucs == UBOGON)
178 return(CCONV_BADCHAR);
180 return(ucs);
183 else{
184 size_t ret;
185 wchar_t w;
188 * Warning: input_cs and remaining_octets are unused in this
189 * half of the if/else.
191 * Unfortunately, we can't tell the difference between a source string
192 * that is just not long enough and one that has characters that can't
193 * be converted even though it is long enough. We return NEEDMORE in both cases.
195 ret = mbstowcs(&w, (char *) (*inputp), 1);
196 if(ret == (size_t)(-1))
197 return(CCONV_NEEDMORE);
198 else{
199 ucs = (UCS) w;
200 return(ucs);
206 void
207 set_locale_charmap(char *charmap)
209 if(charmap){
210 strncpy(locale_charmap, charmap, sizeof(locale_charmap));
211 locale_charmap[sizeof(locale_charmap)-1] = '\0';
213 else
214 locale_charmap[0] = '\0';
219 * This ensures that the string is UTF-8. If str is already a UTF-8 string,
220 * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
221 * The caller is responsible for freeing the returned value.
223 * Args str -- the string to convert
225 char *
226 convert_to_utf8(char *str, char *fromcharset, int flags)
228 char *ret = NULL;
229 char *fcharset;
230 SIZEDTEXT src, result;
231 const CHARSET *cs;
232 int try;
234 src.data = (unsigned char *) str;
235 src.size = strlen(str);
237 /* already UTF-8, return NULL */
238 if(!(flags & CU8_NOINFER)
239 && (cs = utf8_infercharset(&src))
240 && (cs->type == CT_ASCII || cs->type == CT_UTF8))
241 return(ret);
243 try = 1;
244 while(try < 5){
245 switch(try){
246 case 1:
247 fcharset = fromcharset;
248 if(fcharset && strucmp("UTF-8", fcharset) != 0)
249 break; /* give it a try */
250 else
251 try++; /* fall through */
253 case 2:
254 if(!(flags & CU8_NOINFER)){
255 fcharset = cs ? cs->name : NULL;
256 if(fcharset && strucmp("UTF-8", fcharset) != 0)
257 break;
258 else
259 try++; /* fall through */
261 else
262 try++; /* fall through */
264 case 3:
265 fcharset = locale_charmap;
266 if(fcharset && strucmp("UTF-8", fcharset) != 0)
267 break;
268 else
269 try++; /* fall through */
271 default:
272 fcharset = "ISO-8859-1"; /* this will "work" */
273 break;
276 memset(&result, 0, sizeof(result));
278 if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
279 if(!(result.size == src.size && result.data == src.data)){
280 ret = (char *) fs_get((result.size+1) * sizeof(char));
281 strncpy(ret, (char *) result.data, result.size);
282 ret[result.size] = '\0';
284 /* else no conversion necessary */
286 if(result.data)
287 fs_give((void **) &result.data);
288 result.size = 0;
290 return(ret);
293 try++;
296 /* won't make it to here */
297 return(ret);
302 * Convert from UTF-8 to user's locale charset.
303 * This actually uses the wtomb routine to do the conversion, and that
304 * relies on setup_for_input_output having been called.
305 * If no conversion is necessary, NULL is returned, otherwise an allocated
306 * string in the locale charset is returned and the caller is responsible
307 * for freeing it.
309 char *
310 convert_to_locale(char *utf8str)
312 #define CHNK 500
313 char *inp, *retp, *ret = NULL;
314 CBUF_S cb;
315 int r, alloced;
317 if(native_utf8 || !utf8str || !utf8str[0])
318 return(NULL);
320 cb.cbuf[0] = '\0';
321 cb.cbufp = cb.cbufend = cb.cbuf;
322 inp = utf8str;
324 alloced = CHNK;
325 ret = (char *) fs_get(alloced * sizeof(char));
326 retp = ret;
329 * There's gotta be a better way to do this but utf8_to_locale was
330 * available and everything looks like a nail when all you have
331 * is a hammer.
333 while(*inp){
335 * We're placing the outgoing stream of characters in ret, a multi-byte
336 * array of characters in the user's locale charset. See if there is
337 * enough room for the next wide characters worth of output chars
338 * and allocate more space if not.
340 if((alloced - (retp-ret)) < MAX(MB_LEN_MAX,32)){
341 alloced += CHNK;
342 fs_resize((void **) &ret, alloced * sizeof(char));
345 r = utf8_to_locale((int) *inp++, &cb,
346 (unsigned char *) retp, alloced-(retp-ret));
348 retp += r;
351 *retp = '\0';
353 fs_resize((void **) &ret, strlen(ret)+1);
355 return(ret);
360 * Pass in a stream of UTF-8 characters in 'c' and return obuf
361 * filled in with multi-byte characters. The return value is the
362 * number of valid characters in obuf to be used.
365 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
367 int outchars = 0;
369 if(!(cb && cb->cbufp))
370 return(0);
372 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
373 unsigned char *inputp;
374 unsigned long remaining_octets;
375 UCS ucs;
377 *(cb->cbufp)++ = (unsigned char) c;
378 inputp = cb->cbuf;
379 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
380 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
382 switch(ucs){
383 case U8G_ENDSTRG: /* incomplete character, wait */
384 case U8G_ENDSTRI: /* incomplete character, wait */
385 break;
387 default:
388 if(ucs & U8G_ERROR || ucs == UBOGON){
390 * None of these cases is supposed to happen. If it
391 * does happen then the input stream isn't UTF-8
392 * so something is wrong. Treat each character in the
393 * input buffer as a separate error character and
394 * print a '?' for each.
396 for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
397 obuf[outchars++] = '?';
399 cb->cbufp = cb->cbuf;
401 else{
402 if(ucs >= 0x80 && wcellwidth(ucs) < 0){
404 * This happens when we have a UTF-8 character that
405 * we aren't able to print in our locale. For example,
406 * if the locale is setup with the terminal
407 * expecting ISO-8859-1 characters then there are
408 * lots of UTF-8 characters that can't be printed.
409 * Print a '?' instead.
411 obuf[outchars++] = '?';
413 else{
415 * Convert the ucs into the multibyte
416 * character that corresponds to the
417 * ucs in the users locale.
419 outchars = wtomb((char *) obuf, ucs);
420 if(outchars < 0){
421 obuf[0] = '?';
422 outchars = 1;
426 /* update the input buffer */
427 if(inputp >= cb->cbufp) /* this should be the case */
428 cb->cbufp = cb->cbuf;
429 else{ /* extra chars for some reason? */
430 unsigned char *q, *newcbufp;
432 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
433 q = cb->cbuf;
434 while(inputp < cb->cbufp)
435 *q++ = *inputp++;
437 cb->cbufp = newcbufp;
441 break;
444 else{ /* error */
445 obuf[0] = '?';
446 outchars = 1;
447 cb->cbufp = cb->cbuf; /* start over */
450 return(outchars);
455 * Returns the screen cells width of the UCS-4 string argument.
456 * The source string is zero terminated.
458 unsigned
459 ucs4_str_width(UCS *ucsstr)
461 unsigned width = 0;
462 int w;
464 if(ucsstr)
465 while(*ucsstr){
466 w = wcellwidth(*ucsstr++);
467 if(w != U4W_CTLSRGT)
468 width += (w < 0 ? 1 : w);
471 return width;
476 * Returns the screen cells width of the UCS-4 string argument
477 * from ucsstr[a] through (inclusive) ucsstr[b].
478 * No checking is done to make sure a starts in the middle
479 * of a UCS-4 array.
481 unsigned
482 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
484 unsigned width = 0;
485 int i, w;
487 if(ucsstr)
488 for(i = a; i <= b && ucsstr[i]; i++){
489 w = wcellwidth(ucsstr[i]);
490 if(w != U4W_CTLSRGT)
491 width += (w < 0 ? 1 : w);
494 return width;
499 * Returns the screen cells width of the UCS-4 string argument
500 * from ustart through (exclusive) uend.
501 * No checking is done to make sure it starts in the middle
502 * of a UCS-4 array.
504 unsigned
505 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
507 UCS *u;
508 unsigned width = 0;
509 int w;
511 if(!ustart)
512 return width;
514 if(ustart)
515 for(u = ustart; u < uend; u++){
516 w = wcellwidth(*u);
517 if(w != U4W_CTLSRGT)
518 width += (w < 0 ? 1 : w);
521 return(width);
526 * Return the largest possible pointer into ucs4str so that the width
527 * of the string from ucs4str to the pointer (exclusive)
528 * is maxwidth or less. Also stops at a null character.
530 UCS *
531 ucs4_particular_width(UCS *ucs4str, int maxwidth)
533 UCS *u;
534 int w_consumed = 0, w, done = 0;
536 u = ucs4str;
538 if(u)
539 while(!done && *u && w_consumed <= maxwidth){
540 w = wcellwidth(*u);
541 w = (w >= 0 ? w : 1);
542 if(w_consumed + w <= maxwidth){
543 w_consumed += w;
544 ++u;
546 else
547 ++done;
550 return(u);
555 * Convert and copy a UTF-8 string into a UCS-4 NULL
556 * terminated array. Just like cpystr only it converts
557 * from UTF-8 to UCS-4.
559 * Returned UCS-4 string needs to be freed by caller.
561 UCS *
562 utf8_to_ucs4_cpystr(char *utf8src)
564 size_t retsize;
565 UCS *ret = NULL;
566 UCS ucs;
567 unsigned long remaining_octets;
568 unsigned char *readptr;
569 size_t arrayindex;
572 * We don't know how big to allocate the return array
573 * because variable numbers of octets in the src array
574 * will combine to make UCS-4 characters. The number of
575 * UCS-4 characters is less than or equal to the number
576 * of src characters, though.
579 if(!utf8src)
580 return NULL;
582 retsize = strlen(utf8src) + 1;
584 ret = (UCS *) fs_get(retsize * sizeof(*ret));
585 memset(ret, 0, retsize * sizeof(*ret));
587 readptr = (unsigned char *) utf8src;
588 remaining_octets = retsize-1;
589 arrayindex = 0;
591 while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
592 ucs = (UCS) utf8_get(&readptr, &remaining_octets);
594 if(ucs & U8G_ERROR || ucs == UBOGON)
595 remaining_octets = 0;
596 else
597 ret[arrayindex++] = ucs;
600 ret[arrayindex] = '\0';
602 /* get rid of excess size */
603 if(arrayindex+1 < retsize)
604 fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
606 return ret;
611 * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
612 * terminated string. Just like cpystr only it converts
613 * from UCS-4 to UTF-8.
615 * Returned UTF-8 string needs to be freed by caller.
617 char *
618 ucs4_to_utf8_cpystr(UCS *ucs4src)
620 unsigned char *ret = NULL;
621 unsigned char *writeptr;
622 int i;
624 if(!ucs4src)
625 return NULL;
628 * Over-allocate and then resize at the end.
631 /* count characters in source */
632 for(i = 0; ucs4src[i]; i++)
635 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
636 memset(ret, 0, (6*i + 1) * sizeof(*ret));
638 writeptr = ret;
639 for(i = 0; ucs4src[i]; i++)
640 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
642 /* get rid of excess size */
643 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
645 return ((char *) ret);
650 * Similar to above but copy a fixed number of source
651 * characters instead of going until null terminator.
653 char *
654 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
656 unsigned char *ret = NULL;
657 unsigned char *writeptr;
658 int i;
660 if(!ucs4src)
661 return NULL;
664 * Over-allocate and then resize at the end.
667 ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
668 memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
670 writeptr = ret;
671 for(i = 0; i < ucs4src_len; i++)
672 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
674 /* get rid of excess size */
675 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
677 return ((char *) ret);
681 #ifdef _WINDOWS
683 * Convert a UTF-8 argument into an LPTSTR version
684 * of that argument. The result is allocated here
685 * and should be freed by the caller.
687 LPTSTR
688 utf8_to_lptstr(LPSTR arg_utf8)
690 int lptstr_len;
691 LPTSTR lptstr_ret = NULL;
693 lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
694 if(lptstr_len > 0)
696 lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
697 lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
698 arg_utf8, -1, lptstr_ret, lptstr_len );
701 if(!lptstr_len)
703 /* check GetLastError()? */
704 lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
705 lptstr_ret[0] = 0;
708 return lptstr_ret;
713 * Convert an LPTSTR argument into a UTF-8 version
714 * of that argument. The result is allocated here
715 * and should be freed by the caller.
717 LPSTR
718 lptstr_to_utf8(LPTSTR arg_lptstr)
720 int utf8str_len;
721 LPSTR utf8str_ret = NULL;
723 utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
724 if(utf8str_len > 0)
726 utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
727 utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
728 arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
731 if(!utf8str_len)
733 /* check GetLastError()? */
734 utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
735 utf8str_ret[0] = 0;
738 return utf8str_ret;
743 * Convert a UCS4 argument into an LPTSTR version
744 * of that argument. The result is allocated here
745 * and should be freed by the caller.
747 LPTSTR
748 ucs4_to_lptstr(UCS *arg_ucs4)
750 LPTSTR ret_lptstr = NULL;
751 size_t len;
752 size_t i;
754 if(arg_ucs4){
755 len = ucs4_strlen(arg_ucs4);
756 ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
757 /* bogus conversion ignores UTF-16 */
758 for(i = 0; i < len; i++)
759 ret_lptstr[i] = arg_ucs4[i];
761 ret_lptstr[len] = '\0';
764 return(ret_lptstr);
769 * Convert an LPTSTR argument into a UCS4 version
770 * of that argument. The result is MemAlloc'd here
771 * and should be freed by the caller.
773 UCS *
774 lptstr_to_ucs4(LPTSTR arg_lptstr)
776 UCS *ret_ucs4 = NULL;
777 size_t len;
778 size_t i;
780 if(arg_lptstr){
781 len = _tcslen(arg_lptstr);
782 ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
783 /* bogus conversion ignores UTF-16 */
784 for(i = 0; i < len; i++)
785 ret_ucs4[i] = arg_lptstr[i];
787 ret_ucs4[len] = '\0';
790 return(ret_ucs4);
793 #endif /* _WINDOWS */
797 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
798 * 1-at-a-time filled in with UCS characters. The return value is the
799 * number of valid characters in obuf to be used. It can only
800 * be 1 or 0 characters since we're only getting one UTF-8 character
801 * at a time.
804 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
806 int width = 0, outchars = 0;
808 if(!(cb && cb->cbufp))
809 return(0);
811 if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
812 unsigned char *inputp;
813 unsigned long remaining_octets;
814 UCS ucs;
816 *cb->cbufp++ = (unsigned char) c;
817 inputp = cb->cbuf;
818 remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
819 ucs = (UCS) utf8_get(&inputp, &remaining_octets);
821 switch(ucs){
822 case U8G_ENDSTRG: /* incomplete character, wait */
823 case U8G_ENDSTRI: /* incomplete character, wait */
824 break;
826 default:
827 if(ucs & U8G_ERROR || ucs == UBOGON){
829 * None of these cases is supposed to happen. If it
830 * does happen then the input stream isn't UTF-8
831 * so something is wrong.
833 outchars++;
834 *obuf = '?';
835 cb->cbufp = cb->cbuf;
836 width = 1;
838 else{
839 outchars++;
840 if(ucs < 0x80 && ucs >= 0x20)
841 width = 1;
843 if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
845 * This happens when we have a UTF-8 character that
846 * we aren't able to print in our locale. For example,
847 * if the locale is setup with the terminal
848 * expecting ISO-8859-1 characters then there are
849 * lots of UTF-8 characters that can't be printed.
850 * Print a '?' instead.
851 * Don't think this should happen in Windows.
853 *obuf = '?';
855 else{
856 *obuf = ucs;
859 /* update the input buffer */
860 if(inputp >= cb->cbufp) /* this should be the case */
861 cb->cbufp = cb->cbuf;
862 else{ /* extra chars for some reason? */
863 unsigned char *q, *newcbufp;
865 newcbufp = (cb->cbufp - inputp) + cb->cbuf;
866 q = cb->cbuf;
867 while(inputp < cb->cbufp)
868 *q++ = *inputp++;
870 cb->cbufp = newcbufp;
874 break;
877 else{ /* error */
878 *obuf = '?';
879 outchars = 1;
880 width = 1;
881 cb->cbufp = cb->cbuf; /* start over */
884 if(obufwidth)
885 *obufwidth = width;
887 return(outchars);
892 * Return an allocated copy of a zero-terminated UCS-4 string.
894 UCS *
895 ucs4_cpystr(UCS *ucs4src)
897 size_t arraysize;
898 UCS *ret = NULL;
899 size_t i;
901 if(!ucs4src)
902 return NULL;
904 arraysize = ucs4_strlen(ucs4src);
906 ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
907 memset(ret, 0, (arraysize+1) * sizeof(*ret));
909 for(i = 0; i < arraysize; i++)
910 ret[i] = ucs4src[i];
912 return ret;
916 UCS *
917 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
919 size_t i;
921 if(ucs4src && ucs4dst){
922 for(i = 0; i < n; i++){
923 ucs4dst[i] = ucs4src[i];
924 if(ucs4dst[i] == '\0')
925 break;
929 return ucs4dst;
933 UCS *
934 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
936 size_t i;
937 UCS *u;
939 if(ucs4src && ucs4dst){
940 for(u = ucs4dst; *u; u++)
943 for(i = 0; i < n; i++){
944 u[i] = ucs4src[i];
945 if(u[i] == '\0')
946 break;
949 if(i == n)
950 u[i] = '\0';
953 return ucs4dst;
958 * Like strlen only this returns the number of non-zero characters
959 * in a zero-terminated UCS-4 array.
961 size_t
962 ucs4_strlen(UCS *ucs4str)
964 size_t i = 0;
966 if(ucs4str)
967 while(ucs4str[i])
968 i++;
970 return(i);
975 ucs4_strcmp(UCS *s1, UCS *s2)
977 for(; *s1 == *s2; s1++, s2++)
978 if(*s1 == '\0')
979 return 0;
981 return((*s1 < *s2) ? -1 : 1);
985 UCS *
986 ucs4_strchr(UCS *s, UCS c)
988 if(!s)
989 return NULL;
991 while(*s && *s != c)
992 s++;
994 if(*s || !c)
995 return s;
996 else
997 return NULL;
1001 UCS *
1002 ucs4_strrchr(UCS *s, UCS c)
1004 UCS *ret = NULL;
1006 if(!s)
1007 return ret;
1009 while(*s){
1010 if(*s == c)
1011 ret = s;
1013 s++;
1016 return ret;
1021 * Returns the screen cells width of the UTF-8 string argument.
1023 unsigned
1024 utf8_width(char *str)
1026 unsigned width = 0;
1027 int this_width;
1028 UCS ucs;
1029 unsigned long remaining_octets;
1030 char *readptr;
1032 if(!(str && *str))
1033 return(width);
1035 readptr = str;
1036 remaining_octets = readptr ? strlen(readptr) : 0;
1038 while(remaining_octets > 0 && *readptr){
1040 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1042 if(ucs & U8G_ERROR || ucs == UBOGON){
1044 * This should not happen, but do something to handle it anyway.
1045 * Treat each character as a single width character, which is what should
1046 * probably happen when we actually go to write it out.
1048 remaining_octets--;
1049 readptr++;
1050 this_width = 1;
1052 else{
1053 this_width = wcellwidth(ucs);
1056 * If this_width is -1 that means we can't print this character
1057 * with our current locale. Writechar will print a '?'.
1059 if(this_width < 0)
1060 this_width = 1;
1063 width += (unsigned) this_width;
1066 return(width);
1071 * Copy UTF-8 characters from src into dst.
1072 * This is intended to be used if you want to truncate a string at
1073 * the start instead of the end. For example, you have a long string
1074 * like
1075 * this_is_a_long_string
1076 * but not enough space to fit it into a particular field. You want to
1077 * end up with
1078 * s_a_long_string
1079 * where that fits in a particular width. Perhaps you'd use this with ...
1080 * to get
1081 * ...s_a_long_string
1082 * This right adjusts the end of the string in the width space and
1083 * cuts it off at the start. If there is enough width for the whole
1084 * string it will copy the string into dst with no padding.
1086 * Copy enough characters so that the result will have screen width of
1087 * want_width screen cells in current locale.
1089 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1090 * to dst. This is just for protection, it shouldn't be relied on to
1091 * do anything useful. Dstlen should be large enough. Otherwise you'll get
1092 * characters truncated in the middle or something like that.
1094 * Returned value is the number of bytes written to dst, not including
1095 * the possible terminating null.
1097 * If we can't hit want_width exactly because of double width characters
1098 * then we will pad the end of the string with space in order to make
1099 * the width exact.
1101 size_t
1102 utf8_to_width_rhs(char *dst, /* destination buffer */
1103 char *src, /* source string */
1104 size_t dstlen, /* space in dest */
1105 unsigned want_width) /* desired screen width */
1107 int this_width;
1108 unsigned width_consumed = 0;
1109 UCS ucs;
1110 unsigned long remaining_octets;
1111 char *readptr, *goodreadptr, *savereadptr, *endptr;
1112 size_t nb = 0;
1114 if(!src){
1115 if(dstlen > 0)
1116 dst[0] = '\0';
1118 return nb;
1122 * Start at the end of the source string and go backwards until we
1123 * get to the desired width, but not more than the width.
1125 readptr = src + strlen(src);
1126 endptr = readptr;
1127 goodreadptr = readptr;
1128 width_consumed = 0;
1129 savereadptr = readptr;
1131 for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1132 readptr = savereadptr-1){
1134 savereadptr = readptr;
1135 remaining_octets = goodreadptr - readptr;
1136 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1139 * Handling the error case is tough because an error will be the normal thing that
1140 * happens as we back through the string. So we're just going to punt on the
1141 * error for now.
1143 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1144 if(remaining_octets > 0){
1146 * This means there are some bad octets after this good
1147 * character so things are not going to work out well.
1148 * Bail out.
1150 savereadptr = src; /* we're done */
1152 else{
1153 this_width = wcellwidth(ucs);
1155 if(this_width < 0)
1156 this_width = 1;
1158 if(width_consumed + (unsigned) this_width <= want_width){ /* ok */
1159 width_consumed += (unsigned) this_width;
1160 goodreadptr = savereadptr;
1162 else
1163 savereadptr = src; /* we're done */
1169 * Copy characters from goodreadptr to endptr into dst.
1171 nb = MIN(endptr-goodreadptr, dstlen-1);
1172 strncpy(dst, goodreadptr, nb);
1173 dst[nb] = '\0';
1176 * Pad out with spaces in order to hit width exactly.
1178 while(width_consumed < want_width && nb < dstlen-1){
1179 dst[nb++] = ' ';
1180 dst[nb] = '\0';
1181 width_consumed++;
1184 return nb;
1189 * The arguments being converted are UTF-8 strings.
1190 * This routine attempts to make it possible to use screen cell
1191 * widths in a format specifier. In a one-byte per screen cell
1192 * world we might have used %10.10s to cause a string to occupy
1193 * 10 screen positions. Since the width and precision are really
1194 * referring to numbers of bytes instead of screen positions that
1195 * won't work with UTF-8 input. We emulate that behavior with
1196 * the format string %w. %m.nw means to use the m and n as
1197 * screen width indicators instead of bytes indicators.
1199 * There is no reason to use this routine unless you want to use
1200 * min field with or precision with the specifier. A plain %w without
1201 * widths is equivalent exactly to a plain %s in a regular printf.
1203 * Double-width characters complicate things. It may not be possible
1204 * to satisfy the request exactly. For example, %3w for an input
1205 * string that is made up of two double-width characters.
1206 * This routine will arbitrarily use a trailing space character if
1207 * needed to make the width come out correctly where a half of a
1208 * double-width character would have been needed. We'll see how
1209 * that works for us.
1211 * %w only works for strings (it's a %s replacement).
1213 * Buffer overflow is handled by the size argument. %.30s will work
1214 * to limit a particular string to 30 bytes, but you lose that
1215 * ability with %w, since it may write more than precision bytes
1216 * in order to get to the desired width. It is best to choose
1217 * size large enough so that it doesn't come into play, otherwise
1218 * it may be possible to get partial UTF-8 characters because of
1219 * the truncation.
1221 * The return value isn't quite the same as the return value
1222 * of snprintf. It is the number of bytes written, not counting
1223 * the trailing null, just like snprintf. However, if it is
1224 * truncated due to size then the output is size, not the
1225 * number of characters that would have been written.
1228 utf8_snprintf(char *dest, size_t size, char *fmt, ...)
1230 char newfmt[100], buf[20], *q, *pdest, *width_str, *end;
1231 char *start_of_specifier;
1232 char *input_str;
1233 int int_arg;
1234 double double_arg;
1235 void *ptr_arg;
1236 unsigned got_width;
1237 int more_flags, ret, w;
1238 int min_field_width, field_precision, modifier;
1239 int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
1240 va_list args;
1242 newfmt[0] = '\0';
1243 q = newfmt;
1245 pdest = dest;
1247 #define IS_ROOM_IN_DEST(n_more_chars) \
1248 ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)
1251 * Strategy: Look through the fmt string for %w's. Replace the
1252 * %w's in the format string with %s's but with possibly different
1253 * width and precision arguments which will make it come out right.
1254 * Then call the regular system vsnprintf with the altered format
1255 * string but same arguments.
1257 * That would be nice but it doesn't quite work. Why? Because a
1258 * %*w will need to have the value in the integer argument the *
1259 * refers to modified. Can't do it as far as I can tell. Or we could
1260 * remove the integer argument somehow before calling printf. Can't
1261 * do it. Or we could somehow add an additional conversion specifier
1262 * that caused nothing to be printed but ate up the integer arg.
1263 * Can't figure out how to do that either.
1265 * Since we can't figure out how to do it, the alternative is to
1266 * construct the result one piece at a time, pasting together the
1267 * pieces from the different conversions.
1269 va_start(args, fmt);
1271 while(*fmt && IS_ROOM_IN_DEST(1)){
1272 if(*fmt == '%'){
1273 start_of_specifier = fmt++;
1275 min_field_width = field_precision = -1;
1276 flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
1278 /* flags */
1279 more_flags = 1;
1280 while(more_flags){
1281 switch(*fmt){
1282 case '-':
1283 flags_minus++;
1284 fmt++;
1285 break;
1287 case '+':
1288 flags_plus++;
1289 fmt++;
1290 break;
1292 case ' ':
1293 flags_space++;
1294 fmt++;
1295 break;
1297 case '0':
1298 flags_zero++;
1299 fmt++;
1300 break;
1302 case '#':
1303 flags_pound++;
1304 fmt++;
1305 break;
1307 default:
1308 more_flags = 0;
1309 break;
1313 /* minimum field width */
1314 if(*fmt == '*'){
1315 min_field_width = va_arg(args, int);
1316 fmt++;
1318 else if(*fmt >= '0' && *fmt <= '9'){
1319 width_str = fmt;
1320 while (*fmt >= '0' && *fmt <= '9')
1321 fmt++;
1323 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1324 if(sizeof(buf) > fmt-width_str)
1325 buf[fmt-width_str] = '\0';
1327 buf[sizeof(buf)-1] = '\0';
1329 min_field_width = atoi(width_str);
1332 /* field precision */
1333 if(*fmt == '.'){
1334 fmt++;
1335 if(*fmt == '*'){
1336 field_precision = va_arg(args, int);
1337 fmt++;
1339 else if(*fmt >= '0' && *fmt <= '9'){
1340 width_str = fmt;
1341 while (*fmt >= '0' && *fmt <= '9')
1342 fmt++;
1344 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1345 if(sizeof(buf) > fmt-width_str)
1346 buf[fmt-width_str] = '\0';
1348 buf[sizeof(buf)-1] = '\0';
1350 field_precision = atoi(width_str);
1354 /* length modifier */
1355 if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
1356 modifier = *fmt++;
1358 /* conversion character */
1359 switch(*fmt){
1360 case 'w':
1362 * work with va_arg(char *) to figure out width
1363 * and precision needed to produce the screen width
1364 * and precision asked for in %w using some of the
1365 * utf8 width routines we have.
1368 input_str = va_arg(args, char *);
1369 if(field_precision >=0 || min_field_width >= 0)
1370 w = utf8_width(input_str);
1372 if(field_precision >= 0){
1373 if(w <= field_precision)
1374 field_precision = -1; /* print it all */
1375 else{
1377 * We need to cut off some of the input_str
1378 * in this case.
1380 end = utf8_count_forw_width(input_str, field_precision, &got_width);
1381 field_precision = (int) (end - input_str);
1382 /* new w with this field_precision */
1383 w = got_width;
1387 /* need some padding */
1388 if(min_field_width >= 0)
1389 min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
1390 MAX(0, min_field_width - w);
1393 * Now we just need to get the new format string
1394 * set correctly in newfmt.
1396 q = newfmt;
1397 if(q-newfmt < sizeof(newfmt))
1398 *q++ = '%';
1400 if(flags_minus && q-newfmt < sizeof(newfmt))
1401 *q++ = '-';
1402 if(flags_plus && q-newfmt < sizeof(newfmt))
1403 *q++ = '+';
1404 if(flags_space && q-newfmt < sizeof(newfmt))
1405 *q++ = ' ';
1406 if(flags_zero && q-newfmt < sizeof(newfmt))
1407 *q++ = '0';
1408 if(flags_pound && q-newfmt < sizeof(newfmt))
1409 *q++ = '#';
1411 if(min_field_width >= 0){
1412 snprintf(buf, sizeof(buf), "%d", min_field_width);
1413 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1416 if(field_precision >= 0){
1417 if(q-newfmt < sizeof(newfmt))
1418 *q++ = '.';
1420 snprintf(buf, sizeof(buf), "%d", field_precision);
1421 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1424 if(q-newfmt < sizeof(newfmt))
1425 *q++ = 's';
1427 if(q-newfmt < sizeof(newfmt))
1428 *q++ = '\0';
1430 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1431 pdest += strlen(pdest);
1433 break;
1435 case '\0':
1436 fmt--;
1437 break;
1439 default:
1440 /* make a new format which leaves out the dynamic '*' arguments */
1441 q = newfmt;
1442 if(q-newfmt < sizeof(newfmt))
1443 *q++ = '%';
1445 if(flags_minus && q-newfmt < sizeof(newfmt))
1446 *q++ = '-';
1447 if(flags_plus && q-newfmt < sizeof(newfmt))
1448 *q++ = '+';
1449 if(flags_space && q-newfmt < sizeof(newfmt))
1450 *q++ = ' ';
1451 if(flags_zero && q-newfmt < sizeof(newfmt))
1452 *q++ = '0';
1453 if(flags_pound && q-newfmt < sizeof(newfmt))
1454 *q++ = '#';
1456 if(min_field_width >= 0){
1457 snprintf(buf, sizeof(buf), "%d", min_field_width);
1458 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1461 if(field_precision >= 0){
1462 if(q-newfmt < sizeof(newfmt))
1463 *q++ = '.';
1465 snprintf(buf, sizeof(buf), "%d", field_precision);
1466 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1469 if(q-newfmt < sizeof(newfmt))
1470 *q++ = *fmt;
1472 if(q-newfmt < sizeof(newfmt))
1473 *q++ = '\0';
1475 switch(*fmt){
1476 case 'd': case 'i': case 'o':
1477 case 'x': case 'X': case 'u': case 'c':
1478 int_arg = va_arg(args, int);
1479 snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
1480 pdest += strlen(pdest);
1481 break;
1483 case 's':
1484 input_str = va_arg(args, char *);
1485 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1486 pdest += strlen(pdest);
1487 break;
1489 case 'f': case 'e': case 'E':
1490 case 'g': case 'G':
1491 double_arg = va_arg(args, double);
1492 snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
1493 pdest += strlen(pdest);
1494 break;
1496 case 'p':
1497 ptr_arg = va_arg(args, void *);
1498 snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
1499 pdest += strlen(pdest);
1500 break;
1502 case '%':
1503 if(IS_ROOM_IN_DEST(1))
1504 *pdest++ = '%';
1506 break;
1508 default:
1509 /* didn't think of this type */
1510 assert(0);
1511 break;
1514 break;
1517 fmt++;
1519 else{
1520 if(IS_ROOM_IN_DEST(1))
1521 *pdest++ = *fmt++;
1525 ret = pdest - dest;
1527 if(IS_ROOM_IN_DEST(1))
1528 *pdest++ = '\0';
1530 va_end(args);
1532 return ret;
1537 * Copy UTF-8 characters from src into dst.
1538 * Copy enough characters so that the result will have (<=) screen width of
1539 * want_width screen cells in current locale.
1541 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1542 * to dst.
1544 * Returned value is the number of bytes written to dst, not including
1545 * the possible terminating null.
1546 * Got_width is another returned value. It is the width in screen cells of
1547 * the string placed in dst. It will be the same as want_width if there
1548 * are enough characters in the src to do that and if the character widths
1549 * hit the width exactly. It will be less than want_width if we run out
1550 * of src characters or if the next character width would skip over the
1551 * width we want, because it is double width.
1553 * Zero width characters are collected and included at the end of the string.
1554 * That is, if we make it to want_width but there is still a zero length
1555 * character sitting in src, we add that to dst. This might be an accent
1556 * or something like that.
1558 size_t
1559 utf8_to_width(char *dst, /* destination buffer */
1560 char *src, /* source string */
1561 size_t dstlen, /* space in dst */
1562 unsigned want_width, /* desired screen width */
1563 unsigned *got_width) /* returned screen width in dst */
1565 int this_width;
1566 unsigned width_consumed = 0;
1567 UCS ucs;
1568 unsigned long remaining_octets;
1569 char *writeptr, *readptr, *savereadptr, *endptr;
1570 int ran_out_of_space = 0;
1572 readptr = src;
1574 remaining_octets = readptr ? strlen(readptr) : 0;
1576 writeptr = dst;
1577 endptr = writeptr + dstlen;
1579 if(readptr && writeptr){
1580 while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1581 savereadptr = readptr;
1582 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1584 if(ucs & U8G_ERROR || ucs == UBOGON)
1585 remaining_octets = 0;
1586 else{
1587 this_width = wcellwidth(ucs);
1590 * If this_width is -1 that means we can't print this character
1591 * with our current locale. Writechar will print a '?'.
1593 if(this_width < 0)
1594 this_width = 1;
1596 if(width_consumed + (unsigned) this_width <= want_width){
1597 /* append this utf8 character to dst if it will fit */
1598 if(writeptr + (readptr - savereadptr) < endptr){
1599 width_consumed += this_width;
1600 while(savereadptr < readptr)
1601 *writeptr++ = *savereadptr++;
1603 else
1604 ran_out_of_space++; /* no more utf8 to dst */
1606 else
1607 remaining_octets = 0; /* we're done */
1611 if(writeptr < endptr)
1612 *writeptr = '\0';
1615 if(got_width)
1616 *got_width = width_consumed;
1618 return(writeptr ? (writeptr - dst) : 0);
1623 * Str is a UTF-8 string.
1624 * Count forward width screencell positions and return a pointer to the
1625 * end of the string that is width wide.
1626 * The returned pointer points at the next character (where the null would
1627 * be placed).
1629 * Got_width is another returned value. It is the width in screen cells of
1630 * the string from str to the returned pointer. It will be the same as
1631 * want_width if there are enough characters in the str to do that
1632 * and if the character widths hit the width exactly. It will be less
1633 * than want_width if we run out of characters or if the next character
1634 * width would skip over the width we want, because it is double width.
1636 char *
1637 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1639 int this_width;
1640 unsigned width_consumed = 0;
1641 UCS ucs;
1642 unsigned long remaining_octets;
1643 char *readptr;
1644 char *retptr;
1646 retptr = readptr = str;
1648 remaining_octets = readptr ? strlen(readptr) : 0;
1650 while(width_consumed <= want_width && remaining_octets > 0){
1652 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1654 if(ucs & U8G_ERROR || ucs == UBOGON){
1656 * This should not happen, but do something to handle it anyway.
1657 * Treat each character as a single width character, which is what should
1658 * probably happen when we actually go to write it out.
1660 remaining_octets--;
1661 readptr++;
1662 this_width = 1;
1664 else{
1665 this_width = wcellwidth(ucs);
1668 * If this_width is -1 that means we can't print this character
1669 * with our current locale. Writechar will print a '?'.
1671 if(this_width < 0)
1672 this_width = 1;
1675 if(width_consumed + (unsigned) this_width <= want_width){
1676 width_consumed += (unsigned) this_width;
1677 retptr = readptr;
1679 else
1680 remaining_octets = 0; /* we're done */
1683 if(got_width)
1684 *got_width = width_consumed;
1686 return(retptr);
1691 * Copy a null terminator into a UTF-8 string in place so that the string is
1692 * no more than a certain screen width wide. If the string is already less
1693 * than or equal in width to the requested width, no change is made.
1695 * The actual width accomplished is returned. Note that it may be less than
1696 * max_width due to double width characters as well as due to the fact that
1697 * it fits wholly in the max_width.
1699 * Returned value is the actual screen width of str when done.
1701 * A side effect is that a terminating null may have been written into
1702 * the passed in string.
1704 unsigned
1705 utf8_truncate(char *str, unsigned max_width)
1707 int this_width;
1708 unsigned width_consumed = 0;
1709 UCS ucs;
1710 unsigned long remaining_octets;
1711 char *readptr, *savereadptr;
1713 readptr = str;
1715 remaining_octets = readptr ? strlen(readptr) : 0;
1717 if(readptr){
1718 while(width_consumed <= max_width && remaining_octets > 0){
1720 savereadptr = readptr;
1721 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1723 if(ucs & U8G_ERROR || ucs == UBOGON){
1725 * This should not happen, but do something to handle it anyway.
1726 * Treat each character as a single width character, which is what should
1727 * probably happen when we actually go to write it out.
1729 remaining_octets--;
1730 readptr++;
1731 this_width = 1;
1733 else{
1734 this_width = wcellwidth(ucs);
1737 * If this_width is -1 that means we can't print this character
1738 * with our current locale. Writechar will print a '?'.
1740 if(this_width < 0)
1741 this_width = 1;
1744 if(width_consumed + (unsigned) this_width <= max_width){
1745 width_consumed += (unsigned) this_width;
1747 else{
1748 remaining_octets = 0; /* we're done */
1749 *savereadptr = '\0';
1754 return(width_consumed);
/*
 * Copy UTF-8 characters from src into dst so that the result is
 * want_width screen cells wide in the current locale.
 * If src doesn't have enough characters to reach want_width, pad with
 * spaces: on the right when left_adjust is set, on the left otherwise.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be
 * written to dst. Dst will be null terminated if there is enough room,
 * but not if that would overflow dst's len. When space is short only as
 * many pad characters as fit are added.
 *
 * Returned value is the number of bytes written to dst, not including
 * the possible terminating null. A NULL dst writes nothing and returns 0.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
                  char *src,		/* source string */
                  size_t dstlen,	/* space in dst */
                  unsigned want_width,	/* desired screen width */
                  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, len_left;
    size_t   need_more, howmany;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* cells still missing, limited by the bytes left in dst */
    need_more = (got_width < want_width) ? (size_t) (want_width - got_width) : 0;
    howmany = (need_more < len_left) ? need_more : len_left;

    if(howmany > 0){
        if(left_adjust){
            /* text stays put, spaces go after it */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /* slide existing text right (regions overlap), spaces go in front */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1828 * Str is a UTF-8 string.
1829 * Start_here is a pointer into the string. It points one position past
1830 * the last byte that should be considered a part of the length string.
1831 * Count back want_width screencell positions and return a pointer to the
1832 * start of the string that is want_width wide and ends with start_here.
1834 * Since characters may be more than one cell width wide we may end up
1835 * skipping over the exact width. That is, if we need to we'll go back
1836 * too far (by one cell width). Account for that in the call by looking
1837 * at got_width.
1839 * Note that this call gives a possible got_width == want_width+1 as
1840 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1841 * That was just what was needed at the time, maybe it needs to be
1842 * optional.
1844 char *
1845 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1847 unsigned width_consumed = 0;
1848 int this_width;
1849 UCS ucs;
1850 unsigned long remaining_octets;
1851 char *ptr, *savereadptr, *goodreadptr;
1853 savereadptr = start_here;
1854 goodreadptr = start_here;
1856 for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1858 savereadptr = ptr;
1859 remaining_octets = goodreadptr - ptr;
1860 ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1862 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1863 if(remaining_octets > 0){
1865 * This means there are some bad octets after this good
1866 * character so things are not going to work out well.
1867 * Bail out.
1869 savereadptr = str; /* we're done */
1871 else{
1872 this_width = wcellwidth(ucs);
1875 * If this_width is -1 that means we can't print this character
1876 * with our current locale. Writechar will print a '?'.
1878 if(this_width < 0)
1879 this_width = 1;
1881 width_consumed += (unsigned) this_width;
1882 goodreadptr = savereadptr;
1887 if(got_width)
1888 *got_width = width_consumed;
1890 return(savereadptr);
/*----------------------------------------------------------------------
   Append s to the text at *d, leaving *d pointing at the end of what
   was copied.

   At most n characters are copied. If the end of s is reached within n
   characters the terminating null is copied too and *d is left pointing
   at it. If n runs out first, no null is written and *d points just past
   the last character copied.

   The point is to avoid walking the destination twice when appending
   repeatedly (i.e., strcpy(t, x); t += strlen(t)).

   This doesn't really belong here but it is used here.
  ----*/
void
sstrncpy(char **d, char *s, int n)
{
    int room;

    for(room = n; room > 0; room--){
        **d = *s++;
        if(**d == '\0')
          break;

        (*d)++;
    }
}
1912 * If use_system_routines is set then NULL is the return value and it is
1913 * not an error. Display_charmap and keyboard_charmap should come over as
1914 * malloced strings and will be filled in with the result.
1916 * Returns a void pointer to the input_cs CHARSET which is
1917 * passed to mbtow via kbseq().
1918 * If !use_system_routines && NULL is returned, that is an error and err should
1919 * have a message.
1920 * display_charmap and keyboard_charmap should be malloced data and may be
1921 * realloced and changed here.
1924 setup_for_input_output(int use_system_routines, char **display_charmap,
1925 char **keyboard_charmap, void **input_cs_arg, char **err)
1927 const CHARSET *cs;
1928 const CHARSET *input_cs = NULL;
1929 int already_tried = 0;
1930 int supported = 0;
1931 char buf[1000];
1933 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1935 if(err)
1936 *err = NULL;
1938 if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1939 *err = cpstr("Bad call to setup_for_input_output");
1940 return(-1);
1943 if(use_system_routines){
1944 #if PREREQ_FOR_SYS_TRANSLATION
1945 char *dcm;
1947 dcm = nl_langinfo_codeset_wrapper();
1948 dcm = dcm ? dcm : "US-ASCII";
1950 init_utf8_display(0, NULL);
1951 if(*display_charmap){
1952 if(dcm && strucmp(*display_charmap, dcm)){
1953 snprintf(buf, sizeof(buf),
1954 _("Display character set \"%s\" is ignored when using system translation"),
1955 *display_charmap);
1957 *err = cpstr(buf);
1960 fs_give((void **) display_charmap);
1963 if(*keyboard_charmap){
1964 if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
1965 snprintf(buf, sizeof(buf),
1966 _("Keyboard character set \"%s\" is ignored when using system translation"),
1967 *keyboard_charmap);
1969 *err = cpstr(buf);
1972 fs_give((void **) keyboard_charmap);
1975 *display_charmap = cpstr(dcm);
1976 *keyboard_charmap = cpstr(dcm);
1977 #else
1978 *err = cpstr("Bad call to setup_for_input_output");
1979 #endif
1981 *input_cs_arg = NULL;
1982 return(0);
1986 try_again1:
1987 if(!(*display_charmap))
1988 *display_charmap = cpstr("US-ASCII");
1990 if(!(*keyboard_charmap))
1991 *keyboard_charmap = cpstr(*display_charmap);
1993 if(*keyboard_charmap){
1994 supported = input_charset_is_supported(*keyboard_charmap);
1996 if(supported){
1997 if(!strucmp(*keyboard_charmap, "utf-8"))
1998 input_cs = utf8_charset(*keyboard_charmap);
1999 else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2000 input_cs = cs;
2002 else{
2003 if(err && !*err){
2004 int iso2022jp = 0;
2006 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2007 iso2022jp = 1;
2009 snprintf(buf, sizeof(buf),
2010 /* TRANSLATORS: The first argument is the name of the character
2011 set the user is trying to use (which is unsupported by alpine).
2012 The second argument is " (except for posting)" if they are
2013 trying to use ISO-2022-JP for something other than posting. */
2014 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2015 *keyboard_charmap,
2016 iso2022jp ? _(" (except for posting)") : "");
2018 *err = cpstr(buf);
2021 input_cs = NULL;
2022 fs_give((void **) keyboard_charmap);
2023 *keyboard_charmap = cpstr("US-ASCII");
2024 if(!already_tried){
2025 already_tried++;
2026 goto try_again1;
2032 try_again2:
2033 if(!(*display_charmap))
2034 *display_charmap = cpstr("US-ASCII");
2036 if(*display_charmap){
2037 supported = output_charset_is_supported(*display_charmap);
2038 if(supported){
2039 if(!strucmp(*display_charmap, "utf-8"))
2040 init_utf8_display(1, NULL);
2041 else if((cs = utf8_charset(*display_charmap)) != NULL)
2042 init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2044 else{
2045 if(err && !*err){
2046 int iso2022jp = 0;
2048 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2049 iso2022jp = 1;
2051 snprintf(buf, sizeof(buf),
2052 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2053 *display_charmap,
2054 iso2022jp ? _(" (except for posting)") : "");
2056 *err = cpstr(buf);
2059 fs_give((void **) display_charmap);
2060 if(!already_tried){
2061 already_tried++;
2062 goto try_again2;
2066 else{
2067 if(err && !*err)
2068 *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2071 #undef cpstr
2073 *input_cs_arg = (void *) input_cs;
2075 return(0);
2080 input_charset_is_supported(char *input_charset)
2082 const CHARSET *cs;
2084 if(!(input_charset && *input_charset))
2085 return 0;
2087 if(!strucmp(input_charset, "utf-8"))
2088 return 1;
2090 if((cs = utf8_charset(input_charset)) != NULL){
2093 * This was true 2006-09-25.
2095 switch(cs->type){
2096 case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2097 case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2098 case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2099 case CT_UCS4: case CT_UTF16:
2100 return 1;
2101 break;
2103 default:
2104 break;
2108 return 0;
2113 output_charset_is_supported(char *output_charset)
2115 const CHARSET *cs;
2117 if(!(output_charset && *output_charset))
2118 return 0;
2120 if(!strucmp(output_charset, "utf-8"))
2121 return 1;
2123 if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2124 return 1;
2126 return 0;
/*
 * Returns non-zero if posting_charset can be used for posting: it is
 * ISO-2022-JP or anything output_charset_is_supported() accepts.
 * A NULL or empty name gives 0.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(posting_charset == NULL || *posting_charset == '\0')
      return(0);

    if(strucmp(posting_charset, "ISO-2022-JP") == 0)
      return(1);

    return(output_charset_is_supported(posting_charset) != 0);
}
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * The result is a name output_charset_is_supported() accepts, or NULL if
 * no such name could be worked out. It may point at static storage, so
 * copy it if it needs to survive another call.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        /* NOTE(review): "EUCKP"/"EUC-KP" may have been meant as EUCKR/EUC-KR - confirm */
        else if(!strucmp(ret, "EUCKP"))
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else if(strstr(ret, "8859")){
            char *p;

            /* check for digits after 8859 */
            p = strstr(ret, "8859");
            p += 4;

            /*
             * Allow one separator character (as in "8859-1"), but never
             * step over the terminating null when the name ends in "8859".
             * The casts keep isdigit() defined for bytes with the high
             * bit set.
             */
            if(*p && !isdigit((unsigned char) *p))
              p++;

            if(isdigit((unsigned char) *p)){
                static char buf[12];

                /* buf is zeroed first so the result is always terminated */
                memset(buf, 0, sizeof(buf));
                strncpy(buf, "ISO-8859-", sizeof(buf));
                buf[9] = *p++;
                if(isdigit((unsigned char) *p))
                  buf[10] = *p;

                ret = buf;
            }
        }
    }

    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2207 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2208 * needed the return value will point to orig. If a conversion is done,
2209 * the return string should be freed by the caller.
2210 * If not possible, returns NULL.
2212 char *
2213 utf8_to_charset(char *orig, char *charset, int report_err)
2215 SIZEDTEXT src, dst;
2216 char *ret = orig;
2218 if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2219 return ret;
2221 src.size = strlen(orig);
2222 src.data = (unsigned char *) orig;
2224 if(!strucmp(charset, "us-ascii")){
2225 size_t i;
2227 for(i = 0; i < src.size; i++)
2228 if(src.data[i] & 0x80)
2229 return NULL;
2231 return ret;
2235 * This works for ISO-2022-JP because of special code in utf8_cstext
2236 * but not for other 2022 charsets.
2238 memset(&dst, 0, sizeof(dst));
2239 if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2240 ret = (char *) dst.data; /* c-client already null terminates it */
2241 else
2242 ret = NULL;
2244 if((unsigned char *) ret != dst.data && dst.data)
2245 fs_give((void **) &dst.data);
2247 return ret;
/*
 * Turn a number into a string with commas between groups of three digits.
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas.
 *         Negative numbers get a single leading '-'.
 *         Three static buffers are used in rotation, so up to 3 comatose
 *         results can be in use at once.
 */
char *
comatose(long int number)
{
    static char   buf[3][50];
    static int    whichbuf = 0;
    char          digits[32];	/* plain decimal digits of the magnitude */
    unsigned long magnitude;
    size_t        ndigits, i;
    char         *b, *last;

    whichbuf = (whichbuf + 1) % 3;
    b = buf[whichbuf];
    last = b + sizeof(buf[0]) - 1;	/* reserved for the terminating null */

    /*
     * Work on the magnitude so the sign is emitted exactly once and the
     * grouping doesn't depend on how wide a long is. Negating in unsigned
     * arithmetic is well defined even for LONG_MIN.
     */
    magnitude = (number < 0) ? 0UL - (unsigned long) number
                             : (unsigned long) number;

    snprintf(digits, sizeof(digits), "%lu", magnitude);
    ndigits = strlen(digits);

    if(number < 0 && b < last)
      *b++ = '-';

    for(i = 0; i < ndigits && b < last; i++){
        /* a comma goes wherever a multiple of three digits remains */
        if(i > 0 && (ndigits - i) % 3 == 0){
            *b++ = ',';
            if(b >= last)
              break;
        }

        *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
/*
 * Like comatose() but leave out the commas.
 *
 * Result: pointer to a static string holding number in decimal.
 *         Three static buffers are used in rotation, so up to 3 results
 *         can be in use at once.
 */
char *
tose(long int number)
{
    static char buf[3][50];
    static int  whichbuf = 0;
    char       *slot;

    whichbuf = (whichbuf + 1) % 3;
    slot = buf[whichbuf];

    snprintf(slot, sizeof(buf[0]), "%ld", number);

    return(slot);
}
2313 * line_paint - where the real work of managing what is displayed gets done.
2315 void
2316 line_paint(int offset, /* current dot offset into vl */
2317 struct display_line *displ,
2318 int *passwd) /* flag to hide display of chars */
2320 int i, w, w2, already_got_one = 0;
2321 int vfirst, vlast, dfirst, dlast, vi, di;
2322 int new_vbase;
2323 unsigned (*width_a_to_b)(UCS *, int, int);
2326 * Set passwd to 10 in caller if you want to conceal the
2327 * password but not print asterisks for feedback.
2329 * Set passwd to 1 in caller to conceal by printing asterisks.
2331 if(passwd && *passwd >= 10){ /* don't show asterisks */
2332 if(*passwd > 10)
2333 return;
2334 else
2335 *passwd = 11; /* only blat once */
2337 i = 0;
2338 (*displ->movecursor)(displ->row, displ->col);
2339 while(i++ <= displ->dwid)
2340 (*displ->writechar)(' ');
2342 (*displ->movecursor)(displ->row, displ->col);
2343 return;
2346 if(passwd && *passwd)
2347 width_a_to_b = single_width_chars_a_to_b;
2348 else
2349 width_a_to_b = ucs4_str_width_a_to_b;
2352 * vl is the virtual line (the actual data). We operate on it by typing
2353 * characters to be added and deleting and so forth. In this routine we
2354 * copy a subset of those UCS-4 characters in vl into dl, the display
2355 * array, and show that subset on the screen.
2357 * Offset is the location of the cursor in vl.
2359 * We will display the string starting from vbase.
2360 * We have dwid screen cells to work in.
2361 * We may have to adjust vbase in order to display the
2362 * part of the string that contains the cursor.
2364 * We'll make the display look like
2365 * vl a b c d e f g h i j k l m
2366 * xxxxxxxxxxxxx <- width dwid window
2367 * < d e f g h >
2369 * vbase
2370 * The < will be there if vbase > 0.
2371 * The > will be there if the string from vbase to the
2372 * end can't all fit in the window.
2375 memset(displ->dl, 0, displ->dlen * sizeof(UCS));
2378 * Adjust vbase so offset is not out of the window to the right.
2379 * (The +2 in w + 2 is for a possible " >" if the string goes past
2380 * the right hand edge of the window and if the last visible character
2381 * is double wide. We don't want the offset to be under that > character.)
2383 for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
2384 displ->dwid > 1 &&
2385 w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
2386 w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
2388 * offset is off the window to the right
2389 * It looks like a b c d e f g h
2390 * | |
2391 * vbase offset
2392 * and offset is either past the right edge,
2393 * or right at the right edge (and maybe under >),
2394 * or one before right at the edge (and maybe on space
2395 * for half a character).
2397 * Since the characters may be double width it is slightly
2398 * complicated to figure out how far to increase vbase.
2399 * We're going to scoot over past width w/2 characters and
2400 * then see if that's sufficient.
2402 new_vbase = displ->vbase + 1;
2403 for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
2404 w2 < displ->dwid/2;
2405 w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
2406 new_vbase++;
2408 displ->vbase = new_vbase;
2411 /* adjust so offset is not out of the window to the left */
2412 while(displ->vbase > 0 && displ->vbase >= offset){
2413 /* add about dwid/2 more width */
2414 new_vbase = displ->vbase - 1;
2415 for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
2416 w2 < (displ->dwid+1)/2 && new_vbase > 0;
2417 w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
2418 new_vbase--;
2420 /* but don't let it get too small, recheck off right end */
2421 for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
2422 w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
2423 w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
2424 new_vbase++;
2426 displ->vbase = MAX(new_vbase, 0);
2429 if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
2430 displ->vbase = 0;
2432 vfirst = displ->vbase;
2433 dfirst = 0;
2434 if(displ->vbase > 0){ /* off screen cue left */
2435 dfirst = 1; /* index which matches vfirst */
2436 displ->dl[0] = '<';
2439 vlast = displ->vused-1; /* end */
2440 w = (*width_a_to_b)(displ->vl, vfirst, vlast);
2442 if(displ->dwid > 0 && w + dfirst > displ->dwid){ /* off window right */
2444 /* find last ucs character to be printed */
2445 while(w + dfirst > displ->dwid - 1) /* -1 for > */
2446 w = (*width_a_to_b)(displ->vl, vfirst, --vlast);
2448 /* worry about double-width characters */
2449 if(w + dfirst == displ->dwid - 1){ /* no prob, hit it exactly */
2450 dlast = dfirst + vlast - vfirst + 1; /* +1 for > */
2451 displ->dl[dlast] = '>';
2453 else{
2454 dlast = dfirst + vlast - vfirst + 1;
2455 displ->dl[dlast++] = ' ';
2456 displ->dl[dlast] = '>';
2459 else
2460 dlast = dfirst + vlast - vfirst;
2463 * Copy the relevant part of the virtual line into the display line.
2465 for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
2466 if(passwd && *passwd)
2467 displ->dl[di] = '*'; /* to conceal password */
2468 else
2469 displ->dl[di] = displ->vl[vi];
2472 * Add spaces to clear the rest of the line.
2473 * We have dwid total space to fill.
2475 w = (*width_a_to_b)(displ->dl, 0, dlast); /* width through dlast */
2476 for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
2477 displ->dl[di++] = ' ';
2480 * Draw from left to right, skipping until we get to
2481 * something that is different. Characters may be different
2482 * widths than they were initially so paint from there the
2483 * rest of the way.
2485 for(di = 0; displ->dl[di]; di++){
2486 if(already_got_one || displ->dl[di] != displ->olddl[di]){
2487 /* move cursor first time */
2488 if(!already_got_one++){
2489 w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
2490 (*displ->movecursor)(displ->row, displ->col + w);
2493 (*displ->writechar)(displ->dl[di]);
2494 displ->olddl[di] = displ->dl[di];
2498 memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));
2501 * Move the cursor to the offset.
2503 * The offset is relative to the start of the virtual array. We need
2504 * to find the location on the screen. The offset into the display array
2505 * will be offset-vbase+dfirst. We want to be at the start of that
2506 * character, so we need to find the width of all the characters up
2507 * to that point.
2509 w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;
2511 (*displ->movecursor)(displ->row, displ->col + w);
2516 * This is just like ucs4_str_width_a_to_b() except all of the characters
2517 * are assumed to be of width 1. This is for printing out *'s when user
2518 * enters a password, while still managing to use the same code to do the
2519 * display.
2521 unsigned
2522 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2524 unsigned width = 0;
2525 int i;
2527 if(ucsstr)
2528 for(i = a; i <= b && ucsstr[i]; i++)
2529 width++;
2531 return width;