* New version 2.26
[alpine.git] / pith / charset.c
blobca2e627272b3a526c87ef5631fb42183b55a4c47
1 /*
2 * ========================================================================
3 * Copyright 2013-2022 Eduardo Chappa
4 * Copyright 2006-2008 University of Washington
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * ========================================================================
15 #include "../pith/headers.h"
16 #include "../pith/charset.h"
17 #include "../pith/state.h"
18 #include "../pith/conf.h"
19 #include "../pith/escapes.h"
20 #include "../pith/mimedesc.h"
21 #include "../pith/filter.h"
22 #include "../pith/string.h"
23 #include "../pith/options.h"
/*
 * Internal prototypes
 */
int            rfc1522_token(char *s, int (*valid)(int), char *end_str, char **endp);
int            rfc1522_valtok(int c);
int            rfc1522_valenc(int c);
int            rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp);
void           rfc1522_copy_and_transliterate(unsigned char *rv, unsigned char **d,
					      size_t len, unsigned char *s,
					      unsigned long l, char *cset);
unsigned char *rfc1522_encoded_word(unsigned char *s, int enc, char *charset);
char          *rfc1522_8bit(void *src, int slen);
char          *rfc1522_binary(void *src, int srcl);
40 char *
41 body_charset(MAILSTREAM *stream, long int msgno, unsigned char *section)
43 BODY *body;
44 char *charset;
47 if((body = mail_body(stream, msgno, section)) && body->type == TYPETEXT){
48 if(!(charset = parameter_val(body->parameter, "charset")))
49 charset = cpystr("US-ASCII");
51 return(charset);
54 return(NULL);
/*
 * Copies the source string into allocated space with the 8-bit EUC codes
 * (on Unix) or the Shift-JIS (on PC) converted into ISO-2022-JP.
 * Caller is responsible for freeing the result.
 *
 * Bytes with the high bit set are emitted with that bit cleared, bracketed
 * by ESC $ B ... ESC ( B.  Returns NULL when src is NULL.
 */
unsigned char *
trans_euc_to_2022_jp(unsigned char *src)
{
    size_t	   srclen, bufsize;
    unsigned char *out, *in, *w;
    int		   in_kanji = 0;	/* currently between ESC $ B and ESC ( B */
    int		   lead = -1;		/* first byte of a Shift-JIS pair */

    if(src == NULL)
      return(NULL);

    srclen = strlen((char *) src);

    /*
     * Worst case is 8-bit and 7-bit characters alternating: every pair
     * of source bytes then costs two escape sequences, i.e. six extra
     * bytes.  These strings are short, so the slack does not matter.
     */
    bufsize = srclen + 1 + ((srclen + 1) / 2) * 6;
    out	    = (unsigned char *) fs_get(bufsize * sizeof(char));
    w	    = out;

    for(in = src; *in != '\0'; in++){
	if(!in_kanji){
	    if(*in & 0x80){		/* first 8-bit byte: shift in */
		*w++ = '\033';
		*w++ = '$';
		*w++ = 'B';
		*w++ = (*in & 0x7f);
		in_kanji = 1;
	    }
	    else{
		*w++ = (*in);
	    }

	    continue;
	}

	if(lead >= 0){
	    /*
	     * Second byte of a Shift-JIS pair.  Nothing in this function
	     * ever stores a non-negative value in lead, so as written this
	     * branch is not taken.
	     */
	    int adjust	   = *in < 159;
	    int rowOffset  = lead < 160 ? 112 : 176;
	    int cellOffset = adjust ? (*in > 127 ? 32 : 31) : 126;

	    *w++ = ((lead - rowOffset) << 1) - adjust;
	    *w++ = *in - cellOffset;
	    lead = -1;
	}
	else if(*in & 0x80){		/* still 8-bit: strip high bit */
	    *w++ = (*in & 0x7f);
	}
	else{				/* back to 7-bit: shift out */
	    *w++ = '\033';
	    *w++ = '(';
	    *w++ = 'B';
	    *w++ = (*in);
	    lead = -1;
	    in_kanji = 0;
	}
    }

    if(in_kanji){			/* never leave the string shifted in */
	*w++ = '\033';
	*w++ = '(';
	*w++ = 'B';
    }

    *w = '\0';

    return(out);
}
/*
 *  * * * * * * * *      RFC 1522 support routines      * * * * * * * *
 *
 *   RFC 1522 support is *very* loosely based on code contributed
 *   by Lars-Erik Johansson <lej@cdg.chalmers.se>.  Thanks to Lars-Erik,
 *   and apologies for taking such liberties with his code.
 */

#define	RFC1522_INIT	"=?"
#define	RFC1522_INIT_L	2
#define RFC1522_TERM	"?="
#define	RFC1522_TERM_L	2
#define	RFC1522_DLIM	"?"
#define	RFC1522_DLIM_L	1
#define	RFC1522_MAXW	75	/* RFC's say 75, but no senders seem to care*/
#define	ESPECIALS	"()<>@,;:\"/[]?.="

/*
 * Bytes of an encoded-word that are not encoded text: the intro, the
 * terminator, two delimiters, the charset name and the one-character
 * encoding tag.  Deliberately has no trailing semicolon so that it can be
 * used inside a larger expression.
 */
#define	RFC1522_OVERHEAD(S)	(RFC1522_INIT_L + RFC1522_TERM_L +	\
				 (2 * RFC1522_DLIM_L) + strlen(S) + 1)

/* true if C has to be written as =XX in a 'Q' encoded word */
#define	RFC1522_ENC_CHAR(C)	(((C) & 0x80) || !rfc1522_valtok(C)	\
				 || (C) == '_' )
156 * rfc1522_decode_to_utf8 - try to decode the given source string ala RFC 2047
157 * (obsoleted RFC 1522) into the given destination buffer,
158 * encoded in UTF-8.
160 * How large should d be? The decoded string of octets will fit in
161 * the same size string as the source string. However, because we're
162 * translating that into UTF-8 the result may expand. Currently the
163 * Thai character set has single octet characters which expand to
164 * three octets in UTF-8. So it would be safe to use 3 * strlen(s)
165 * for the size of d. One can imagine a currently non-existent
166 * character set that expanded to 4 octets instead, so use 4 to be
167 * super safe.
169 * Returns: pointer to either the destination buffer containing the
170 * decoded text, or a pointer to the source buffer if there was
171 * no valid 'encoded-word' found during scanning.
173 unsigned char *
174 rfc1522_decode_to_utf8(unsigned char *d, size_t len, char *s)
176 unsigned char *rv = NULL, *p;
177 char *start = s, *sw, *enc, *txt, *ew, **q, *lang;
178 char *cset;
179 unsigned long l;
180 int i;
182 *d = '\0'; /* init destination */
184 while(s && (sw = strstr(s, RFC1522_INIT))){
185 if(!rv) /* there's something to do, init it */
186 rv = d;
187 /* validate the rest of the encoded-word */
188 if(rfc1522_valid(sw, &cset, &enc, &txt, &ew)){
190 * We may have been putting off copying the first part of the
191 * source while waiting to see if we have to copy at all.
193 if(rv == d && s != start){
194 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) start,
195 sw - start, NULL);
196 s = sw;
199 /* copy everything between s and sw to destination */
200 for(i = 0; &s[i] < sw; i++)
201 if(!isspace((unsigned char)s[i])){ /* if some non-whitespace */
202 while(s < sw && d-rv<len-1)
203 *d++ = (unsigned char) *s++;
205 break;
208 enc[-1] = txt[-1] = ew[0] = '\0'; /* tie off token strings */
210 if((lang = strchr(cset, '*')) != NULL)
211 *lang++ = '\0';
213 /* based on encoding, write the encoded text to output buffer */
214 switch(*enc){
215 case 'Q' : /* 'Q' encoding */
216 case 'q' :
217 /* special hocus-pocus to deal with '_' exception, too bad */
218 for(l = 0L, i = 0; txt[l]; l++)
219 if(txt[l] == '_')
220 i++;
222 if(i){
223 q = (char **) fs_get((i + 1) * sizeof(char *));
224 for(l = 0L, i = 0; txt[l]; l++)
225 if(txt[l] == '_'){
226 q[i++] = &txt[l];
227 txt[l] = SPACE;
230 q[i] = NULL;
232 else
233 q = NULL;
235 if((p = rfc822_qprint((unsigned char *)txt, strlen(txt), &l)) != NULL){
236 rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
237 fs_give((void **)&p); /* free encoded buf */
239 else{
240 if(q)
241 fs_give((void **) &q);
243 goto bogus;
246 if(q){ /* restore underscores */
247 for(i = 0; q[i]; i++)
248 *(q[i]) = '_';
250 fs_give((void **)&q);
253 break;
255 case 'B' : /* 'B' encoding */
256 case 'b' :
257 if((p = rfc822_base64((unsigned char *) txt, strlen(txt), &l)) != NULL){
258 rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
259 fs_give((void **)&p); /* free encoded buf */
261 else
262 goto bogus;
264 break;
266 default:
267 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) txt,
268 strlen(txt), NULL);
269 dprint((1, "RFC1522_decode: Unknown ENCODING: %s\n",
270 enc ? enc : "?"));
271 break;
274 /* restore trompled source string */
275 enc[-1] = txt[-1] = '?';
276 ew[0] = RFC1522_TERM[0];
278 /* advance s to start of text after encoded-word */
279 s = ew + RFC1522_TERM_L;
281 if(lang)
282 lang[-1] = '*';
284 else{
286 * Found intro, but bogus data followed, treat it as normal text.
288 l = (sw - s) + RFC1522_INIT_L;
289 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, l, NULL);
290 for(; isspace((unsigned char) *(s+l)) && d-rv<len-1;l++)
291 *d++ = *(s+l); /* copy any trailing space */
292 rv[len-1] = '\0';
293 *d = '\0';
294 s += l;
298 if(rv){
299 if(s && *s){ /* copy remaining text */
300 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
301 rv[len-1] = '\0';
304 else if(s){
305 rv = d;
306 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
307 rv[len-1] = '\0';
310 return(rv ? rv : (unsigned char *) start);
312 bogus:
313 dprint((1, "RFC1522_decode: BOGUS INPUT: -->%s<--\n",
314 start ? start : "?"));
315 return((unsigned char *) start);
320 * rfc1522_token - scan the given source line up to the end_str making
321 * sure all subsequent chars are "valid" leaving endp
322 * a the start of the end_str.
323 * Returns: TRUE if we got a valid token, FALSE otherwise
326 rfc1522_token(char *s, int (*valid) (int), char *end_str, char **endp)
328 while(*s){
329 if((char) *s == *end_str /* test for matching end_str */
330 && ((end_str[1])
331 ? !strncmp((char *)s + 1, end_str + 1, strlen(end_str + 1))
332 : 1)){
333 *endp = s;
334 return(TRUE);
337 if(!(*valid)(*s++)) /* test for valid char */
338 break;
341 return(FALSE);
346 * rfc1522_valtok - test for valid character in the RFC 1522 encoded
347 * word's charset and encoding fields.
350 rfc1522_valtok(int c)
352 return(!(c == SPACE || iscntrl(c & 0x7f) || strindex(ESPECIALS, c)));
357 * rfc1522_valenc - test for valid character in the RFC 1522 encoded
358 * word's encoded-text field.
361 rfc1522_valenc(int c)
363 return(!(c == '?' || c == SPACE) && isprint((unsigned char)c));
368 * rfc1522_valid - validate the given string as to it's rfc1522-ness
371 rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp)
373 char *c, *e, *t, *p;
374 int rv;
376 rv = rfc1522_token(c = s+RFC1522_INIT_L, rfc1522_valtok, RFC1522_DLIM, &e)
377 && rfc1522_token(++e, rfc1522_valtok, RFC1522_DLIM, &t)
378 && rfc1522_token(++t, rfc1522_valenc, RFC1522_TERM, &p);
380 if(charset)
381 *charset = c;
383 if(enc)
384 *enc = e;
386 if(txt)
387 *txt = t;
389 if(endp)
390 *endp = p;
392 return(rv);
397 * rfc1522_copy_and_transliterate - copy given buf to destination buffer
398 * as UTF-8 characters
400 void
401 rfc1522_copy_and_transliterate(unsigned char *rv,
402 unsigned char **d,
403 size_t len,
404 unsigned char *s,
405 unsigned long l,
406 char *cset)
408 unsigned long i;
409 SIZEDTEXT src, xsrc;
411 src.data = s;
412 src.size = l;
413 memset(&xsrc, 0, sizeof(SIZEDTEXT));
415 /* transliterate decoded segment to utf-8 */
416 if(cset){
417 if(strucmp((char *) cset, "us-ascii")
418 && strucmp((char *) cset, "utf-8")){
419 if(utf8_charset(cset)){
420 if(!utf8_text(&src, cset, &xsrc, 0L)){
421 /* should not happen */
422 alpine_panic("c-client failed to transliterate recognized characterset");
425 else{
426 /* non-xlatable charset */
427 for(i = 0; i < l; i++)
428 if(src.data[i] & 0x80){
429 xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
430 xsrc.size = l;
431 for(i = 0; i < l; i++)
432 xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
434 break;
439 else{
440 const CHARSET *cs;
442 src.data = s;
443 src.size = strlen((char *) s);
445 if((cs = utf8_infercharset(&src))){
446 if(!(cs->type == CT_ASCII || cs->type == CT_UTF8)){
447 if(!utf8_text_cs(&src, cs, &xsrc, 0L, 0L)){
448 /* should not happen */
449 alpine_panic("c-client failed to transliterate recognized characterset");
453 else if((cset=ps_global->VAR_UNK_CHAR_SET)
454 && strucmp((char *) cset, "us-ascii")
455 && strucmp((char *) cset, "utf-8")
456 && utf8_charset(cset)){
457 if(!utf8_text(&src, cset, &xsrc, 0L)){
458 /* should not happen */
459 alpine_panic("c-client failed to transliterate recognized character set");
462 else{
463 /* unknown bytes - mask off high bit chars */
464 for(i = 0; i < l; i++)
465 if(src.data[i] & 0x80){
466 xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
467 xsrc.size = l;
468 for(i = 0; i < l; i++)
469 xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
471 break;
476 if(xsrc.data){
477 s = xsrc.data;
478 l = xsrc.size;
481 i = MIN(l,len-1-((*d)-rv));
482 strncpy((char *) (*d), (char *) s, i);
483 (*d)[i] = '\0';
484 *d += l; /* advance dest ptr to EOL */
485 if((*d)-rv > len-1)
486 *d = rv+len-1;
488 if(xsrc.data && src.data != xsrc.data)
489 fs_give((void **) &xsrc.data);
495 * rfc1522_encode - encode the given source string ala RFC 1522,
496 * IF NECESSARY, into the given destination buffer.
497 * Don't bother copying if it turns out encoding
498 * isn't necessary.
500 * Returns: pointer to either the destination buffer containing the
501 * encoded text, or a pointer to the source buffer if we didn't
502 * have to encode anything.
504 char *
505 rfc1522_encode(char *d, size_t dlen, unsigned char *s, char *charset)
507 unsigned char *p, *q;
508 int n;
510 if(!s)
511 return((char *) s);
513 if(!charset)
514 charset = UNKNOWN_CHARSET;
516 /* look for a reason to encode */
517 for(p = s, n = 0; *p; p++)
518 if((*p) & 0x80){
519 n++;
521 else if(*p == RFC1522_INIT[0]
522 && !strncmp((char *) p, RFC1522_INIT, RFC1522_INIT_L)){
523 if(rfc1522_valid((char *) p, NULL, NULL, NULL, (char **) &q))
524 p = q + RFC1522_TERM_L - 1; /* advance past encoded gunk */
526 else if(*p == ESCAPE && match_escapes((char *)(p+1))){
527 n++;
530 if(n){ /* found, encoding to do */
531 char *rv = d, *t,
532 enc = (n > (2 * (p - s)) / 3) ? 'B' : 'Q';
534 while(*s){
535 if(d-rv < dlen-1-(RFC1522_INIT_L+2*RFC1522_DLIM_L+1)){
536 sstrncpy(&d, RFC1522_INIT, dlen-(d-rv)); /* insert intro header, */
537 sstrncpy(&d, charset, dlen-(d-rv)); /* character set tag, */
538 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv)); /* and encoding flavor */
539 if(dlen-(d-rv) > 0)
540 *d++ = enc;
542 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));
546 * feed lines to encoder such that they're guaranteed
547 * less than RFC1522_MAXW.
549 p = rfc1522_encoded_word(s, enc, charset);
550 if(enc == 'B') /* insert encoded data */
551 sstrncpy(&d, t = rfc1522_binary(s, p - s), dlen-1-(d-rv));
552 else /* 'Q' encoding */
553 sstrncpy(&d, t = rfc1522_8bit(s, p - s), dlen-1-(d-rv));
555 sstrncpy(&d, RFC1522_TERM, dlen-1-(d-rv)); /* insert terminator */
556 fs_give((void **) &t);
557 if(*p) /* more src string follows */
558 sstrncpy(&d, "\015\012 ", dlen-1-(d-rv)); /* insert cont. line */
560 s = p; /* advance s */
563 rv[dlen-1] = '\0';
564 return(rv);
566 else
567 return((char *) s); /* no work for us here */
573 * rfc1522_encoded_word -- cut given string into max length encoded word
575 * Return: pointer into 's' such that the encoded 's' is no greater
576 * than RFC1522_MAXW
578 * NOTE: this line break code is NOT cognizant of any SI/SO
579 * charset requirements nor similar strategies using escape
580 * codes. Hopefully this will matter little and such
581 * representation strategies don't also include 8bit chars.
583 unsigned char *
584 rfc1522_encoded_word(unsigned char *s, int enc, char *charset)
586 int goal = RFC1522_MAXW - RFC1522_OVERHEAD(charset);
588 if(enc == 'B') /* base64 encode */
589 for(goal = ((goal / 4) * 3) - 2; goal && *s; goal--, s++)
591 else /* special 'Q' encoding */
592 if(!strucmp(charset, "UTF-8")){ /* special handling for utf-8 */
593 int i,more;
594 unsigned char *p;
595 for(; goal && *s; s++){
596 more = *s < 0x80 ? 0
597 : *s < 0xe0 ? 1
598 : *s < 0xf0 ? 2
599 : *s < 0xf8 ? 3
600 : *s < 0xfc ? 4
601 : *s < 0xfe ? 5 : -1;
602 if(more >= 0){ /* check that we have at least more characters */
603 for(p = s, i = 0; i <= more && *p != '\0'; i++, p++)
604 goal -= RFC1522_ENC_CHAR(*p) ? 3 : 1;
605 if(goal < 0) /* does not fit in encoded word */
606 break;
607 s += i - 1; /* i - 1 should be equal to more */
609 else /* encode it, and skip it */
610 if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0)
611 break;
614 else
615 for(; goal && *s; s++)
616 if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0)
617 break;
619 return(s);
625 * rfc1522_8bit -- apply RFC 1522 'Q' encoding to the given 8bit buffer
627 * Return: alloc'd buffer containing encoded string
629 char *
630 rfc1522_8bit(void *src, int slen)
632 char *ret = (char *) fs_get ((size_t) (3*slen + 2));
633 char *d = ret;
634 unsigned char c;
635 unsigned char *s = (unsigned char *) src;
637 while (slen--) { /* for each character */
638 if (((c = *s++) == '\015') && (*s == '\012') && slen) {
639 *d++ = '\015'; /* true line break */
640 *d++ = *s++;
641 slen--;
643 else if(c == SPACE){ /* special encoding case */
644 *d++ = '_';
646 else if(RFC1522_ENC_CHAR(c)){
647 *d++ = '='; /* quote character */
648 C2XPAIR(c, d);
650 else
651 *d++ = (char) c; /* ordinary character */
654 *d = '\0'; /* tie off destination */
655 return(ret);
/*
 * rfc1522_binary -- apply RFC 1522 'B' encoding to the given 8bit buffer
 *
 * Every group of up to three source bytes becomes four output characters;
 * a short final group is padded with '='.
 *
 * Return: alloc'd buffer containing encoded string
 */
char *
rfc1522_binary (void *src, int srcl)
{
    static char *alphabet =
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    unsigned char *in = (unsigned char *) src;
    char	  *ret, *out;

    out = ret = (char *) fs_get ((size_t) ((((srcl + 2) / 3) * 4) + 1));

    while (srcl) {				/* process tuplets */
	unsigned int b0 = in[0], b1, b2;
	int	     have2, have3;

	have2 = (--srcl != 0);			/* is there a second byte? */
	b1    = have2 ? in[1] : 0;

	*out++ = alphabet[b0 >> 2];			  /* high 6 bits (1) */
	*out++ = alphabet[((b0 << 4) + (b1 >> 4)) & 0x3f]; /* low 2 (1), high 4 (2) */

	if (!have2) {				/* one byte only: pad twice */
	    *out++ = '=';
	    *out++ = '=';
	}
	else {
	    have3 = (--srcl != 0);		/* is there a third byte? */
	    b2	  = have3 ? in[2] : 0;

	    *out++ = alphabet[((b1 << 2) + (b2 >> 6)) & 0x3f]; /* low 4 (2), high 2 (3) */

	    if (have3) {
		*out++ = alphabet[b2 & 0x3f];	/* low 6 bits (3) */
		srcl--;				/* count third character */
	    }
	    else
	      *out++ = '=';
	}

	in += 3;
    }

    *out = '\0';				/* tie off string */
    return(ret);				/* return the resulting string */
}
692 * Checks if charset conversion is possible and which quality could be achieved
694 * args: from_cs -- charset to convert from
695 * to_cs -- charset to convert to
697 * Results:
698 * CONV_TABLE->table -- conversion table, NULL if conversion not needed
699 * or not supported
700 * CONV_TABLE->quality -- conversion quality (conversion not supported, not
701 * needed, loses special chars, or loses letters
703 * The other entries of CONV_TABLE are used inside this function only
704 * and may not be used outside unless this documentation is updated.
706 CONV_TABLE *
707 conversion_table(char *from_cs, char *to_cs)
709 int i, j;
710 unsigned char *p = NULL;
711 unsigned short *fromtab, *totab;
712 CONV_TABLE *ct = NULL;
713 const CHARSET *from, *to;
714 static CONV_TABLE null_tab;
716 if(!(from_cs && *from_cs && to_cs && *to_cs) || !strucmp(from_cs, to_cs)){
717 memset(&null_tab, 0, sizeof(null_tab));
718 null_tab.quality = CV_NO_TRANSLATE_NEEDED;
719 return(&null_tab);
723 * First check to see if we are already set up for this pair of charsets.
725 if((ct = ps_global->conv_table) != NULL
726 && ct->from_charset && ct->to_charset
727 && !strucmp(ct->from_charset, from_cs)
728 && !strucmp(ct->to_charset, to_cs))
729 return(ct);
732 * No such luck. Get rid of the cache of the previous translation table
733 * and build a new one.
735 if(ct){
736 if(ct->table && (ct->convert != gf_convert_utf8_charset))
737 fs_give((void **) &ct->table);
739 if(ct->from_charset)
740 fs_give((void **) &ct->from_charset);
742 if(ct->to_charset)
743 fs_give((void **) &ct->to_charset);
745 else
746 ct = ps_global->conv_table = (CONV_TABLE *) fs_get(sizeof(*ct));
748 memset(ct, 0, sizeof(*ct));
750 ct->from_charset = cpystr(from_cs);
751 ct->to_charset = cpystr(to_cs);
752 ct->quality = CV_NO_TRANSLATE_POSSIBLE;
755 * Check to see if a translation is feasible.
757 from = utf8_charset(from_cs);
758 to = utf8_charset(to_cs);
760 if(from && to){ /* if both charsets found */
761 /* no mapping if same or from is ASCII */
762 if((from->type == to->type && from->tab == to->tab)
763 || (from->type == CT_ASCII))
764 ct->quality = CV_NO_TRANSLATE_NEEDED;
765 else switch(from->type){
766 case CT_1BYTE0: /* 1 byte no table */
767 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
768 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
769 switch(to->type){
770 case CT_1BYTE0: /* 1 byte no table */
771 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
772 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
773 ct->quality = (from->script & to->script) ?
774 CV_LOSES_SOME_LETTERS : CV_LOSES_SPECIAL_CHARS;
775 break;
777 break;
778 case CT_UTF8: /* variable UTF-8 encoded Unicode no table */
779 /* If source is UTF-8, see if destination charset has an 8 or 16 bit
780 * coded character set that we can translate to. By special
781 * dispensation, kludge ISO-2022-JP to EUC or Shift-JIS, but don't
782 * try to do any other ISO 2022 charsets or UTF-7.
784 switch (to->type){
785 case CT_SJIS: /* 2 byte Shift-JIS */
786 /* only win if can get EUC-JP chartab */
787 if(utf8_charset("EUC-JP"))
788 ct->quality = CV_LOSES_SOME_LETTERS;
789 break;
790 case CT_ASCII: /* 7-bit ASCII no table */
791 case CT_1BYTE0: /* 1 byte no table */
792 case CT_1BYTE: /* 1 byte ASCII + table 0x80-0xff */
793 case CT_1BYTE8: /* 1 byte table 0x00 - 0xff */
794 case CT_EUC: /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
795 case CT_DBYTE: /* 2 byte ASCII + utf8_eucparam */
796 case CT_DBYTE2: /* 2 byte ASCII + utf8_eucparam plane1/2 */
797 ct->quality = CV_LOSES_SOME_LETTERS;
798 break;
800 break;
803 switch (ct->quality) { /* need to map? */
804 case CV_NO_TRANSLATE_POSSIBLE:
805 case CV_NO_TRANSLATE_NEEDED:
806 break; /* no mapping needed */
807 default: /* do mapping */
808 switch (from->type) {
809 case CT_UTF8: /* UTF-8 to legacy character set */
810 if((ct->table = utf8_rmap (to_cs)) != NULL)
811 ct->convert = gf_convert_utf8_charset;
812 break;
814 case CT_1BYTE0: /* ISO 8859-1 */
815 case CT_1BYTE: /* low part ASCII, high part other */
816 case CT_1BYTE8: /* low part has some non-ASCII */
818 * The fromtab and totab tables are mappings from the 128 character
819 * positions 128-255 to their Unicode values (so unsigned shorts).
820 * The table we are creating is such that if
822 * from_char_value -> unicode_value
823 * to_char_value -> same_unicode_value
825 * then we want to map from_char_value -> to_char_value
827 * To simplify conversions we create the whole 256 element array,
828 * with the first 128 positions just the identity. If there is no
829 * conversion for a particular from_char_value (that is, no
830 * to_char_value maps to the same unicode character) then we put
831 * '?' in that character. We may want to output blob on the PC,
832 * but don't so far.
834 * If fromtab or totab are NULL, that means the mapping is simply
835 * the identity mapping. Since that is still useful to us, we
836 * create it on the fly.
838 fromtab = (unsigned short *) from->tab;
839 totab = (unsigned short *) to->tab;
841 ct->convert = gf_convert_8bit_charset;
842 p = ct->table = (unsigned char *)
843 fs_get(256 * sizeof(unsigned char));
844 for(i = 0; i < 256; i++){
845 unsigned int fc = 0;
846 p[i] = '?';
847 switch(from->type){ /* get "from" UCS-2 codepoint */
848 case CT_1BYTE0: /* ISO 8859-1 */
849 fc = i;
850 break;
851 case CT_1BYTE: /* low part ASCII, high part other */
852 fc = (i < 128) ? i : fromtab[i-128];
853 break;
854 case CT_1BYTE8: /* low part has some non-ASCII */
855 fc = fromtab[i];
856 break;
858 switch(to->type){ /* match against "to" UCS-2 codepoint */
859 case CT_1BYTE0: /* identity match for ISO 8859-1*/
860 if(fc < 256)
861 p[i] = fc;
862 break;
863 case CT_1BYTE: /* ASCII is identity, search high part */
864 if(fc < 128) p[i] = fc;
865 else for(j = 0; j < 128; j++){
866 if(fc == totab[j]){
867 p[i] = 128 + j;
868 break;
871 break;
872 case CT_1BYTE8: /* search all codepoints */
873 for(j = 0; j < 256; j++){
874 if(fc == totab[j]){
875 p[i] = j;
876 break;
879 break;
882 break;
887 return(ct);
892 * Replace personal names in list of addresses with
893 * decoded personal names in UTF-8.
894 * Assumes we can free and reallocate the name.
896 void
897 decode_addr_names_to_utf8(struct mail_address *a)
899 for(; a; a = a->next)
900 if(a->personal)
901 convert_possibly_encoded_str_to_utf8(&a->personal);
/*
 * Strp is a pointer to an allocated string.
 * This routine will convert the string to UTF-8, possibly
 * freeing and re-allocating it.
 * The source string may or may not have RFC1522 encoding
 * which will be undone using rfc1522_decode.
 * The string will have been converted on return.
 *
 * Nothing is done when strp is NULL or the string is NULL or empty.
 */
void
convert_possibly_encoded_str_to_utf8(char **strp)
{
    size_t  bufsize, oldlen, newlen;
    char   *work, *result;

    if(!strp || !*strp || **strp == '\0')
      return;

    /* four times the source length, the worst case expansion */
    bufsize = 4 * strlen(*strp) + 1;
    work    = (char *) fs_get(bufsize);

    result = (char *) rfc1522_decode_to_utf8((unsigned char *) work, bufsize, *strp);

    if(result != (*strp)){		/* decoder produced new text */
	oldlen = strlen(*strp);
	newlen = strlen(result);

	if(oldlen >= newlen){		/* fits in the existing allocation */
	    strncpy(*strp, result, oldlen);
	    (*strp)[oldlen] = '\0';
	}
	else{
	    fs_give((void **) strp);
	    if(result == work){		/* this will be true */
		/* hand the work buffer over, trimmed to size */
		fs_resize((void **) &work, newlen+1);
		*strp = work;
		work  = NULL;
	    }
	    else{			/* this is unreachable */
		*strp = cpystr(result);
	    }
	}
    }
    /* else, the decoder handed the source back: leave it alone */

    if(work)
      fs_give((void **) &work);
}