pith/charset.c

   1 /*
   2  * ========================================================================
   3  * Copyright 2013-2022 Eduardo Chappa
   4  * Copyright 2006-2008 University of Washington
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * ========================================================================
  13  */
  14
  15 #include "../pith/headers.h"
  16 #include "../pith/charset.h"
  17 #include "../pith/state.h"
  18 #include "../pith/conf.h"
  19 #include "../pith/escapes.h"
  20 #include "../pith/mimedesc.h"
  21 #include "../pith/filter.h"
  22 #include "../pith/string.h"
  23 #include "../pith/options.h"
  24
  25
  26 /*
  27  * Internal prototypes
  28  */
  29 int            rfc1522_token(char *, int (*)(int), char *, char **);
  30 int            rfc1522_valtok(int);
  31 int            rfc1522_valenc(int);
  32 int            rfc1522_valid(char *, char **, char **, char **, char **);
  33 void           rfc1522_copy_and_transliterate(unsigned char *, unsigned char **, size_t,
  34                                               unsigned char *, unsigned long, char *);
  35 unsigned char *rfc1522_encoded_word(unsigned char *, int, char *);
  36 char          *rfc1522_8bit(void *, int);
  37 char          *rfc1522_binary(void *, int);
  38
  39
  40 char *
  41 body_charset(MAILSTREAM *stream, long int msgno, unsigned char *section)
  42 {
  43     BODY *body;
  44     char *charset;
  45
  46
  47     if((body = mail_body(stream, msgno, section)) && body->type == TYPETEXT){
  48         if(!(charset = parameter_val(body->parameter, "charset")))
  49           charset = cpystr("US-ASCII");
  50
  51         return(charset);
  52     }
  53
  54     return(NULL);
  55 }
  56
  57
  58 /*
  59  * Copies the source string into allocated space with the 8-bit EUC codes
  60  * (on Unix) or the Shift-JIS (on PC) converted into ISO-2022-JP.
  61  * Caller is responsible for freeing the result.
  62  */
  63 unsigned char *
  64 trans_euc_to_2022_jp(unsigned char *src)
  65 {
  66     size_t len, alloc;
  67     unsigned char *rv, *p, *q;
  68     int    inside_esc_seq = 0;
  69     int    c1 = -1;             /* remembers first of pair for Shift-JIS */
  70
  71     if(!src)
  72       return(NULL);
  73
  74     len = strlen((char *) src);
  75
  76     /*
  77      * Worst possible increase is every other character an 8-bit character.
  78      * In that case, each of those gets 6 extra characters for the escape
  79      * sequences. We're not too concerned about the extra length because
  80      * these are relatively short strings.
  81      */
  82     alloc = len + 1 + ((len+1)/2) * 6;
  83     rv = (unsigned char *) fs_get(alloc * sizeof(char));
  84
  85     for(p = src, q = rv; *p; p++){
  86         if(inside_esc_seq){
  87             if(c1 >= 0){                        /* second of a pair? */
  88                 int adjust = *p < 159;
  89                 int rowOffset = c1 < 160 ? 112 : 176;
  90                 int cellOffset = adjust ? (*p > 127 ? 32 : 31) : 126;
  91
  92                 *q++ = ((c1 - rowOffset) << 1) - adjust;
  93                 *q++ = *p - cellOffset;
  94                 c1 = -1;
  95             }
  96             else if(*p & 0x80){
  97                 *q++ = (*p & 0x7f);
  98             }
  99             else{
 100                 *q++ = '\033';
 101                 *q++ = '(';
 102                 *q++ = 'B';
 103                 *q++ = (*p);
 104                 c1 = -1;
 105                 inside_esc_seq = 0;
 106             }
 107         }
 108         else{
 109             if(*p & 0x80){
 110                 *q++ = '\033';
 111                 *q++ = '$';
 112                 *q++ = 'B';
 113                 *q++ = (*p & 0x7f);
 114                 inside_esc_seq = 1;
 115             }
 116             else{
 117                 *q++ = (*p);
 118             }
 119         }
 120     }
 121
 122     if(inside_esc_seq){
 123         *q++ = '\033';
 124         *q++ = '(';
 125         *q++ = 'B';
 126     }
 127
 128     *q = '\0';
 129
 130     return(rv);
 131 }
 132
 133
 134 /*
 135  *  * * * * * * * *      RFC 1522 support routines      * * * * * * * *
 136  *
 137  *   RFC 1522 support is *very* loosely based on code contributed
 138  *   by Lars-Erik Johansson <lej@cdg.chalmers.se>.  Thanks to Lars-Erik,
 139  *   and apologies for taking such liberties with his code.
 140  */
 141
 142 #define RFC1522_INIT    "=?"
 143 #define RFC1522_INIT_L  2
 144 #define RFC1522_TERM    "?="
 145 #define RFC1522_TERM_L  2
 146 #define RFC1522_DLIM    "?"
 147 #define RFC1522_DLIM_L  1
 148 #define RFC1522_MAXW    75      /* RFC's say 75, but no senders seem to care*/
 149 #define ESPECIALS       "()<>@,;:\"/[]?.="
 150 #define RFC1522_OVERHEAD(S)     (RFC1522_INIT_L + RFC1522_TERM_L +      \
 151                                  (2 * RFC1522_DLIM_L) + strlen(S) + 1);
 152 #define RFC1522_ENC_CHAR(C)     (((C) & 0x80) || !rfc1522_valtok(C)     \
 153                                  || (C) == '_' )
 154
 155 /*
 156  * rfc1522_decode_to_utf8 - try to decode the given source string ala RFC 2047
 157  *                  (obsoleted RFC 1522) into the given destination buffer,
 158  *                  encoded in UTF-8.
 159  *
 160  * How large should d be? The decoded string of octets will fit in
 161  * the same size string as the source string. However, because we're
 162  * translating that into UTF-8 the result may expand. Currently the
 163  * Thai character set has single octet characters which expand to
 164  * three octets in UTF-8. So it would be safe to use 3 * strlen(s)
 165  * for the size of d. One can imagine a currently non-existent
 166  * character set that expanded to 4 octets instead, so use 4 to be
 167  * super safe.
 168  *
 169  * Returns: pointer to either the destination buffer containing the
 170  *          decoded text, or a pointer to the source buffer if there was
 171  *          no valid 'encoded-word' found during scanning.
 172  */
 173 unsigned char *
 174 rfc1522_decode_to_utf8(unsigned char *d, size_t len, char *s)
 175 {
 176     unsigned char *rv = NULL, *p;
 177     char          *start = s, *sw, *enc, *txt, *ew, **q, *lang;
 178     char          *cset;
 179     unsigned long  l;
 180     int            i;
 181
 182     *d = '\0';                                  /* init destination */
 183
 184     while(s && (sw = strstr(s, RFC1522_INIT))){
 185         if(!rv)  /* there's something to do, init it */
 186           rv = d;
 187         /* validate the rest of the encoded-word */
 188         if(rfc1522_valid(sw, &cset, &enc, &txt, &ew)){
 189             /*
 190              * We may have been putting off copying the first part of the
 191              * source while waiting to see if we have to copy at all.
 192              */
 193             if(rv == d && s != start){
 194                 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) start,
 195                                                sw - start, NULL);
 196                 s = sw;
 197             }
 198
 199             /* copy everything between s and sw to destination */
 200             for(i = 0; &s[i] < sw; i++)
 201               if(!isspace((unsigned char)s[i])){ /* if some non-whitespace */
 202                   while(s < sw && d-rv<len-1)
 203                     *d++ = (unsigned char) *s++;
 204
 205                   break;
 206               }
 207
 208             enc[-1] = txt[-1] = ew[0] = '\0';   /* tie off token strings */
 209
 210             if((lang = strchr(cset, '*')) != NULL)
 211               *lang++ = '\0';
 212
 213             /* based on encoding, write the encoded text to output buffer */
 214             switch(*enc){
 215               case 'Q' :                        /* 'Q' encoding */
 216               case 'q' :
 217                 /* special hocus-pocus to deal with '_' exception, too bad */
 218                 for(l = 0L, i = 0; txt[l]; l++)
 219                   if(txt[l] == '_')
 220                     i++;
 221
 222                 if(i){
 223                     q = (char **) fs_get((i + 1) * sizeof(char *));
 224                     for(l = 0L, i = 0; txt[l]; l++)
 225                       if(txt[l] == '_'){
 226                           q[i++] = &txt[l];
 227                           txt[l] = SPACE;
 228                       }
 229
 230                     q[i] = NULL;
 231                 }
 232                 else
 233                   q = NULL;
 234
 235                 if((p = rfc822_qprint((unsigned char *)txt, strlen(txt), &l)) != NULL){
 236                     rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
 237                     fs_give((void **)&p);       /* free encoded buf */
 238                 }
 239                 else{
 240                     if(q)
 241                       fs_give((void **) &q);
 242
 243                     goto bogus;
 244                 }
 245
 246                 if(q){                          /* restore underscores */
 247                     for(i = 0; q[i]; i++)
 248                       *(q[i]) = '_';
 249
 250                     fs_give((void **)&q);
 251                 }
 252
 253                 break;
 254
 255               case 'B' :                        /* 'B' encoding */
 256               case 'b' :
 257                 if((p = rfc822_base64((unsigned char *) txt, strlen(txt), &l)) != NULL){
 258                     rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
 259                     fs_give((void **)&p);       /* free encoded buf */
 260                 }
 261                 else
 262                   goto bogus;
 263
 264                 break;
 265
 266               default:
 267                 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) txt,
 268                                                strlen(txt), NULL);
 269                 dprint((1, "RFC1522_decode: Unknown ENCODING: %s\n",
 270                        enc ? enc : "?"));
 271                 break;
 272             }
 273
 274             /* restore trompled source string */
 275             enc[-1] = txt[-1] = '?';
 276             ew[0]   = RFC1522_TERM[0];
 277
 278             /* advance s to start of text after encoded-word */
 279             s = ew + RFC1522_TERM_L;
 280
 281             if(lang)
 282               lang[-1] = '*';
 283         }
 284         else{
 285             /*
 286              * Found intro, but bogus data followed, treat it as normal text.
 287              */
 288             l = (sw - s) + RFC1522_INIT_L;
 289             rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, l, NULL);
 290             for(; isspace((unsigned char) *(s+l)) && d-rv<len-1;l++)
 291                 *d++ = *(s+l);  /* copy any trailing space */
 292             rv[len-1] = '\0';
 293             *d = '\0';
 294             s += l;
 295         }
 296     }
 297
 298     if(rv){
 299         if(s && *s){                            /* copy remaining text */
 300             rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
 301             rv[len-1] = '\0';
 302         }
 303     }
 304     else if(s){
 305         rv = d;
 306         rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
 307         rv[len-1] = '\0';
 308     }
 309
 310     return(rv ? rv : (unsigned char *) start);
 311
 312   bogus:
 313     dprint((1, "RFC1522_decode: BOGUS INPUT: -->%s<--\n",
 314            start ? start : "?"));
 315     return((unsigned char *) start);
 316 }
 317
 318
 319 /*
 320  * rfc1522_token - scan the given source line up to the end_str making
 321  *                 sure all subsequent chars are "valid" leaving endp
 322  *                 a the start of the end_str.
 323  * Returns: TRUE if we got a valid token, FALSE otherwise
 324  */
 325 int
 326 rfc1522_token(char *s, int (*valid) (int), char *end_str, char **endp)
 327 {
 328     while(*s){
 329         if((char) *s == *end_str                /* test for matching end_str */
 330            && ((end_str[1])
 331                 ? !strncmp((char *)s + 1, end_str + 1, strlen(end_str + 1))
 332                 : 1)){
 333             *endp = s;
 334             return(TRUE);
 335         }
 336
 337         if(!(*valid)(*s++))                     /* test for valid char */
 338           break;
 339     }
 340
 341     return(FALSE);
 342 }
 343
 344
 345 /*
 346  * rfc1522_valtok - test for valid character in the RFC 1522 encoded
 347  *                  word's charset and encoding fields.
 348  */
 349 int
 350 rfc1522_valtok(int c)
 351 {
 352     return(!(c == SPACE || iscntrl(c & 0x7f) || strindex(ESPECIALS, c)));
 353 }
 354
 355
 356 /*
 357  * rfc1522_valenc - test for valid character in the RFC 1522 encoded
 358  *                  word's encoded-text field.
 359  */
 360 int
 361 rfc1522_valenc(int c)
 362 {
 363     return(!(c == '?' || c == SPACE) && isprint((unsigned char)c));
 364 }
 365
 366
 367 /*
 368  * rfc1522_valid - validate the given string as to it's rfc1522-ness
 369  */
 370 int
 371 rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp)
 372 {
 373     char *c, *e, *t, *p;
 374     int   rv;
 375
 376     rv = rfc1522_token(c = s+RFC1522_INIT_L, rfc1522_valtok, RFC1522_DLIM, &e)
 377            && rfc1522_token(++e, rfc1522_valtok, RFC1522_DLIM, &t)
 378            && rfc1522_token(++t, rfc1522_valenc, RFC1522_TERM, &p);
 379
 380     if(charset)
 381       *charset = c;
 382
 383     if(enc)
 384       *enc = e;
 385
 386     if(txt)
 387       *txt = t;
 388
 389     if(endp)
 390       *endp = p;
 391
 392     return(rv);
 393 }
 394
 395
 396 /*
 397  * rfc1522_copy_and_transliterate - copy given buf to destination buffer
 398  *                                  as UTF-8 characters
 399  */
 400 void
 401 rfc1522_copy_and_transliterate(unsigned char  *rv,
 402                                unsigned char **d,
 403                                size_t          len,
 404                                unsigned char  *s,
 405                                unsigned long   l,
 406                                char           *cset)
 407 {
 408     unsigned long i;
 409     SIZEDTEXT     src, xsrc;
 410
 411     src.data = s;
 412     src.size = l;
 413     memset(&xsrc, 0, sizeof(SIZEDTEXT));
 414
 415     /* transliterate decoded segment to utf-8 */
 416     if(cset){
 417         if(strucmp((char *) cset, "us-ascii")
 418            && strucmp((char *) cset, "utf-8")){
 419             if(utf8_charset(cset)){
 420                 if(!utf8_text(&src, cset, &xsrc, 0L)){
 421                     /* should not happen */
 422                     alpine_panic("c-client failed to transliterate recognized characterset");
 423                 }
 424             }
 425             else{
 426                 /* non-xlatable charset */
 427                 for(i = 0; i < l; i++)
 428                   if(src.data[i] & 0x80){
 429                       xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
 430                       xsrc.size = l;
 431                       for(i = 0; i < l; i++)
 432                         xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
 433
 434                       break;
 435                   }
 436             }
 437         }
 438     }
 439     else{
 440         const CHARSET *cs;
 441
 442         src.data = s;
 443         src.size = strlen((char *) s);
 444
 445         if((cs = utf8_infercharset(&src))){
 446             if(!(cs->type == CT_ASCII || cs->type == CT_UTF8)){
 447                 if(!utf8_text_cs(&src, cs, &xsrc, 0L, 0L)){
 448                     /* should not happen */
 449                     alpine_panic("c-client failed to transliterate recognized characterset");
 450                 }
 451             }
 452         }
 453         else if((cset=ps_global->VAR_UNK_CHAR_SET)
 454                 && strucmp((char *) cset, "us-ascii")
 455                 && strucmp((char *) cset, "utf-8")
 456                 && utf8_charset(cset)){
 457                 if(!utf8_text(&src, cset, &xsrc, 0L)){
 458                     /* should not happen */
 459                     alpine_panic("c-client failed to transliterate recognized character set");
 460                 }
 461         }
 462         else{
 463             /* unknown bytes - mask off high bit chars */
 464             for(i = 0; i < l; i++)
 465               if(src.data[i] & 0x80){
 466                   xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
 467                   xsrc.size = l;
 468                   for(i = 0; i < l; i++)
 469                     xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
 470
 471                   break;
 472               }
 473         }
 474     }
 475
 476     if(xsrc.data){
 477         s = xsrc.data;
 478         l = xsrc.size;
 479     }
 480
 481     i = MIN(l,len-1-((*d)-rv));
 482     strncpy((char *) (*d), (char *) s, i);
 483     (*d)[i] = '\0';
 484     *d += l;                    /* advance dest ptr to EOL */
 485     if((*d)-rv > len-1)
 486       *d = rv+len-1;
 487
 488     if(xsrc.data && src.data != xsrc.data)
 489       fs_give((void **) &xsrc.data);
 490 }
 491
 492
 493
 494 /*
 495  * rfc1522_encode - encode the given source string ala RFC 1522,
 496  *                  IF NECESSARY, into the given destination buffer.
 497  *                  Don't bother copying if it turns out encoding
 498  *                  isn't necessary.
 499  *
 500  * Returns: pointer to either the destination buffer containing the
 501  *          encoded text, or a pointer to the source buffer if we didn't
 502  *          have to encode anything.
 503  */
 504 char *
 505 rfc1522_encode(char *d, size_t dlen, unsigned char *s, char *charset)
 506 {
 507     unsigned char *p, *q;
 508     int            n;
 509
 510     if(!s)
 511       return((char *) s);
 512
 513     if(!charset)
 514       charset = UNKNOWN_CHARSET;
 515
 516     /* look for a reason to encode */
 517     for(p = s, n = 0; *p; p++)
 518       if((*p) & 0x80){
 519           n++;
 520       }
 521       else if(*p == RFC1522_INIT[0]
 522               && !strncmp((char *) p, RFC1522_INIT, RFC1522_INIT_L)){
 523           if(rfc1522_valid((char *) p, NULL, NULL, NULL, (char **) &q))
 524             p = q + RFC1522_TERM_L - 1;         /* advance past encoded gunk */
 525       }
 526       else if(*p == ESCAPE && match_escapes((char *)(p+1))){
 527           n++;
 528       }
 529
 530     if(n){                                      /* found, encoding to do */
 531         char *rv  = d, *t,
 532               enc = (n > (2 * (p - s)) / 3) ? 'B' : 'Q';
 533
 534         while(*s){
 535             if(d-rv < dlen-1-(RFC1522_INIT_L+2*RFC1522_DLIM_L+1)){
 536                 sstrncpy(&d, RFC1522_INIT, dlen-(d-rv));        /* insert intro header, */
 537                 sstrncpy(&d, charset, dlen-(d-rv));             /* character set tag, */
 538                 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));        /* and encoding flavor */
 539                 if(dlen-(d-rv) > 0)
 540                   *d++ = enc;
 541
 542                 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));
 543             }
 544
 545             /*
 546              * feed lines to encoder such that they're guaranteed
 547              * less than RFC1522_MAXW.
 548              */
 549             p = rfc1522_encoded_word(s, enc, charset);
 550             if(enc == 'B')                      /* insert encoded data */
 551               sstrncpy(&d, t = rfc1522_binary(s, p - s), dlen-1-(d-rv));
 552             else                                /* 'Q' encoding */
 553               sstrncpy(&d, t = rfc1522_8bit(s, p - s), dlen-1-(d-rv));
 554
 555             sstrncpy(&d, RFC1522_TERM, dlen-1-(d-rv));  /* insert terminator */
 556             fs_give((void **) &t);
 557             if(*p)                              /* more src string follows */
 558               sstrncpy(&d, "\015\012 ", dlen-1-(d-rv)); /* insert cont. line */
 559
 560             s = p;                              /* advance s */
 561         }
 562
 563         rv[dlen-1] = '\0';
 564         return(rv);
 565     }
 566     else
 567       return((char *) s);                       /* no work for us here */
 568 }
 569
 570
 571
 572 /*
 573  * rfc1522_encoded_word -- cut given string into max length encoded word
 574  *
 575  * Return: pointer into 's' such that the encoded 's' is no greater
 576  *         than RFC1522_MAXW
 577  *
 578  *  NOTE: this line break code is NOT cognizant of any SI/SO
 579  *  charset requirements nor similar strategies using escape
 580  *  codes.  Hopefully this will matter little and such
 581  *  representation strategies don't also include 8bit chars.
 582  */
 583 unsigned char *
 584 rfc1522_encoded_word(unsigned char *s, int enc, char *charset)
 585 {
 586     int goal = RFC1522_MAXW - RFC1522_OVERHEAD(charset);
 587
 588     if(enc == 'B')                      /* base64 encode */
 589       for(goal = ((goal / 4) * 3) - 2; goal && *s; goal--, s++)
 590         ;
 591     else                                /* special 'Q' encoding */
 592       if(!strucmp(charset, "UTF-8")){   /* special handling for utf-8 */
 593         int i,more;
 594         unsigned char *p;
 595         for(; goal && *s; s++){
 596            more = *s < 0x80 ? 0
 597                    : *s < 0xe0 ? 1
 598                    : *s < 0xf0 ? 2
 599                    : *s < 0xf8 ? 3
 600                    : *s < 0xfc ? 4
 601                    : *s < 0xfe ? 5 : -1;
 602            if(more >= 0){       /* check that we have at least more characters */
 603                 for(p = s, i = 0; i <= more && *p != '\0'; i++, p++)
 604                     goal -= RFC1522_ENC_CHAR(*p) ? 3 : 1;
 605                 if(goal < 0)   /* does not fit in encoded word */
 606                    break;
 607                 s += i - 1;     /* i - 1 should be equal to more */
 608            }
 609            else /* encode it, and skip it */
 610               if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0)
 611                 break;
 612         }
 613       }
 614       else
 615         for(; goal && *s; s++)
 616           if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0)
 617             break;
 618
 619     return(s);
 620 }
 621
 622
 623
 624 /*
 625  * rfc1522_8bit -- apply RFC 1522 'Q' encoding to the given 8bit buffer
 626  *
 627  * Return: alloc'd buffer containing encoded string
 628  */
 629 char *
 630 rfc1522_8bit(void *src, int slen)
 631 {
 632     char *ret = (char *) fs_get ((size_t) (3*slen + 2));
 633     char *d = ret;
 634     unsigned char c;
 635     unsigned char *s = (unsigned char *) src;
 636
 637     while (slen--) {                            /* for each character */
 638         if (((c = *s++) == '\015') && (*s == '\012') && slen) {
 639             *d++ = '\015';                      /* true line break */
 640             *d++ = *s++;
 641             slen--;
 642         }
 643         else if(c == SPACE){                    /* special encoding case */
 644             *d++ = '_';
 645         }
 646         else if(RFC1522_ENC_CHAR(c)){
 647             *d++ = '=';                         /* quote character */
 648             C2XPAIR(c, d);
 649         }
 650         else
 651           *d++ = (char) c;                      /* ordinary character */
 652     }
 653
 654     *d = '\0';                                  /* tie off destination */
 655     return(ret);
 656 }
 657
 658
 659 /*
 660  * rfc1522_binary -- apply RFC 1522 'B' encoding to the given 8bit buffer
 661  *
 662  * Return: alloc'd buffer containing encoded string
 663  */
 664 char *
 665 rfc1522_binary (void *src, int srcl)
 666 {
 667     static char *v =
 668             "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 669     unsigned char *s = (unsigned char *) src;
 670     char *ret, *d;
 671
 672     d = ret = (char *) fs_get ((size_t) ((((srcl + 2) / 3) * 4) + 1));
 673     for (; srcl; s += 3) {      /* process tuplets */
 674                                 /* byte 1: high 6 bits (1) */
 675         *d++ = v[s[0] >> 2];
 676                                 /* byte 2: low 2 bits (1), high 4 bits (2) */
 677         *d++ = v[((s[0] << 4) + (--srcl ? (s[1] >> 4) : 0)) & 0x3f];
 678                                 /* byte 3: low 4 bits (2), high 2 bits (3) */
 679         *d++ = srcl ? v[((s[1] << 2) + (--srcl ? (s[2] >> 6) :0)) & 0x3f] :'=';
 680                                 /* byte 4: low 6 bits (3) */
 681         *d++ = srcl ? v[s[2] & 0x3f] : '=';
 682         if(srcl)
 683           srcl--;               /* count third character if processed */
 684     }
 685
 686     *d = '\0';                  /* tie off string */
 687     return(ret);                /* return the resulting string */
 688 }
 689
 690
 691 /*
 692  * Checks if charset conversion is possible and which quality could be achieved
 693  *
 694  * args: from_cs -- charset to convert from
 695  *       to_cs   -- charset to convert to
 696  *
 697  * Results:
 698  * CONV_TABLE->table   -- conversion table, NULL if conversion not needed
 699  *                        or not supported
 * CONV_TABLE->quality -- conversion quality (conversion not supported, not
 *                        needed, loses special chars, or loses letters)
 702  *
 703  * The other entries of CONV_TABLE are used inside this function only
 704  * and may not be used outside unless this documentation is updated.
 705  */
 706 CONV_TABLE *
 707 conversion_table(char *from_cs, char *to_cs)
 708 {
 709     int               i, j;
 710     unsigned char    *p = NULL;
 711     unsigned short   *fromtab, *totab;
 712     CONV_TABLE       *ct = NULL;
 713     const CHARSET    *from, *to;
 714     static CONV_TABLE null_tab;
 715
 716     if(!(from_cs && *from_cs && to_cs && *to_cs) || !strucmp(from_cs, to_cs)){
 717         memset(&null_tab, 0, sizeof(null_tab));
 718         null_tab.quality = CV_NO_TRANSLATE_NEEDED;
 719         return(&null_tab);
 720     }
 721
 722     /*
 723      * First check to see if we are already set up for this pair of charsets.
 724      */
 725     if((ct = ps_global->conv_table) != NULL
 726        && ct->from_charset && ct->to_charset
 727        && !strucmp(ct->from_charset, from_cs)
 728        && !strucmp(ct->to_charset, to_cs))
 729       return(ct);
 730
 731     /*
 732      * No such luck. Get rid of the cache of the previous translation table
 733      * and build a new one.
 734      */
 735     if(ct){
 736         if(ct->table && (ct->convert != gf_convert_utf8_charset))
 737           fs_give((void **) &ct->table);
 738
 739         if(ct->from_charset)
 740           fs_give((void **) &ct->from_charset);
 741
 742         if(ct->to_charset)
 743           fs_give((void **) &ct->to_charset);
 744     }
 745     else
 746       ct = ps_global->conv_table = (CONV_TABLE *) fs_get(sizeof(*ct));
 747
 748     memset(ct, 0, sizeof(*ct));
 749
 750     ct->from_charset = cpystr(from_cs);
 751     ct->to_charset   = cpystr(to_cs);
 752     ct->quality = CV_NO_TRANSLATE_POSSIBLE;
 753
 754     /*
 755      * Check to see if a translation is feasible.
 756      */
 757     from = utf8_charset(from_cs);
 758     to =   utf8_charset(to_cs);
 759
 760     if(from && to){             /* if both charsets found */
 761                                 /* no mapping if same or from is ASCII */
 762         if((from->type == to->type && from->tab == to->tab)
 763            || (from->type == CT_ASCII))
 764             ct->quality = CV_NO_TRANSLATE_NEEDED;
 765         else switch(from->type){
 766         case CT_1BYTE0:         /* 1 byte no table */
 767         case CT_1BYTE:          /* 1 byte ASCII + table 0x80-0xff */
 768         case CT_1BYTE8:         /* 1 byte table 0x00 - 0xff */
 769             switch(to->type){
 770             case CT_1BYTE0:     /* 1 byte no table */
 771             case CT_1BYTE:      /* 1 byte ASCII + table 0x80-0xff */
 772             case CT_1BYTE8:     /* 1 byte table 0x00 - 0xff */
 773                 ct->quality = (from->script & to->script) ?
 774                   CV_LOSES_SOME_LETTERS : CV_LOSES_SPECIAL_CHARS;
 775                 break;
 776             }
 777             break;
 778         case CT_UTF8:           /* variable UTF-8 encoded Unicode no table */
 779         /* If source is UTF-8, see if destination charset has an 8 or 16 bit
 780          * coded character set that we can translate to.  By special
 781          * dispensation, kludge ISO-2022-JP to EUC or Shift-JIS, but don't
 782          * try to do any other ISO 2022 charsets or UTF-7.
 783          */
 784             switch (to->type){
 785             case CT_SJIS:       /* 2 byte Shift-JIS */
 786                                 /* only win if can get EUC-JP chartab */
 787                 if(utf8_charset("EUC-JP"))
 788                     ct->quality = CV_LOSES_SOME_LETTERS;
 789                 break;
 790             case CT_ASCII:      /* 7-bit ASCII no table */
 791             case CT_1BYTE0:     /* 1 byte no table */
 792             case CT_1BYTE:      /* 1 byte ASCII + table 0x80-0xff */
 793             case CT_1BYTE8:     /* 1 byte table 0x00 - 0xff */
 794             case CT_EUC:        /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
 795             case CT_DBYTE:      /* 2 byte ASCII + utf8_eucparam */
 796             case CT_DBYTE2:     /* 2 byte ASCII + utf8_eucparam plane1/2 */
 797                 ct->quality = CV_LOSES_SOME_LETTERS;
 798                 break;
 799             }
 800             break;
 801         }
 802
 803         switch (ct->quality) {  /* need to map? */
 804         case CV_NO_TRANSLATE_POSSIBLE:
 805         case CV_NO_TRANSLATE_NEEDED:
 806           break;                /* no mapping needed */
 807         default:                /* do mapping */
 808             switch (from->type) {
 809             case CT_UTF8:       /* UTF-8 to legacy character set */
 810               if((ct->table = utf8_rmap (to_cs)) != NULL)
 811                 ct->convert = gf_convert_utf8_charset;
 812               break;
 813
 814             case CT_1BYTE0:     /* ISO 8859-1 */
 815             case CT_1BYTE:      /* low part ASCII, high part other */
 816             case CT_1BYTE8:     /* low part has some non-ASCII */
 817             /*
 818              * The fromtab and totab tables are mappings from the 128 character
 819              * positions 128-255 to their Unicode values (so unsigned shorts).
 820              * The table we are creating is such that if
 821              *
 822              *    from_char_value -> unicode_value
 823              *    to_char_value   -> same_unicode_value
 824              *
 825              *  then we want to map from_char_value -> to_char_value
 826              *
 827              * To simplify conversions we create the whole 256 element array,
 828              * with the first 128 positions just the identity. If there is no
 829              * conversion for a particular from_char_value (that is, no
 830              * to_char_value maps to the same unicode character) then we put
 831              *  '?' in that character. We may want to output blob on the PC,
 832              * but don't so far.
 833              *
 834              * If fromtab or totab are NULL, that means the mapping is simply
 835              * the identity mapping. Since that is still useful to us, we
 836              * create it on the fly.
 837              */
 838                 fromtab = (unsigned short *) from->tab;
 839                 totab   = (unsigned short *) to->tab;
 840
 841                 ct->convert = gf_convert_8bit_charset;
 842                 p = ct->table = (unsigned char *)
 843                   fs_get(256 * sizeof(unsigned char));
 844                 for(i = 0; i < 256; i++){
 845                     unsigned int fc = 0;
 846                     p[i] = '?';
 847                     switch(from->type){ /* get "from" UCS-2 codepoint */
 848                     case CT_1BYTE0:     /* ISO 8859-1 */
 849                         fc = i;
 850                         break;
 851                     case CT_1BYTE:      /* low part ASCII, high part other */
 852                         fc = (i < 128) ? i : fromtab[i-128];
 853                         break;
 854                     case CT_1BYTE8:     /* low part has some non-ASCII */
 855                         fc = fromtab[i];
 856                         break;
 857                     }
 858                     switch(to->type){ /* match against "to" UCS-2 codepoint */
 859                     case CT_1BYTE0: /* identity match for ISO 8859-1*/
 860                         if(fc < 256)
 861                           p[i] = fc;
 862                         break;
 863                     case CT_1BYTE: /* ASCII is identity, search high part */
 864                         if(fc < 128) p[i] = fc;
 865                         else for(j = 0; j < 128; j++){
 866                             if(fc == totab[j]){
 867                                 p[i] = 128 + j;
 868                                 break;
 869                             }
 870                         }
 871                         break;
 872                     case CT_1BYTE8: /* search all codepoints */
 873                         for(j = 0; j < 256; j++){
 874                             if(fc == totab[j]){
 875                               p[i] = j;
 876                               break;
 877                             }
 878                         }
 879                         break;
 880                     }
 881                 }
 882                 break;
 883             }
 884         }
 885     }
 886
 887     return(ct);
 888 }
 889
 890
 891 /*
 892  * Replace personal names in list of addresses with
 893  * decoded personal names in UTF-8.
 894  * Assumes we can free and reallocate the name.
 895  */
 896 void
 897 decode_addr_names_to_utf8(struct mail_address *a)
 898 {
 899     for(; a; a = a->next)
 900       if(a->personal)
 901         convert_possibly_encoded_str_to_utf8(&a->personal);
 902 }
 903
 904
 905 /*
 906  * Strp is a pointer to an allocated string.
 907  * This routine will convert the string to UTF-8, possibly
 908  * freeing and re-allocating it.
 909  * The source string may or may not have RFC1522 encoding
 910  * which will be undone using rfc1522_decode.
 911  * The string will have been converted on return.
 912  */
 913 void
 914 convert_possibly_encoded_str_to_utf8(char **strp)
 915 {
 916     size_t     len, lensrc, lenresult;
 917     char      *bufp, *decoded;
 918
 919     if(!strp || !*strp || **strp == '\0')
 920       return;
 921
 922     len = 4 * strlen(*strp) + 1;
 923     bufp = (char *) fs_get(len);
 924
 925     decoded = (char *) rfc1522_decode_to_utf8((unsigned char *) bufp, len, *strp);
 926     if(decoded != (*strp)){     /* unchanged */
 927         if((lensrc=strlen(*strp)) >= (lenresult=strlen(decoded))){
 928             strncpy(*strp, decoded, lensrc);
 929             (*strp)[lensrc] = '\0';
 930         }
 931         else{
 932             fs_give((void **) strp);
 933             if(decoded == bufp){        /* this will be true */
 934                 fs_resize((void **) &bufp, lenresult+1);
 935                 *strp = bufp;
 936                 bufp = NULL;
 937             }
 938             else{                       /* this is unreachable */
 939                 *strp = cpystr(decoded);
 940             }
 941         }
 942     }
 943     /* else, already UTF-8 */
 944
 945     if(bufp)
 946       fs_give((void **) &bufp);
 947 }