pith/charset.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: charset.c 1032 2008-04-11 00:30:04Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2006-2008 University of Washington
   8  * Copyright 2013-2014 Eduardo Chappa
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19 #include "../pith/headers.h"
  20 #include "../pith/charset.h"
  21 #include "../pith/state.h"
  22 #include "../pith/conf.h"
  23 #include "../pith/escapes.h"
  24 #include "../pith/mimedesc.h"
  25 #include "../pith/filter.h"
  26 #include "../pith/string.h"
  27 #include "../pith/options.h"
  28
  29
  30 /*
  31  * Internal prototypes
  32  */
  33 int            rfc1522_token(char *, int (*)(int), char *, char **);
  34 int            rfc1522_valtok(int);
  35 int            rfc1522_valenc(int);
  36 int            rfc1522_valid(char *, char **, char **, char **, char **);
  37 void           rfc1522_copy_and_transliterate(unsigned char *, unsigned char **, size_t,
  38                                               unsigned char *, unsigned long, char *);
  39 unsigned char *rfc1522_encoded_word(unsigned char *, int, char *);
  40 char          *rfc1522_8bit(void *, int);
  41 char          *rfc1522_binary(void *, int);
  42
  43
  44 char *
  45 body_charset(MAILSTREAM *stream, long int msgno, unsigned char *section)
  46 {
  47     BODY *body;
  48     char *charset;
  49
  50
  51     if((body = mail_body(stream, msgno, section)) && body->type == TYPETEXT){
  52         if(!(charset = parameter_val(body->parameter, "charset")))
  53           charset = cpystr("US-ASCII");
  54
  55         return(charset);
  56     }
  57
  58     return(NULL);
  59 }
  60
  61
  62 /*
  63  * Copies the source string into allocated space with the 8-bit EUC codes
  64  * (on Unix) or the Shift-JIS (on PC) converted into ISO-2022-JP.
  65  * Caller is responsible for freeing the result.
  66  */
  67 unsigned char *
  68 trans_euc_to_2022_jp(unsigned char *src)
  69 {
  70     size_t len, alloc;
  71     unsigned char *rv, *p, *q;
  72     int    inside_esc_seq = 0;
  73     int    c1 = -1;             /* remembers first of pair for Shift-JIS */
  74
  75     if(!src)
  76       return(NULL);
  77
  78     len = strlen((char *) src);
  79
  80     /*
  81      * Worst possible increase is every other character an 8-bit character.
  82      * In that case, each of those gets 6 extra charactes for the escape
  83      * sequences. We're not too concerned about the extra length because
  84      * these are relatively short strings.
  85      */
  86     alloc = len + 1 + ((len+1)/2) * 6;
  87     rv = (unsigned char *) fs_get(alloc * sizeof(char));
  88
  89     for(p = src, q = rv; *p; p++){
  90         if(inside_esc_seq){
  91             if(c1 >= 0){                        /* second of a pair? */
  92                 int adjust = *p < 159;
  93                 int rowOffset = c1 < 160 ? 112 : 176;
  94                 int cellOffset = adjust ? (*p > 127 ? 32 : 31) : 126;
  95
  96                 *q++ = ((c1 - rowOffset) << 1) - adjust;
  97                 *q++ = *p - cellOffset;
  98                 c1 = -1;
  99             }
 100             else if(*p & 0x80){
 101                 *q++ = (*p & 0x7f);
 102             }
 103             else{
 104                 *q++ = '\033';
 105                 *q++ = '(';
 106                 *q++ = 'B';
 107                 *q++ = (*p);
 108                 c1 = -1;
 109                 inside_esc_seq = 0;
 110             }
 111         }
 112         else{
 113             if(*p & 0x80){
 114                 *q++ = '\033';
 115                 *q++ = '$';
 116                 *q++ = 'B';
 117                 *q++ = (*p & 0x7f);
 118                 inside_esc_seq = 1;
 119             }
 120             else{
 121                 *q++ = (*p);
 122             }
 123         }
 124     }
 125
 126     if(inside_esc_seq){
 127         *q++ = '\033';
 128         *q++ = '(';
 129         *q++ = 'B';
 130     }
 131
 132     *q = '\0';
 133
 134     return(rv);
 135 }
 136
 137
 138 /*
 139  *  * * * * * * * *      RFC 1522 support routines      * * * * * * * *
 140  *
 141  *   RFC 1522 support is *very* loosely based on code contributed
 142  *   by Lars-Erik Johansson <lej@cdg.chalmers.se>.  Thanks to Lars-Erik,
 143  *   and appologies for taking such liberties with his code.
 144  */
 145
 146 #define RFC1522_INIT    "=?"
 147 #define RFC1522_INIT_L  2
 148 #define RFC1522_TERM    "?="
 149 #define RFC1522_TERM_L  2
 150 #define RFC1522_DLIM    "?"
 151 #define RFC1522_DLIM_L  1
 152 #define RFC1522_MAXW    256     /* RFC's say 75, but no senders seem to care*/
 153 #define ESPECIALS       "()<>@,;:\"/[]?.="
 154 #define RFC1522_OVERHEAD(S)     (RFC1522_INIT_L + RFC1522_TERM_L +      \
 155                                  (2 * RFC1522_DLIM_L) + strlen(S) + 1);
 156 #define RFC1522_ENC_CHAR(C)     (((C) & 0x80) || !rfc1522_valtok(C)     \
 157                                  || (C) == '_' )
 158
 159 /*
 160  * rfc1522_decode_to_utf8 - try to decode the given source string ala RFC 2047
 161  *                  (obsoleted RFC 1522) into the given destination buffer,
 162  *                  encoded in UTF-8.
 163  *
 164  * How large should d be? The decoded string of octets will fit in
 165  * the same size string as the source string. However, because we're
 166  * translating that into UTF-8 the result may expand. Currently the
 167  * Thai character set has single octet characters which expand to
 168  * three octets in UTF-8. So it would be safe to use 3 * strlen(s)
 169  * for the size of d. One can imagine a currently non-existent
 170  * character set that expanded to 4 octets instead, so use 4 to be
 171  * super safe.
 172  *
 173  * Returns: pointer to either the destination buffer containing the
 174  *          decoded text, or a pointer to the source buffer if there was
 175  *          no valid 'encoded-word' found during scanning.
 176  */
 177 unsigned char *
 178 rfc1522_decode_to_utf8(unsigned char *d, size_t len, char *s)
 179 {
 180     unsigned char *rv = NULL, *p;
 181     char          *start = s, *sw, *enc, *txt, *ew, **q, *lang;
 182     char          *cset;
 183     unsigned long  l;
 184     int            i;
 185
 186     *d = '\0';                                  /* init destination */
 187
 188     while(s && (sw = strstr(s, RFC1522_INIT))){
 189         if(!rv)  /* there's something to do, init it */
 190           rv = d;
 191         /* validate the rest of the encoded-word */
 192         if(rfc1522_valid(sw, &cset, &enc, &txt, &ew)){
 193             /*
 194              * We may have been putting off copying the first part of the
 195              * source while waiting to see if we have to copy at all.
 196              */
 197             if(rv == d && s != start){
 198                 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) start,
 199                                                sw - start, NULL);
 200                 s = sw;
 201             }
 202
 203             /* copy everything between s and sw to destination */
 204             for(i = 0; &s[i] < sw; i++)
 205               if(!isspace((unsigned char)s[i])){ /* if some non-whitespace */
 206                   while(s < sw && d-rv<len-1)
 207                     *d++ = (unsigned char) *s++;
 208
 209                   break;
 210               }
 211
 212             enc[-1] = txt[-1] = ew[0] = '\0';   /* tie off token strings */
 213
 214             if((lang = strchr(cset, '*')) != NULL)
 215               *lang++ = '\0';
 216
 217             /* based on encoding, write the encoded text to output buffer */
 218             switch(*enc){
 219               case 'Q' :                        /* 'Q' encoding */
 220               case 'q' :
 221                 /* special hocus-pocus to deal with '_' exception, too bad */
 222                 for(l = 0L, i = 0; txt[l]; l++)
 223                   if(txt[l] == '_')
 224                     i++;
 225
 226                 if(i){
 227                     q = (char **) fs_get((i + 1) * sizeof(char *));
 228                     for(l = 0L, i = 0; txt[l]; l++)
 229                       if(txt[l] == '_'){
 230                           q[i++] = &txt[l];
 231                           txt[l] = SPACE;
 232                       }
 233
 234                     q[i] = NULL;
 235                 }
 236                 else
 237                   q = NULL;
 238
 239                 if((p = rfc822_qprint((unsigned char *)txt, strlen(txt), &l)) != NULL){
 240                     rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
 241                     fs_give((void **)&p);       /* free encoded buf */
 242                 }
 243                 else{
 244                     if(q)
 245                       fs_give((void **) &q);
 246
 247                     goto bogus;
 248                 }
 249
 250                 if(q){                          /* restore underscores */
 251                     for(i = 0; q[i]; i++)
 252                       *(q[i]) = '_';
 253
 254                     fs_give((void **)&q);
 255                 }
 256
 257                 break;
 258
 259               case 'B' :                        /* 'B' encoding */
 260               case 'b' :
 261                 if((p = rfc822_base64((unsigned char *) txt, strlen(txt), &l)) != NULL){
 262                     rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
 263                     fs_give((void **)&p);       /* free encoded buf */
 264                 }
 265                 else
 266                   goto bogus;
 267
 268                 break;
 269
 270               default:
 271                 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) txt,
 272                                                strlen(txt), NULL);
 273                 dprint((1, "RFC1522_decode: Unknown ENCODING: %s\n",
 274                        enc ? enc : "?"));
 275                 break;
 276             }
 277
 278             /* restore trompled source string */
 279             enc[-1] = txt[-1] = '?';
 280             ew[0]   = RFC1522_TERM[0];
 281
 282             /* advance s to start of text after encoded-word */
 283             s = ew + RFC1522_TERM_L;
 284
 285             if(lang)
 286               lang[-1] = '*';
 287         }
 288         else{
 289             /*
 290              * Found intro, but bogus data followed, treat it as normal text.
 291              */
 292             l = (sw - s) + RFC1522_INIT_L;
 293             rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, l, NULL);
 294             for(; isspace((unsigned char) *(s+l)) && d-rv<len-1;l++)
 295                 *d++ = *(s+l);  /* copy any trailing space */
 296             rv[len-1] = '\0';
 297             *d = '\0';
 298             s += l;
 299         }
 300     }
 301
 302     if(rv){
 303         if(s && *s){                            /* copy remaining text */
 304             rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
 305             rv[len-1] = '\0';
 306         }
 307     }
 308     else if(s){
 309         rv = d;
 310         rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
 311         rv[len-1] = '\0';
 312     }
 313
 314     return(rv ? rv : (unsigned char *) start);
 315
 316   bogus:
 317     dprint((1, "RFC1522_decode: BOGUS INPUT: -->%s<--\n",
 318            start ? start : "?"));
 319     return((unsigned char *) start);
 320 }
 321
 322
 323 /*
 324  * rfc1522_token - scan the given source line up to the end_str making
 325  *                 sure all subsequent chars are "valid" leaving endp
 326  *                 a the start of the end_str.
 327  * Returns: TRUE if we got a valid token, FALSE otherwise
 328  */
 329 int
 330 rfc1522_token(char *s, int (*valid) (int), char *end_str, char **endp)
 331 {
 332     while(*s){
 333         if((char) *s == *end_str                /* test for matching end_str */
 334            && ((end_str[1])
 335                 ? !strncmp((char *)s + 1, end_str + 1, strlen(end_str + 1))
 336                 : 1)){
 337             *endp = s;
 338             return(TRUE);
 339         }
 340
 341         if(!(*valid)(*s++))                     /* test for valid char */
 342           break;
 343     }
 344
 345     return(FALSE);
 346 }
 347
 348
 349 /*
 350  * rfc1522_valtok - test for valid character in the RFC 1522 encoded
 351  *                  word's charset and encoding fields.
 352  */
 353 int
 354 rfc1522_valtok(int c)
 355 {
 356     return(!(c == SPACE || iscntrl(c & 0x7f) || strindex(ESPECIALS, c)));
 357 }
 358
 359
 360 /*
 361  * rfc1522_valenc - test for valid character in the RFC 1522 encoded
 362  *                  word's encoded-text field.
 363  */
 364 int
 365 rfc1522_valenc(int c)
 366 {
 367     return(!(c == '?' || c == SPACE) && isprint((unsigned char)c));
 368 }
 369
 370
 371 /*
 372  * rfc1522_valid - validate the given string as to it's rfc1522-ness
 373  */
 374 int
 375 rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp)
 376 {
 377     char *c, *e, *t, *p;
 378     int   rv;
 379
 380     rv = rfc1522_token(c = s+RFC1522_INIT_L, rfc1522_valtok, RFC1522_DLIM, &e)
 381            && rfc1522_token(++e, rfc1522_valtok, RFC1522_DLIM, &t)
 382            && rfc1522_token(++t, rfc1522_valenc, RFC1522_TERM, &p)
 383            && p - s <= RFC1522_MAXW;
 384
 385     if(charset)
 386       *charset = c;
 387
 388     if(enc)
 389       *enc = e;
 390
 391     if(txt)
 392       *txt = t;
 393
 394     if(endp)
 395       *endp = p;
 396
 397     return(rv);
 398 }
 399
 400
 401 /*
 402  * rfc1522_copy_and_transliterate - copy given buf to destination buffer
 403  *                                  as UTF-8 characters
 404  */
 405 void
 406 rfc1522_copy_and_transliterate(unsigned char  *rv,
 407                                unsigned char **d,
 408                                size_t          len,
 409                                unsigned char  *s,
 410                                unsigned long   l,
 411                                char           *cset)
 412 {
 413     unsigned long i;
 414     SIZEDTEXT     src, xsrc;
 415
 416     src.data = s;
 417     src.size = l;
 418     memset(&xsrc, 0, sizeof(SIZEDTEXT));
 419
 420     /* transliterate decoded segment to utf-8 */
 421     if(cset){
 422         if(strucmp((char *) cset, "us-ascii")
 423            && strucmp((char *) cset, "utf-8")){
 424             if(utf8_charset(cset)){
 425                 if(!utf8_text(&src, cset, &xsrc, 0L)){
 426                     /* should not happen */
 427                     panic("c-client failed to transliterate recognized characterset");
 428                 }
 429             }
 430             else{
 431                 /* non-xlatable charset */
 432                 for(i = 0; i < l; i++)
 433                   if(src.data[i] & 0x80){
 434                       xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
 435                       xsrc.size = l;
 436                       for(i = 0; i < l; i++)
 437                         xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
 438
 439                       break;
 440                   }
 441             }
 442         }
 443     }
 444     else{
 445         const CHARSET *cs;
 446
 447         src.data = s;
 448         src.size = strlen((char *) s);
 449
 450         if((cs = utf8_infercharset(&src))){
 451             if(!(cs->type == CT_ASCII || cs->type == CT_UTF8)){
 452                 if(!utf8_text_cs(&src, cs, &xsrc, 0L, 0L)){
 453                     /* should not happen */
 454                     panic("c-client failed to transliterate recognized characterset");
 455                 }
 456             }
 457         }
 458         else if((cset=ps_global->VAR_UNK_CHAR_SET)
 459                 && strucmp((char *) cset, "us-ascii")
 460                 && strucmp((char *) cset, "utf-8")
 461                 && utf8_charset(cset)){
 462                 if(!utf8_text(&src, cset, &xsrc, 0L)){
 463                     /* should not happen */
 464                     panic("c-client failed to transliterate recognized character set");
 465                 }
 466         }
 467         else{
 468             /* unknown bytes - mask off high bit chars */
 469             for(i = 0; i < l; i++)
 470               if(src.data[i] & 0x80){
 471                   xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
 472                   xsrc.size = l;
 473                   for(i = 0; i < l; i++)
 474                     xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
 475
 476                   break;
 477               }
 478         }
 479     }
 480
 481     if(xsrc.data){
 482         s = xsrc.data;
 483         l = xsrc.size;
 484     }
 485
 486     i = MIN(l,len-1-((*d)-rv));
 487     strncpy((char *) (*d), (char *) s, i);
 488     (*d)[i] = '\0';
 489     *d += l;                    /* advance dest ptr to EOL */
 490     if((*d)-rv > len-1)
 491       *d = rv+len-1;
 492
 493     if(xsrc.data && src.data != xsrc.data)
 494       fs_give((void **) &xsrc.data);
 495 }
 496
 497
 498
 499 /*
 500  * rfc1522_encode - encode the given source string ala RFC 1522,
 501  *                  IF NECESSARY, into the given destination buffer.
 502  *                  Don't bother copying if it turns out encoding
 503  *                  isn't necessary.
 504  *
 505  * Returns: pointer to either the destination buffer containing the
 506  *          encoded text, or a pointer to the source buffer if we didn't
 507  *          have to encode anything.
 508  */
 509 char *
 510 rfc1522_encode(char *d, size_t dlen, unsigned char *s, char *charset)
 511 {
 512     unsigned char *p, *q;
 513     int            n;
 514
 515     if(!s)
 516       return((char *) s);
 517
 518     if(!charset)
 519       charset = UNKNOWN_CHARSET;
 520
 521     /* look for a reason to encode */
 522     for(p = s, n = 0; *p; p++)
 523       if((*p) & 0x80){
 524           n++;
 525       }
 526       else if(*p == RFC1522_INIT[0]
 527               && !strncmp((char *) p, RFC1522_INIT, RFC1522_INIT_L)){
 528           if(rfc1522_valid((char *) p, NULL, NULL, NULL, (char **) &q))
 529             p = q + RFC1522_TERM_L - 1;         /* advance past encoded gunk */
 530       }
 531       else if(*p == ESCAPE && match_escapes((char *)(p+1))){
 532           n++;
 533       }
 534
 535     if(n){                                      /* found, encoding to do */
 536         char *rv  = d, *t,
 537               enc = (n > (2 * (p - s)) / 3) ? 'B' : 'Q';
 538
 539         while(*s){
 540             if(d-rv < dlen-1-(RFC1522_INIT_L+2*RFC1522_DLIM_L+1)){
 541                 sstrncpy(&d, RFC1522_INIT, dlen-(d-rv));        /* insert intro header, */
 542                 sstrncpy(&d, charset, dlen-(d-rv));             /* character set tag, */
 543                 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));        /* and encoding flavor */
 544                 if(dlen-(d-rv) > 0)
 545                   *d++ = enc;
 546
 547                 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));
 548             }
 549
 550             /*
 551              * feed lines to encoder such that they're guaranteed
 552              * less than RFC1522_MAXW.
 553              */
 554             p = rfc1522_encoded_word(s, enc, charset);
 555             if(enc == 'B')                      /* insert encoded data */
 556               sstrncpy(&d, t = rfc1522_binary(s, p - s), dlen-1-(d-rv));
 557             else                                /* 'Q' encoding */
 558               sstrncpy(&d, t = rfc1522_8bit(s, p - s), dlen-1-(d-rv));
 559
 560             sstrncpy(&d, RFC1522_TERM, dlen-1-(d-rv));  /* insert terminator */
 561             fs_give((void **) &t);
 562             if(*p)                              /* more src string follows */
 563               sstrncpy(&d, "\015\012 ", dlen-1-(d-rv)); /* insert cont. line */
 564
 565             s = p;                              /* advance s */
 566         }
 567
 568         rv[dlen-1] = '\0';
 569         return(rv);
 570     }
 571     else
 572       return((char *) s);                       /* no work for us here */
 573 }
 574
 575
 576
 577 /*
 578  * rfc1522_encoded_word -- cut given string into max length encoded word
 579  *
 580  * Return: pointer into 's' such that the encoded 's' is no greater
 581  *         than RFC1522_MAXW
 582  *
 583  *  NOTE: this line break code is NOT cognizant of any SI/SO
 584  *  charset requirements nor similar strategies using escape
 585  *  codes.  Hopefully this will matter little and such
 586  *  representation strategies don't also include 8bit chars.
 587  */
 588 unsigned char *
 589 rfc1522_encoded_word(unsigned char *s, int enc, char *charset)
 590 {
 591     int goal = RFC1522_MAXW - RFC1522_OVERHEAD(charset);
 592
 593     if(enc == 'B')                      /* base64 encode */
 594       for(goal = ((goal / 4) * 3) - 2; goal && *s; goal--, s++)
 595         ;
 596     else                                /* special 'Q' encoding */
 597       for(; goal && *s; s++)
 598         if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0)
 599           break;
 600
 601     return(s);
 602 }
 603
 604
 605
 606 /*
 607  * rfc1522_8bit -- apply RFC 1522 'Q' encoding to the given 8bit buffer
 608  *
 609  * Return: alloc'd buffer containing encoded string
 610  */
 611 char *
 612 rfc1522_8bit(void *src, int slen)
 613 {
 614     char *ret = (char *) fs_get ((size_t) (3*slen + 2));
 615     char *d = ret;
 616     unsigned char c;
 617     unsigned char *s = (unsigned char *) src;
 618
 619     while (slen--) {                            /* for each character */
 620         if (((c = *s++) == '\015') && (*s == '\012') && slen) {
 621             *d++ = '\015';                      /* true line break */
 622             *d++ = *s++;
 623             slen--;
 624         }
 625         else if(c == SPACE){                    /* special encoding case */
 626             *d++ = '_';
 627         }
 628         else if(RFC1522_ENC_CHAR(c)){
 629             *d++ = '=';                         /* quote character */
 630             C2XPAIR(c, d);
 631         }
 632         else
 633           *d++ = (char) c;                      /* ordinary character */
 634     }
 635
 636     *d = '\0';                                  /* tie off destination */
 637     return(ret);
 638 }
 639
 640
 641 /*
 642  * rfc1522_binary -- apply RFC 1522 'B' encoding to the given 8bit buffer
 643  *
 644  * Return: alloc'd buffer containing encoded string
 645  */
 646 char *
 647 rfc1522_binary (void *src, int srcl)
 648 {
 649     static char *v =
 650             "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 651     unsigned char *s = (unsigned char *) src;
 652     char *ret, *d;
 653
 654     d = ret = (char *) fs_get ((size_t) ((((srcl + 2) / 3) * 4) + 1));
 655     for (; srcl; s += 3) {      /* process tuplets */
 656                                 /* byte 1: high 6 bits (1) */
 657         *d++ = v[s[0] >> 2];
 658                                 /* byte 2: low 2 bits (1), high 4 bits (2) */
 659         *d++ = v[((s[0] << 4) + (--srcl ? (s[1] >> 4) : 0)) & 0x3f];
 660                                 /* byte 3: low 4 bits (2), high 2 bits (3) */
 661         *d++ = srcl ? v[((s[1] << 2) + (--srcl ? (s[2] >> 6) :0)) & 0x3f] :'=';
 662                                 /* byte 4: low 6 bits (3) */
 663         *d++ = srcl ? v[s[2] & 0x3f] : '=';
 664         if(srcl)
 665           srcl--;               /* count third character if processed */
 666     }
 667
 668     *d = '\0';                  /* tie off string */
 669     return(ret);                /* return the resulting string */
 670 }
 671
 672
 673 /*
 674  * Checks if charset conversion is possible and which quality could be achieved
 675  *
 676  * args: from_cs -- charset to convert from
 677  *       to_cs   -- charset to convert to
 678  *
 679  * Results:
 680  * CONV_TABLE->table   -- conversion table, NULL if conversion not needed
 681  *                        or not supported
 682  * CONV_TABLE->quality -- conversion quality (conversion not supported, not
 683  *                        needed, loses special chars, or loses letters
 684  *
 685  * The other entries of CONV_TABLE are used inside this function only
 686  * and may not be used outside unless this documentation is updated.
 687  */
 688 CONV_TABLE *
 689 conversion_table(char *from_cs, char *to_cs)
 690 {
 691     int               i, j;
 692     unsigned char    *p = NULL;
 693     unsigned short   *fromtab, *totab;
 694     CONV_TABLE       *ct = NULL;
 695     const CHARSET    *from, *to;
 696     static CONV_TABLE null_tab;
 697
 698     if(!(from_cs && *from_cs && to_cs && *to_cs) || !strucmp(from_cs, to_cs)){
 699         memset(&null_tab, 0, sizeof(null_tab));
 700         null_tab.quality = CV_NO_TRANSLATE_NEEDED;
 701         return(&null_tab);
 702     }
 703
 704     /*
 705      * First check to see if we are already set up for this pair of charsets.
 706      */
 707     if((ct = ps_global->conv_table) != NULL
 708        && ct->from_charset && ct->to_charset
 709        && !strucmp(ct->from_charset, from_cs)
 710        && !strucmp(ct->to_charset, to_cs))
 711       return(ct);
 712
 713     /*
 714      * No such luck. Get rid of the cache of the previous translation table
 715      * and build a new one.
 716      */
 717     if(ct){
 718         if(ct->table && (ct->convert != gf_convert_utf8_charset))
 719           fs_give((void **) &ct->table);
 720
 721         if(ct->from_charset)
 722           fs_give((void **) &ct->from_charset);
 723
 724         if(ct->to_charset)
 725           fs_give((void **) &ct->to_charset);
 726     }
 727     else
 728       ct = ps_global->conv_table = (CONV_TABLE *) fs_get(sizeof(*ct));
 729
 730     memset(ct, 0, sizeof(*ct));
 731
 732     ct->from_charset = cpystr(from_cs);
 733     ct->to_charset   = cpystr(to_cs);
 734     ct->quality = CV_NO_TRANSLATE_POSSIBLE;
 735
 736     /*
 737      * Check to see if a translation is feasible.
 738      */
 739     from = utf8_charset(from_cs);
 740     to =   utf8_charset(to_cs);
 741
 742     if(from && to){             /* if both charsets found */
 743                                 /* no mapping if same or from is ASCII */
 744         if((from->type == to->type && from->tab == to->tab)
 745            || (from->type == CT_ASCII))
 746             ct->quality = CV_NO_TRANSLATE_NEEDED;
 747         else switch(from->type){
 748         case CT_1BYTE0:         /* 1 byte no table */
 749         case CT_1BYTE:          /* 1 byte ASCII + table 0x80-0xff */
 750         case CT_1BYTE8:         /* 1 byte table 0x00 - 0xff */
 751             switch(to->type){
 752             case CT_1BYTE0:     /* 1 byte no table */
 753             case CT_1BYTE:      /* 1 byte ASCII + table 0x80-0xff */
 754             case CT_1BYTE8:     /* 1 byte table 0x00 - 0xff */
 755                 ct->quality = (from->script & to->script) ?
 756                   CV_LOSES_SOME_LETTERS : CV_LOSES_SPECIAL_CHARS;
 757                 break;
 758             }
 759             break;
 760         case CT_UTF8:           /* variable UTF-8 encoded Unicode no table */
 761         /* If source is UTF-8, see if destination charset has an 8 or 16 bit
 762          * coded character set that we can translate to.  By special
 763          * dispensation, kludge ISO-2022-JP to EUC or Shift-JIS, but don't
 764          * try to do any other ISO 2022 charsets or UTF-7.
 765          */
 766             switch (to->type){
 767             case CT_SJIS:       /* 2 byte Shift-JIS */
 768                                 /* only win if can get EUC-JP chartab */
 769                 if(utf8_charset("EUC-JP"))
 770                     ct->quality = CV_LOSES_SOME_LETTERS;
 771                 break;
 772             case CT_ASCII:      /* 7-bit ASCII no table */
 773             case CT_1BYTE0:     /* 1 byte no table */
 774             case CT_1BYTE:      /* 1 byte ASCII + table 0x80-0xff */
 775             case CT_1BYTE8:     /* 1 byte table 0x00 - 0xff */
 776             case CT_EUC:        /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
 777             case CT_DBYTE:      /* 2 byte ASCII + utf8_eucparam */
 778             case CT_DBYTE2:     /* 2 byte ASCII + utf8_eucparam plane1/2 */
 779                 ct->quality = CV_LOSES_SOME_LETTERS;
 780                 break;
 781             }
 782             break;
 783         }
 784
 785         switch (ct->quality) {  /* need to map? */
 786         case CV_NO_TRANSLATE_POSSIBLE:
 787         case CV_NO_TRANSLATE_NEEDED:
 788           break;                /* no mapping needed */
 789         default:                /* do mapping */
 790             switch (from->type) {
 791             case CT_UTF8:       /* UTF-8 to legacy character set */
 792               if((ct->table = utf8_rmap (to_cs)) != NULL)
 793                 ct->convert = gf_convert_utf8_charset;
 794               break;
 795
 796             case CT_1BYTE0:     /* ISO 8859-1 */
 797             case CT_1BYTE:      /* low part ASCII, high part other */
 798             case CT_1BYTE8:     /* low part has some non-ASCII */
 799             /*
 800              * The fromtab and totab tables are mappings from the 128 character
 801              * positions 128-255 to their Unicode values (so unsigned shorts).
 802              * The table we are creating is such that if
 803              *
 804              *    from_char_value -> unicode_value
 805              *    to_char_value   -> same_unicode_value
 806              *
 807              *  then we want to map from_char_value -> to_char_value
 808              *
 809              * To simplify conversions we create the whole 256 element array,
 810              * with the first 128 positions just the identity. If there is no
 811              * conversion for a particular from_char_value (that is, no
 812              * to_char_value maps to the same unicode character) then we put
 813              *  '?' in that character. We may want to output blob on the PC,
 814              * but don't so far.
 815              *
 816              * If fromtab or totab are NULL, that means the mapping is simply
 817              * the identity mapping. Since that is still useful to us, we
 818              * create it on the fly.
 819              */
 820                 fromtab = (unsigned short *) from->tab;
 821                 totab   = (unsigned short *) to->tab;
 822
 823                 ct->convert = gf_convert_8bit_charset;
 824                 p = ct->table = (unsigned char *)
 825                   fs_get(256 * sizeof(unsigned char));
 826                 for(i = 0; i < 256; i++){
 827                     unsigned int fc;
 828                     p[i] = '?';
 829                     switch(from->type){ /* get "from" UCS-2 codepoint */
 830                     case CT_1BYTE0:     /* ISO 8859-1 */
 831                         fc = i;
 832                         break;
 833                     case CT_1BYTE:      /* low part ASCII, high part other */
 834                         fc = (i < 128) ? i : fromtab[i-128];
 835                         break;
 836                     case CT_1BYTE8:     /* low part has some non-ASCII */
 837                         fc = fromtab[i];
 838                         break;
 839                     }
 840                     switch(to->type){ /* match against "to" UCS-2 codepoint */
 841                     case CT_1BYTE0: /* identity match for ISO 8859-1*/
 842                         if(fc < 256)
 843                           p[i] = fc;
 844                         break;
 845                     case CT_1BYTE: /* ASCII is identity, search high part */
 846                         if(fc < 128) p[i] = fc;
 847                         else for(j = 0; j < 128; j++){
 848                             if(fc == totab[j]){
 849                                 p[i] = 128 + j;
 850                                 break;
 851                             }
 852                         }
 853                         break;
 854                     case CT_1BYTE8: /* search all codepoints */
 855                         for(j = 0; j < 256; j++){
 856                             if(fc == totab[j]){
 857                               p[i] = j;
 858                               break;
 859                             }
 860                         }
 861                         break;
 862                     }
 863                 }
 864                 break;
 865             }
 866         }
 867     }
 868
 869     return(ct);
 870 }
 871
 872
 873 /*
 874  * Replace personal names in list of addresses with
 875  * decoded personal names in UTF-8.
 876  * Assumes we can free and reallocate the name.
 877  */
 878 void
 879 decode_addr_names_to_utf8(struct mail_address *a)
 880 {
 881     for(; a; a = a->next)
 882       if(a->personal)
 883         convert_possibly_encoded_str_to_utf8(&a->personal);
 884 }
 885
 886
 887 /*
 888  * Strp is a pointer to an allocated string.
 889  * This routine will convert the string to UTF-8, possibly
 890  * freeing and re-allocating it.
 891  * The source string may or may not have RFC1522 encoding
 892  * which will be undone using rfc1522_decode.
 893  * The string will have been converted on return.
 894  */
 895 void
 896 convert_possibly_encoded_str_to_utf8(char **strp)
 897 {
 898     size_t     len, lensrc, lenresult;
 899     char      *bufp, *decoded;
 900
 901     if(!strp || !*strp || **strp == '\0')
 902       return;
 903
 904     len = 4 * strlen(*strp) + 1;
 905     bufp = (char *) fs_get(len);
 906
 907     decoded = (char *) rfc1522_decode_to_utf8((unsigned char *) bufp, len, *strp);
 908     if(decoded != (*strp)){     /* unchanged */
 909         if((lensrc=strlen(*strp)) >= (lenresult=strlen(decoded))){
 910             strncpy(*strp, decoded, lensrc);
 911             (*strp)[lensrc] = '\0';
 912         }
 913         else{
 914             fs_give((void **) strp);
 915             if(decoded == bufp){        /* this will be true */
 916                 fs_resize((void **) &bufp, lenresult+1);
 917                 *strp = bufp;
 918                 bufp = NULL;
 919             }
 920             else{                       /* this is unreachable */
 921                 *strp = cpystr(decoded);
 922             }
 923         }
 924     }
 925     /* else, already UTF-8 */
 926
 927     if(bufp)
 928       fs_give((void **) &bufp);
 929 }