pith/charset.c

#if !defined(lint) && !defined(DOS)
/* RCS-style revision identifier string; no code visible in this file refers to it. */
static char rcsid[] = "$Id: charset.c 1032 2008-04-11 00:30:04Z hubert@u.washington.edu $";
#endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2013-2017 Eduardo Chappa
   8  * Copyright 2006-2008 University of Washington
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19 #include "../pith/headers.h"
  20 #include "../pith/charset.h"
  21 #include "../pith/state.h"
  22 #include "../pith/conf.h"
  23 #include "../pith/escapes.h"
  24 #include "../pith/mimedesc.h"
  25 #include "../pith/filter.h"
  26 #include "../pith/string.h"
  27 #include "../pith/options.h"
  28
  29
  30 /*
  31  * Internal prototypes
  32  */
  33 int            rfc1522_token(char *, int (*)(int), char *, char **);
  34 int            rfc1522_valtok(int);
  35 int            rfc1522_valenc(int);
  36 int            rfc1522_valid(char *, char **, char **, char **, char **);
  37 void           rfc1522_copy_and_transliterate(unsigned char *, unsigned char **, size_t,
  38                                               unsigned char *, unsigned long, char *);
  39 unsigned char *rfc1522_encoded_word(unsigned char *, int, char *);
  40 char          *rfc1522_8bit(void *, int);
  41 char          *rfc1522_binary(void *, int);
  42
  43
  44 char *
  45 body_charset(MAILSTREAM *stream, long int msgno, unsigned char *section)
  46 {
  47     BODY *body;
  48     char *charset;
  49
  50
  51     if((body = mail_body(stream, msgno, section)) && body->type == TYPETEXT){
  52         if(!(charset = parameter_val(body->parameter, "charset")))
  53           charset = cpystr("US-ASCII");
  54
  55         return(charset);
  56     }
  57
  58     return(NULL);
  59 }
  60
  61
  62 /*
  63  * Copies the source string into allocated space with the 8-bit EUC codes
  64  * (on Unix) or the Shift-JIS (on PC) converted into ISO-2022-JP.
  65  * Caller is responsible for freeing the result.
  66  */
  67 unsigned char *
  68 trans_euc_to_2022_jp(unsigned char *src)
  69 {
  70     size_t len, alloc;
  71     unsigned char *rv, *p, *q;
  72     int    inside_esc_seq = 0;
  73     int    c1 = -1;             /* remembers first of pair for Shift-JIS */
  74
  75     if(!src)
  76       return(NULL);
  77
  78     len = strlen((char *) src);
  79
  80     /*
  81      * Worst possible increase is every other character an 8-bit character.
  82      * In that case, each of those gets 6 extra charactes for the escape
  83      * sequences. We're not too concerned about the extra length because
  84      * these are relatively short strings.
  85      */
  86     alloc = len + 1 + ((len+1)/2) * 6;
  87     rv = (unsigned char *) fs_get(alloc * sizeof(char));
  88
  89     for(p = src, q = rv; *p; p++){
  90         if(inside_esc_seq){
  91             if(c1 >= 0){                        /* second of a pair? */
  92                 int adjust = *p < 159;
  93                 int rowOffset = c1 < 160 ? 112 : 176;
  94                 int cellOffset = adjust ? (*p > 127 ? 32 : 31) : 126;
  95
  96                 *q++ = ((c1 - rowOffset) << 1) - adjust;
  97                 *q++ = *p - cellOffset;
  98                 c1 = -1;
  99             }
 100             else if(*p & 0x80){
 101                 *q++ = (*p & 0x7f);
 102             }
 103             else{
 104                 *q++ = '\033';
 105                 *q++ = '(';
 106                 *q++ = 'B';
 107                 *q++ = (*p);
 108                 c1 = -1;
 109                 inside_esc_seq = 0;
 110             }
 111         }
 112         else{
 113             if(*p & 0x80){
 114                 *q++ = '\033';
 115                 *q++ = '$';
 116                 *q++ = 'B';
 117                 *q++ = (*p & 0x7f);
 118                 inside_esc_seq = 1;
 119             }
 120             else{
 121                 *q++ = (*p);
 122             }
 123         }
 124     }
 125
 126     if(inside_esc_seq){
 127         *q++ = '\033';
 128         *q++ = '(';
 129         *q++ = 'B';
 130     }
 131
 132     *q = '\0';
 133
 134     return(rv);
 135 }
 136
 137
 138 /*
 139  *  * * * * * * * *      RFC 1522 support routines      * * * * * * * *
 140  *
 *   RFC 1522 support is *very* loosely based on code contributed
 *   by Lars-Erik Johansson <lej@cdg.chalmers.se>.  Thanks to Lars-Erik,
 *   and apologies for taking such liberties with his code.
 144  */
 145
 146 #define RFC1522_INIT    "=?"
 147 #define RFC1522_INIT_L  2
 148 #define RFC1522_TERM    "?="
 149 #define RFC1522_TERM_L  2
 150 #define RFC1522_DLIM    "?"
 151 #define RFC1522_DLIM_L  1
 152 #define RFC1522_MAXW    75      /* RFC's say 75, but no senders seem to care*/
 153 #define ESPECIALS       "()<>@,;:\"/[]?.="
 154 #define RFC1522_OVERHEAD(S)     (RFC1522_INIT_L + RFC1522_TERM_L +      \
 155                                  (2 * RFC1522_DLIM_L) + strlen(S) + 1);
 156 #define RFC1522_ENC_CHAR(C)     (((C) & 0x80) || !rfc1522_valtok(C)     \
 157                                  || (C) == '_' )
 158
 159 /*
 160  * rfc1522_decode_to_utf8 - try to decode the given source string ala RFC 2047
 161  *                  (obsoleted RFC 1522) into the given destination buffer,
 162  *                  encoded in UTF-8.
 163  *
 164  * How large should d be? The decoded string of octets will fit in
 165  * the same size string as the source string. However, because we're
 166  * translating that into UTF-8 the result may expand. Currently the
 167  * Thai character set has single octet characters which expand to
 168  * three octets in UTF-8. So it would be safe to use 3 * strlen(s)
 169  * for the size of d. One can imagine a currently non-existent
 170  * character set that expanded to 4 octets instead, so use 4 to be
 171  * super safe.
 172  *
 173  * Returns: pointer to either the destination buffer containing the
 174  *          decoded text, or a pointer to the source buffer if there was
 175  *          no valid 'encoded-word' found during scanning.
 176  */
 177 unsigned char *
 178 rfc1522_decode_to_utf8(unsigned char *d, size_t len, char *s)
 179 {
 180     unsigned char *rv = NULL, *p;
 181     char          *start = s, *sw, *enc, *txt, *ew, **q, *lang;
 182     char          *cset;
 183     unsigned long  l;
 184     int            i;
 185
 186     *d = '\0';                                  /* init destination */
 187
 188     while(s && (sw = strstr(s, RFC1522_INIT))){
 189         if(!rv)  /* there's something to do, init it */
 190           rv = d;
 191         /* validate the rest of the encoded-word */
 192         if(rfc1522_valid(sw, &cset, &enc, &txt, &ew)){
 193             /*
 194              * We may have been putting off copying the first part of the
 195              * source while waiting to see if we have to copy at all.
 196              */
 197             if(rv == d && s != start){
 198                 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) start,
 199                                                sw - start, NULL);
 200                 s = sw;
 201             }
 202
 203             /* copy everything between s and sw to destination */
 204             for(i = 0; &s[i] < sw; i++)
 205               if(!isspace((unsigned char)s[i])){ /* if some non-whitespace */
 206                   while(s < sw && d-rv<len-1)
 207                     *d++ = (unsigned char) *s++;
 208
 209                   break;
 210               }
 211
 212             enc[-1] = txt[-1] = ew[0] = '\0';   /* tie off token strings */
 213
 214             if((lang = strchr(cset, '*')) != NULL)
 215               *lang++ = '\0';
 216
 217             /* based on encoding, write the encoded text to output buffer */
 218             switch(*enc){
 219               case 'Q' :                        /* 'Q' encoding */
 220               case 'q' :
 221                 /* special hocus-pocus to deal with '_' exception, too bad */
 222                 for(l = 0L, i = 0; txt[l]; l++)
 223                   if(txt[l] == '_')
 224                     i++;
 225
 226                 if(i){
 227                     q = (char **) fs_get((i + 1) * sizeof(char *));
 228                     for(l = 0L, i = 0; txt[l]; l++)
 229                       if(txt[l] == '_'){
 230                           q[i++] = &txt[l];
 231                           txt[l] = SPACE;
 232                       }
 233
 234                     q[i] = NULL;
 235                 }
 236                 else
 237                   q = NULL;
 238
 239                 if((p = rfc822_qprint((unsigned char *)txt, strlen(txt), &l)) != NULL){
 240                     rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
 241                     fs_give((void **)&p);       /* free encoded buf */
 242                 }
 243                 else{
 244                     if(q)
 245                       fs_give((void **) &q);
 246
 247                     goto bogus;
 248                 }
 249
 250                 if(q){                          /* restore underscores */
 251                     for(i = 0; q[i]; i++)
 252                       *(q[i]) = '_';
 253
 254                     fs_give((void **)&q);
 255                 }
 256
 257                 break;
 258
 259               case 'B' :                        /* 'B' encoding */
 260               case 'b' :
 261                 if((p = rfc822_base64((unsigned char *) txt, strlen(txt), &l)) != NULL){
 262                     rfc1522_copy_and_transliterate(rv, &d, len, p, l, cset);
 263                     fs_give((void **)&p);       /* free encoded buf */
 264                 }
 265                 else
 266                   goto bogus;
 267
 268                 break;
 269
 270               default:
 271                 rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) txt,
 272                                                strlen(txt), NULL);
 273                 dprint((1, "RFC1522_decode: Unknown ENCODING: %s\n",
 274                        enc ? enc : "?"));
 275                 break;
 276             }
 277
 278             /* restore trompled source string */
 279             enc[-1] = txt[-1] = '?';
 280             ew[0]   = RFC1522_TERM[0];
 281
 282             /* advance s to start of text after encoded-word */
 283             s = ew + RFC1522_TERM_L;
 284
 285             if(lang)
 286               lang[-1] = '*';
 287         }
 288         else{
 289             /*
 290              * Found intro, but bogus data followed, treat it as normal text.
 291              */
 292             l = (sw - s) + RFC1522_INIT_L;
 293             rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, l, NULL);
 294             for(; isspace((unsigned char) *(s+l)) && d-rv<len-1;l++)
 295                 *d++ = *(s+l);  /* copy any trailing space */
 296             rv[len-1] = '\0';
 297             *d = '\0';
 298             s += l;
 299         }
 300     }
 301
 302     if(rv){
 303         if(s && *s){                            /* copy remaining text */
 304             rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
 305             rv[len-1] = '\0';
 306         }
 307     }
 308     else if(s){
 309         rv = d;
 310         rfc1522_copy_and_transliterate(rv, &d, len, (unsigned char *) s, strlen(s), NULL);
 311         rv[len-1] = '\0';
 312     }
 313
 314     return(rv ? rv : (unsigned char *) start);
 315
 316   bogus:
 317     dprint((1, "RFC1522_decode: BOGUS INPUT: -->%s<--\n",
 318            start ? start : "?"));
 319     return((unsigned char *) start);
 320 }
 321
 322
 323 /*
 324  * rfc1522_token - scan the given source line up to the end_str making
 325  *                 sure all subsequent chars are "valid" leaving endp
 326  *                 a the start of the end_str.
 327  * Returns: TRUE if we got a valid token, FALSE otherwise
 328  */
 329 int
 330 rfc1522_token(char *s, int (*valid) (int), char *end_str, char **endp)
 331 {
 332     while(*s){
 333         if((char) *s == *end_str                /* test for matching end_str */
 334            && ((end_str[1])
 335                 ? !strncmp((char *)s + 1, end_str + 1, strlen(end_str + 1))
 336                 : 1)){
 337             *endp = s;
 338             return(TRUE);
 339         }
 340
 341         if(!(*valid)(*s++))                     /* test for valid char */
 342           break;
 343     }
 344
 345     return(FALSE);
 346 }
 347
 348
 349 /*
 350  * rfc1522_valtok - test for valid character in the RFC 1522 encoded
 351  *                  word's charset and encoding fields.
 352  */
 353 int
 354 rfc1522_valtok(int c)
 355 {
 356     return(!(c == SPACE || iscntrl(c & 0x7f) || strindex(ESPECIALS, c)));
 357 }
 358
 359
 360 /*
 361  * rfc1522_valenc - test for valid character in the RFC 1522 encoded
 362  *                  word's encoded-text field.
 363  */
 364 int
 365 rfc1522_valenc(int c)
 366 {
 367     return(!(c == '?' || c == SPACE) && isprint((unsigned char)c));
 368 }
 369
 370
 371 /*
 372  * rfc1522_valid - validate the given string as to it's rfc1522-ness
 373  */
 374 int
 375 rfc1522_valid(char *s, char **charset, char **enc, char **txt, char **endp)
 376 {
 377     char *c, *e, *t, *p;
 378     int   rv;
 379
 380     rv = rfc1522_token(c = s+RFC1522_INIT_L, rfc1522_valtok, RFC1522_DLIM, &e)
 381            && rfc1522_token(++e, rfc1522_valtok, RFC1522_DLIM, &t)
 382            && rfc1522_token(++t, rfc1522_valenc, RFC1522_TERM, &p);
 383
 384     if(charset)
 385       *charset = c;
 386
 387     if(enc)
 388       *enc = e;
 389
 390     if(txt)
 391       *txt = t;
 392
 393     if(endp)
 394       *endp = p;
 395
 396     return(rv);
 397 }
 398
 399
 400 /*
 401  * rfc1522_copy_and_transliterate - copy given buf to destination buffer
 402  *                                  as UTF-8 characters
 403  */
 404 void
 405 rfc1522_copy_and_transliterate(unsigned char  *rv,
 406                                unsigned char **d,
 407                                size_t          len,
 408                                unsigned char  *s,
 409                                unsigned long   l,
 410                                char           *cset)
 411 {
 412     unsigned long i;
 413     SIZEDTEXT     src, xsrc;
 414
 415     src.data = s;
 416     src.size = l;
 417     memset(&xsrc, 0, sizeof(SIZEDTEXT));
 418
 419     /* transliterate decoded segment to utf-8 */
 420     if(cset){
 421         if(strucmp((char *) cset, "us-ascii")
 422            && strucmp((char *) cset, "utf-8")){
 423             if(utf8_charset(cset)){
 424                 if(!utf8_text(&src, cset, &xsrc, 0L)){
 425                     /* should not happen */
 426                     alpine_panic("c-client failed to transliterate recognized characterset");
 427                 }
 428             }
 429             else{
 430                 /* non-xlatable charset */
 431                 for(i = 0; i < l; i++)
 432                   if(src.data[i] & 0x80){
 433                       xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
 434                       xsrc.size = l;
 435                       for(i = 0; i < l; i++)
 436                         xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
 437
 438                       break;
 439                   }
 440             }
 441         }
 442     }
 443     else{
 444         const CHARSET *cs;
 445
 446         src.data = s;
 447         src.size = strlen((char *) s);
 448
 449         if((cs = utf8_infercharset(&src))){
 450             if(!(cs->type == CT_ASCII || cs->type == CT_UTF8)){
 451                 if(!utf8_text_cs(&src, cs, &xsrc, 0L, 0L)){
 452                     /* should not happen */
 453                     alpine_panic("c-client failed to transliterate recognized characterset");
 454                 }
 455             }
 456         }
 457         else if((cset=ps_global->VAR_UNK_CHAR_SET)
 458                 && strucmp((char *) cset, "us-ascii")
 459                 && strucmp((char *) cset, "utf-8")
 460                 && utf8_charset(cset)){
 461                 if(!utf8_text(&src, cset, &xsrc, 0L)){
 462                     /* should not happen */
 463                     alpine_panic("c-client failed to transliterate recognized character set");
 464                 }
 465         }
 466         else{
 467             /* unknown bytes - mask off high bit chars */
 468             for(i = 0; i < l; i++)
 469               if(src.data[i] & 0x80){
 470                   xsrc.data = (unsigned char *) fs_get((l+1) * sizeof(unsigned char));
 471                   xsrc.size = l;
 472                   for(i = 0; i < l; i++)
 473                     xsrc.data[i] = (src.data[i] & 0x80) ? '?' : src.data[i];
 474
 475                   break;
 476               }
 477         }
 478     }
 479
 480     if(xsrc.data){
 481         s = xsrc.data;
 482         l = xsrc.size;
 483     }
 484
 485     i = MIN(l,len-1-((*d)-rv));
 486     strncpy((char *) (*d), (char *) s, i);
 487     (*d)[i] = '\0';
 488     *d += l;                    /* advance dest ptr to EOL */
 489     if((*d)-rv > len-1)
 490       *d = rv+len-1;
 491
 492     if(xsrc.data && src.data != xsrc.data)
 493       fs_give((void **) &xsrc.data);
 494 }
 495
 496
 497
 498 /*
 499  * rfc1522_encode - encode the given source string ala RFC 1522,
 500  *                  IF NECESSARY, into the given destination buffer.
 501  *                  Don't bother copying if it turns out encoding
 502  *                  isn't necessary.
 503  *
 504  * Returns: pointer to either the destination buffer containing the
 505  *          encoded text, or a pointer to the source buffer if we didn't
 506  *          have to encode anything.
 507  */
 508 char *
 509 rfc1522_encode(char *d, size_t dlen, unsigned char *s, char *charset)
 510 {
 511     unsigned char *p, *q;
 512     int            n;
 513
 514     if(!s)
 515       return((char *) s);
 516
 517     if(!charset)
 518       charset = UNKNOWN_CHARSET;
 519
 520     /* look for a reason to encode */
 521     for(p = s, n = 0; *p; p++)
 522       if((*p) & 0x80){
 523           n++;
 524       }
 525       else if(*p == RFC1522_INIT[0]
 526               && !strncmp((char *) p, RFC1522_INIT, RFC1522_INIT_L)){
 527           if(rfc1522_valid((char *) p, NULL, NULL, NULL, (char **) &q))
 528             p = q + RFC1522_TERM_L - 1;         /* advance past encoded gunk */
 529       }
 530       else if(*p == ESCAPE && match_escapes((char *)(p+1))){
 531           n++;
 532       }
 533
 534     if(n){                                      /* found, encoding to do */
 535         char *rv  = d, *t,
 536               enc = (n > (2 * (p - s)) / 3) ? 'B' : 'Q';
 537
 538         while(*s){
 539             if(d-rv < dlen-1-(RFC1522_INIT_L+2*RFC1522_DLIM_L+1)){
 540                 sstrncpy(&d, RFC1522_INIT, dlen-(d-rv));        /* insert intro header, */
 541                 sstrncpy(&d, charset, dlen-(d-rv));             /* character set tag, */
 542                 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));        /* and encoding flavor */
 543                 if(dlen-(d-rv) > 0)
 544                   *d++ = enc;
 545
 546                 sstrncpy(&d, RFC1522_DLIM, dlen-(d-rv));
 547             }
 548
 549             /*
 550              * feed lines to encoder such that they're guaranteed
 551              * less than RFC1522_MAXW.
 552              */
 553             p = rfc1522_encoded_word(s, enc, charset);
 554             if(enc == 'B')                      /* insert encoded data */
 555               sstrncpy(&d, t = rfc1522_binary(s, p - s), dlen-1-(d-rv));
 556             else                                /* 'Q' encoding */
 557               sstrncpy(&d, t = rfc1522_8bit(s, p - s), dlen-1-(d-rv));
 558
 559             sstrncpy(&d, RFC1522_TERM, dlen-1-(d-rv));  /* insert terminator */
 560             fs_give((void **) &t);
 561             if(*p)                              /* more src string follows */
 562               sstrncpy(&d, "\015\012 ", dlen-1-(d-rv)); /* insert cont. line */
 563
 564             s = p;                              /* advance s */
 565         }
 566
 567         rv[dlen-1] = '\0';
 568         return(rv);
 569     }
 570     else
 571       return((char *) s);                       /* no work for us here */
 572 }
 573
 574
 575
 576 /*
 577  * rfc1522_encoded_word -- cut given string into max length encoded word
 578  *
 579  * Return: pointer into 's' such that the encoded 's' is no greater
 580  *         than RFC1522_MAXW
 581  *
 582  *  NOTE: this line break code is NOT cognizant of any SI/SO
 583  *  charset requirements nor similar strategies using escape
 584  *  codes.  Hopefully this will matter little and such
 585  *  representation strategies don't also include 8bit chars.
 586  */
 587 unsigned char *
 588 rfc1522_encoded_word(unsigned char *s, int enc, char *charset)
 589 {
 590     int goal = RFC1522_MAXW - RFC1522_OVERHEAD(charset);
 591
 592     if(enc == 'B')                      /* base64 encode */
 593       for(goal = ((goal / 4) * 3) - 2; goal && *s; goal--, s++)
 594         ;
 595     else                                /* special 'Q' encoding */
 596       for(; goal && *s; s++)
 597         if((goal -= RFC1522_ENC_CHAR(*s) ? 3 : 1) < 0)
 598           break;
 599
 600     return(s);
 601 }
 602
 603
 604
 605 /*
 606  * rfc1522_8bit -- apply RFC 1522 'Q' encoding to the given 8bit buffer
 607  *
 608  * Return: alloc'd buffer containing encoded string
 609  */
 610 char *
 611 rfc1522_8bit(void *src, int slen)
 612 {
 613     char *ret = (char *) fs_get ((size_t) (3*slen + 2));
 614     char *d = ret;
 615     unsigned char c;
 616     unsigned char *s = (unsigned char *) src;
 617
 618     while (slen--) {                            /* for each character */
 619         if (((c = *s++) == '\015') && (*s == '\012') && slen) {
 620             *d++ = '\015';                      /* true line break */
 621             *d++ = *s++;
 622             slen--;
 623         }
 624         else if(c == SPACE){                    /* special encoding case */
 625             *d++ = '_';
 626         }
 627         else if(RFC1522_ENC_CHAR(c)){
 628             *d++ = '=';                         /* quote character */
 629             C2XPAIR(c, d);
 630         }
 631         else
 632           *d++ = (char) c;                      /* ordinary character */
 633     }
 634
 635     *d = '\0';                                  /* tie off destination */
 636     return(ret);
 637 }
 638
 639
 640 /*
 641  * rfc1522_binary -- apply RFC 1522 'B' encoding to the given 8bit buffer
 642  *
 643  * Return: alloc'd buffer containing encoded string
 644  */
 645 char *
 646 rfc1522_binary (void *src, int srcl)
 647 {
 648     static char *v =
 649             "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 650     unsigned char *s = (unsigned char *) src;
 651     char *ret, *d;
 652
 653     d = ret = (char *) fs_get ((size_t) ((((srcl + 2) / 3) * 4) + 1));
 654     for (; srcl; s += 3) {      /* process tuplets */
 655                                 /* byte 1: high 6 bits (1) */
 656         *d++ = v[s[0] >> 2];
 657                                 /* byte 2: low 2 bits (1), high 4 bits (2) */
 658         *d++ = v[((s[0] << 4) + (--srcl ? (s[1] >> 4) : 0)) & 0x3f];
 659                                 /* byte 3: low 4 bits (2), high 2 bits (3) */
 660         *d++ = srcl ? v[((s[1] << 2) + (--srcl ? (s[2] >> 6) :0)) & 0x3f] :'=';
 661                                 /* byte 4: low 6 bits (3) */
 662         *d++ = srcl ? v[s[2] & 0x3f] : '=';
 663         if(srcl)
 664           srcl--;               /* count third character if processed */
 665     }
 666
 667     *d = '\0';                  /* tie off string */
 668     return(ret);                /* return the resulting string */
 669 }
 670
 671
 672 /*
 673  * Checks if charset conversion is possible and which quality could be achieved
 674  *
 675  * args: from_cs -- charset to convert from
 676  *       to_cs   -- charset to convert to
 677  *
 678  * Results:
 679  * CONV_TABLE->table   -- conversion table, NULL if conversion not needed
 680  *                        or not supported
 681  * CONV_TABLE->quality -- conversion quality (conversion not supported, not
 682  *                        needed, loses special chars, or loses letters
 683  *
 684  * The other entries of CONV_TABLE are used inside this function only
 685  * and may not be used outside unless this documentation is updated.
 686  */
 687 CONV_TABLE *
 688 conversion_table(char *from_cs, char *to_cs)
 689 {
 690     int               i, j;
 691     unsigned char    *p = NULL;
 692     unsigned short   *fromtab, *totab;
 693     CONV_TABLE       *ct = NULL;
 694     const CHARSET    *from, *to;
 695     static CONV_TABLE null_tab;
 696
 697     if(!(from_cs && *from_cs && to_cs && *to_cs) || !strucmp(from_cs, to_cs)){
 698         memset(&null_tab, 0, sizeof(null_tab));
 699         null_tab.quality = CV_NO_TRANSLATE_NEEDED;
 700         return(&null_tab);
 701     }
 702
 703     /*
 704      * First check to see if we are already set up for this pair of charsets.
 705      */
 706     if((ct = ps_global->conv_table) != NULL
 707        && ct->from_charset && ct->to_charset
 708        && !strucmp(ct->from_charset, from_cs)
 709        && !strucmp(ct->to_charset, to_cs))
 710       return(ct);
 711
 712     /*
 713      * No such luck. Get rid of the cache of the previous translation table
 714      * and build a new one.
 715      */
 716     if(ct){
 717         if(ct->table && (ct->convert != gf_convert_utf8_charset))
 718           fs_give((void **) &ct->table);
 719
 720         if(ct->from_charset)
 721           fs_give((void **) &ct->from_charset);
 722
 723         if(ct->to_charset)
 724           fs_give((void **) &ct->to_charset);
 725     }
 726     else
 727       ct = ps_global->conv_table = (CONV_TABLE *) fs_get(sizeof(*ct));
 728
 729     memset(ct, 0, sizeof(*ct));
 730
 731     ct->from_charset = cpystr(from_cs);
 732     ct->to_charset   = cpystr(to_cs);
 733     ct->quality = CV_NO_TRANSLATE_POSSIBLE;
 734
 735     /*
 736      * Check to see if a translation is feasible.
 737      */
 738     from = utf8_charset(from_cs);
 739     to =   utf8_charset(to_cs);
 740
 741     if(from && to){             /* if both charsets found */
 742                                 /* no mapping if same or from is ASCII */
 743         if((from->type == to->type && from->tab == to->tab)
 744            || (from->type == CT_ASCII))
 745             ct->quality = CV_NO_TRANSLATE_NEEDED;
 746         else switch(from->type){
 747         case CT_1BYTE0:         /* 1 byte no table */
 748         case CT_1BYTE:          /* 1 byte ASCII + table 0x80-0xff */
 749         case CT_1BYTE8:         /* 1 byte table 0x00 - 0xff */
 750             switch(to->type){
 751             case CT_1BYTE0:     /* 1 byte no table */
 752             case CT_1BYTE:      /* 1 byte ASCII + table 0x80-0xff */
 753             case CT_1BYTE8:     /* 1 byte table 0x00 - 0xff */
 754                 ct->quality = (from->script & to->script) ?
 755                   CV_LOSES_SOME_LETTERS : CV_LOSES_SPECIAL_CHARS;
 756                 break;
 757             }
 758             break;
 759         case CT_UTF8:           /* variable UTF-8 encoded Unicode no table */
 760         /* If source is UTF-8, see if destination charset has an 8 or 16 bit
 761          * coded character set that we can translate to.  By special
 762          * dispensation, kludge ISO-2022-JP to EUC or Shift-JIS, but don't
 763          * try to do any other ISO 2022 charsets or UTF-7.
 764          */
 765             switch (to->type){
 766             case CT_SJIS:       /* 2 byte Shift-JIS */
 767                                 /* only win if can get EUC-JP chartab */
 768                 if(utf8_charset("EUC-JP"))
 769                     ct->quality = CV_LOSES_SOME_LETTERS;
 770                 break;
 771             case CT_ASCII:      /* 7-bit ASCII no table */
 772             case CT_1BYTE0:     /* 1 byte no table */
 773             case CT_1BYTE:      /* 1 byte ASCII + table 0x80-0xff */
 774             case CT_1BYTE8:     /* 1 byte table 0x00 - 0xff */
 775             case CT_EUC:        /* 2 byte ASCII + utf8_eucparam base/CS2/CS3 */
 776             case CT_DBYTE:      /* 2 byte ASCII + utf8_eucparam */
 777             case CT_DBYTE2:     /* 2 byte ASCII + utf8_eucparam plane1/2 */
 778                 ct->quality = CV_LOSES_SOME_LETTERS;
 779                 break;
 780             }
 781             break;
 782         }
 783
 784         switch (ct->quality) {  /* need to map? */
 785         case CV_NO_TRANSLATE_POSSIBLE:
 786         case CV_NO_TRANSLATE_NEEDED:
 787           break;                /* no mapping needed */
 788         default:                /* do mapping */
 789             switch (from->type) {
 790             case CT_UTF8:       /* UTF-8 to legacy character set */
 791               if((ct->table = utf8_rmap (to_cs)) != NULL)
 792                 ct->convert = gf_convert_utf8_charset;
 793               break;
 794
 795             case CT_1BYTE0:     /* ISO 8859-1 */
 796             case CT_1BYTE:      /* low part ASCII, high part other */
 797             case CT_1BYTE8:     /* low part has some non-ASCII */
 798             /*
 799              * The fromtab and totab tables are mappings from the 128 character
 800              * positions 128-255 to their Unicode values (so unsigned shorts).
 801              * The table we are creating is such that if
 802              *
 803              *    from_char_value -> unicode_value
 804              *    to_char_value   -> same_unicode_value
 805              *
 806              *  then we want to map from_char_value -> to_char_value
 807              *
 808              * To simplify conversions we create the whole 256 element array,
 809              * with the first 128 positions just the identity. If there is no
 810              * conversion for a particular from_char_value (that is, no
 811              * to_char_value maps to the same unicode character) then we put
 812              *  '?' in that character. We may want to output blob on the PC,
 813              * but don't so far.
 814              *
 815              * If fromtab or totab are NULL, that means the mapping is simply
 816              * the identity mapping. Since that is still useful to us, we
 817              * create it on the fly.
 818              */
 819                 fromtab = (unsigned short *) from->tab;
 820                 totab   = (unsigned short *) to->tab;
 821
 822                 ct->convert = gf_convert_8bit_charset;
 823                 p = ct->table = (unsigned char *)
 824                   fs_get(256 * sizeof(unsigned char));
 825                 for(i = 0; i < 256; i++){
 826                     unsigned int fc;
 827                     p[i] = '?';
 828                     switch(from->type){ /* get "from" UCS-2 codepoint */
 829                     case CT_1BYTE0:     /* ISO 8859-1 */
 830                         fc = i;
 831                         break;
 832                     case CT_1BYTE:      /* low part ASCII, high part other */
 833                         fc = (i < 128) ? i : fromtab[i-128];
 834                         break;
 835                     case CT_1BYTE8:     /* low part has some non-ASCII */
 836                         fc = fromtab[i];
 837                         break;
 838                     }
 839                     switch(to->type){ /* match against "to" UCS-2 codepoint */
 840                     case CT_1BYTE0: /* identity match for ISO 8859-1*/
 841                         if(fc < 256)
 842                           p[i] = fc;
 843                         break;
 844                     case CT_1BYTE: /* ASCII is identity, search high part */
 845                         if(fc < 128) p[i] = fc;
 846                         else for(j = 0; j < 128; j++){
 847                             if(fc == totab[j]){
 848                                 p[i] = 128 + j;
 849                                 break;
 850                             }
 851                         }
 852                         break;
 853                     case CT_1BYTE8: /* search all codepoints */
 854                         for(j = 0; j < 256; j++){
 855                             if(fc == totab[j]){
 856                               p[i] = j;
 857                               break;
 858                             }
 859                         }
 860                         break;
 861                     }
 862                 }
 863                 break;
 864             }
 865         }
 866     }
 867
 868     return(ct);
 869 }
 870
 871
 872 /*
 873  * Replace personal names in list of addresses with
 874  * decoded personal names in UTF-8.
 875  * Assumes we can free and reallocate the name.
 876  */
 877 void
 878 decode_addr_names_to_utf8(struct mail_address *a)
 879 {
 880     for(; a; a = a->next)
 881       if(a->personal)
 882         convert_possibly_encoded_str_to_utf8(&a->personal);
 883 }
 884
 885
 886 /*
 887  * Strp is a pointer to an allocated string.
 888  * This routine will convert the string to UTF-8, possibly
 889  * freeing and re-allocating it.
 890  * The source string may or may not have RFC1522 encoding
 891  * which will be undone using rfc1522_decode.
 892  * The string will have been converted on return.
 893  */
 894 void
 895 convert_possibly_encoded_str_to_utf8(char **strp)
 896 {
 897     size_t     len, lensrc, lenresult;
 898     char      *bufp, *decoded;
 899
 900     if(!strp || !*strp || **strp == '\0')
 901       return;
 902
 903     len = 4 * strlen(*strp) + 1;
 904     bufp = (char *) fs_get(len);
 905
 906     decoded = (char *) rfc1522_decode_to_utf8((unsigned char *) bufp, len, *strp);
 907     if(decoded != (*strp)){     /* unchanged */
 908         if((lensrc=strlen(*strp)) >= (lenresult=strlen(decoded))){
 909             strncpy(*strp, decoded, lensrc);
 910             (*strp)[lensrc] = '\0';
 911         }
 912         else{
 913             fs_give((void **) strp);
 914             if(decoded == bufp){        /* this will be true */
 915                 fs_resize((void **) &bufp, lenresult+1);
 916                 *strp = bufp;
 917                 bufp = NULL;
 918             }
 919             else{                       /* this is unreachable */
 920                 *strp = cpystr(decoded);
 921             }
 922         }
 923     }
 924     /* else, already UTF-8 */
 925
 926     if(bufp)
 927       fs_give((void **) &bufp);
 928 }