tools/mailinfo.c

   1 /*
   2  * Another stupid program, this one parsing the headers of an
   3  * email to figure out authorship and subject
   4  */
   5 #define _GNU_SOURCE
   6 #include <stdio.h>
   7 #include <stdlib.h>
   8 #include <string.h>
   9 #include <ctype.h>
  10 #include <iconv.h>
  11
  12 static FILE *cmitmsg, *patchfile;
  13
  14 static int keep_subject = 0;
  15 static int metainfo_utf8 = 0;
  16 static char line[1000];
  17 static char date[1000];
  18 static char name[1000];
  19 static char email[1000];
  20 static char subject[1000];
  21
  22 static enum  {
  23         TE_DONTCARE, TE_QP, TE_BASE64,
  24 } transfer_encoding;
  25 static char charset[256];
  26
  27 static char multipart_boundary[1000];
  28 static int multipart_boundary_len;
  29 static int patch_lines = 0;
  30
  31 static char *sanity_check(char *name, char *email)
  32 {
  33         int len = strlen(name);
  34         if (len < 3 || len > 60)
  35                 return email;
  36         if (strchr(name, '@') || strchr(name, '<') || strchr(name, '>'))
  37                 return email;
  38         return name;
  39 }
  40
  41 static int handle_from(char *line)
  42 {
  43         char *at = strchr(line, '@');
  44         char *dst;
  45
  46         if (!at)
  47                 return 0;
  48
  49         /*
  50          * If we already have one email, don't take any confusing lines
  51          */
  52         if (*email && strchr(at+1, '@'))
  53                 return 0;
  54
  55         /* Pick up the string around '@', possibly delimited with <>
  56          * pair; that is the email part.  White them out while copying.
  57          */
  58         while (at > line) {
  59                 char c = at[-1];
  60                 if (isspace(c))
  61                         break;
  62                 if (c == '<') {
  63                         at[-1] = ' ';
  64                         break;
  65                 }
  66                 at--;
  67         }
  68         dst = email;
  69         for (;;) {
  70                 unsigned char c = *at;
  71                 if (!c || c == '>' || isspace(c)) {
  72                         if (c == '>')
  73                                 *at = ' ';
  74                         break;
  75                 }
  76                 *at++ = ' ';
  77                 *dst++ = c;
  78         }
  79         *dst++ = 0;
  80
  81         /* The remainder is name.  It could be "John Doe <john.doe@xz>"
  82          * or "john.doe@xz (John Doe)", but we have whited out the
  83          * email part, so trim from both ends, possibly removing
  84          * the () pair at the end.
  85          */
  86         at = line + strlen(line);
  87         while (at > line) {
  88                 unsigned char c = *--at;
  89                 if (!isspace(c)) {
  90                         at[(c == ')') ? 0 : 1] = 0;
  91                         break;
  92                 }
  93         }
  94
  95         at = line;
  96         for (;;) {
  97                 unsigned char c = *at;
  98                 if (!c || !isspace(c)) {
  99                         if (c == '(')
 100                                 at++;
 101                         break;
 102                 }
 103                 at++;
 104         }
 105         at = sanity_check(at, email);
 106         strcpy(name, at);
 107         return 1;
 108 }
 109
 110 static int handle_date(char *line)
 111 {
 112         strcpy(date, line);
 113         return 0;
 114 }
 115
 116 static int handle_subject(char *line)
 117 {
 118         strcpy(subject, line);
 119         return 0;
 120 }
 121
/* NOTE NOTE NOTE.  We do not claim we do full MIME.  We just attempt
 * to have enough heuristics to grok MIME encoded patches often found
 * on our mailing lists.  For example, we do not handle nested
 * multipart, and the in-body From:/Date:/Subject: lines are matched
 * case sensitively.
 */
 127
 128 static int slurp_attr(const char *line, const char *name, char *attr)
 129 {
 130         char *ends, *ap = strcasestr(line, name);
 131         size_t sz;
 132
 133         if (!ap) {
 134                 *attr = 0;
 135                 return 0;
 136         }
 137         ap += strlen(name);
 138         if (*ap == '"') {
 139                 ap++;
 140                 ends = "\"";
 141         }
 142         else
 143                 ends = "; \t";
 144         sz = strcspn(ap, ends);
 145         memcpy(attr, ap, sz);
 146         attr[sz] = 0;
 147         return 1;
 148 }
 149
 150 static int handle_subcontent_type(char *line)
 151 {
 152         /* We do not want to mess with boundary.  Note that we do not
 153          * handle nested multipart.
 154          */
 155         slurp_attr(line, "charset=", charset);
 156         if (*charset) {
 157                 int i, c;
 158                 for (i = 0; (c = charset[i]) != 0; i++)
 159                         charset[i] = tolower(c);
 160         }
 161         return 0;
 162 }
 163
 164 static int handle_content_type(char *line)
 165 {
 166         *multipart_boundary = 0;
 167         if (slurp_attr(line, "boundary=", multipart_boundary + 2)) {
 168                 memcpy(multipart_boundary, "--", 2);
 169                 multipart_boundary_len = strlen(multipart_boundary);
 170         }
 171         slurp_attr(line, "charset=", charset);
 172         return 0;
 173 }
 174
 175 static int handle_content_transfer_encoding(char *line)
 176 {
 177         if (strcasestr(line, "base64"))
 178                 transfer_encoding = TE_BASE64;
 179         else if (strcasestr(line, "quoted-printable"))
 180                 transfer_encoding = TE_QP;
 181         else
 182                 transfer_encoding = TE_DONTCARE;
 183         return 0;
 184 }
 185
 186 static int is_multipart_boundary(const char *line)
 187 {
 188         return (!memcmp(line, multipart_boundary, multipart_boundary_len));
 189 }
 190
 191 static int eatspace(char *line)
 192 {
 193         int len = strlen(line);
 194         while (len > 0 && isspace(line[len-1]))
 195                 line[--len] = 0;
 196         return len;
 197 }
 198
 199 #define SEEN_FROM 01
 200 #define SEEN_DATE 02
 201 #define SEEN_SUBJECT 04
 202
 203 /* First lines of body can have From:, Date:, and Subject: */
 204 static int handle_inbody_header(int *seen, char *line)
 205 {
 206         if (!memcmp("From:", line, 5) && isspace(line[5])) {
 207                 if (!(*seen & SEEN_FROM) && handle_from(line+6)) {
 208                         *seen |= SEEN_FROM;
 209                         return 1;
 210                 }
 211         }
 212         if (!memcmp("Date:", line, 5) && isspace(line[5])) {
 213                 if (!(*seen & SEEN_DATE)) {
 214                         handle_date(line+6);
 215                         *seen |= SEEN_DATE;
 216                         return 1;
 217                 }
 218         }
 219         if (!memcmp("Subject:", line, 8) && isspace(line[8])) {
 220                 if (!(*seen & SEEN_SUBJECT)) {
 221                         handle_subject(line+9);
 222                         *seen |= SEEN_SUBJECT;
 223                         return 1;
 224                 }
 225         }
 226         if (!memcmp("[PATCH]", line, 7) && isspace(line[7])) {
 227                 if (!(*seen & SEEN_SUBJECT)) {
 228                         handle_subject(line);
 229                         *seen |= SEEN_SUBJECT;
 230                         return 1;
 231                 }
 232         }
 233         return 0;
 234 }
 235
 236 static char *cleanup_subject(char *subject)
 237 {
 238         if (keep_subject)
 239                 return subject;
 240         for (;;) {
 241                 char *p;
 242                 int len, remove;
 243                 switch (*subject) {
 244                 case 'r': case 'R':
 245                         if (!memcmp("e:", subject+1, 2)) {
 246                                 subject +=3;
 247                                 continue;
 248                         }
 249                         break;
 250                 case ' ': case '\t': case ':':
 251                         subject++;
 252                         continue;
 253
 254                 case '[':
 255                         p = strchr(subject, ']');
 256                         if (!p) {
 257                                 subject++;
 258                                 continue;
 259                         }
 260                         len = strlen(p);
 261                         remove = p - subject;
 262                         if (remove <= len *2) {
 263                                 subject = p+1;
 264                                 continue;
 265                         }
 266                         break;
 267                 }
 268                 return subject;
 269         }
 270 }
 271
 272 static void cleanup_space(char *buf)
 273 {
 274         unsigned char c;
 275         while ((c = *buf) != 0) {
 276                 buf++;
 277                 if (isspace(c)) {
 278                         buf[-1] = ' ';
 279                         c = *buf;
 280                         while (isspace(c)) {
 281                                 int len = strlen(buf);
 282                                 memmove(buf, buf+1, len);
 283                                 c = *buf;
 284                         }
 285                 }
 286         }
 287 }
 288
 289 typedef int (*header_fn_t)(char *);
 290 struct header_def {
 291         const char *name;
 292         header_fn_t func;
 293         int namelen;
 294 };
 295
 296 static void check_header(char *line, int len, struct header_def *header)
 297 {
 298         int i;
 299
 300         if (header[0].namelen <= 0) {
 301                 for (i = 0; header[i].name; i++)
 302                         header[i].namelen = strlen(header[i].name);
 303         }
 304         for (i = 0; header[i].name; i++) {
 305                 int len = header[i].namelen;
 306                 if (!strncasecmp(line, header[i].name, len) &&
 307                     line[len] == ':' && isspace(line[len + 1])) {
 308                         header[i].func(line + len + 2);
 309                         break;
 310                 }
 311         }
 312 }
 313
 314 static void check_subheader_line(char *line, int len)
 315 {
 316         static struct header_def header[] = {
 317                 { "Content-Type", handle_subcontent_type },
 318                 { "Content-Transfer-Encoding",
 319                   handle_content_transfer_encoding },
 320                 { NULL },
 321         };
 322         check_header(line, len, header);
 323 }
 324 static void check_header_line(char *line, int len)
 325 {
 326         static struct header_def header[] = {
 327                 { "From", handle_from },
 328                 { "Date", handle_date },
 329                 { "Subject", handle_subject },
 330                 { "Content-Type", handle_content_type },
 331                 { "Content-Transfer-Encoding",
 332                   handle_content_transfer_encoding },
 333                 { NULL },
 334         };
 335         check_header(line, len, header);
 336 }
 337
 338 static int read_one_header_line(char *line, int sz, FILE *in)
 339 {
 340         int ofs = 0;
 341         while (ofs < sz) {
 342                 int peek, len;
 343                 if (fgets(line + ofs, sz - ofs, in) == NULL)
 344                         return ofs;
 345                 len = eatspace(line + ofs);
 346                 if (len == 0)
 347                         return ofs;
 348                 peek = fgetc(in); ungetc(peek, in);
 349                 if (peek == ' ' || peek == '\t') {
 350                         /* Yuck, 2822 header "folding" */
 351                         ofs += len;
 352                         continue;
 353                 }
 354                 return ofs + len;
 355         }
 356         return ofs;
 357 }
 358
 359 static unsigned hexval(int c)
 360 {
 361         if (c >= '0' && c <= '9')
 362                 return c - '0';
 363         if (c >= 'a' && c <= 'f')
 364                 return c - 'a' + 10;
 365         if (c >= 'A' && c <= 'F')
 366                 return c - 'A' + 10;
 367         return ~0;
 368 }
 369
 370 static int decode_q_segment(char *in, char *ot, char *ep)
 371 {
 372         int c;
 373         while ((c = *in++) != 0 && (in <= ep)) {
 374                 if (c == '=') {
 375                         int d = *in++;
 376                         if (d == '\n' || !d)
 377                                 break; /* drop trailing newline */
 378                         *ot++ = ((hexval(d) << 4) | hexval(*in++));
 379                 }
 380                 else
 381                         *ot++ = c;
 382         }
 383         *ot = 0;
 384         return 0;
 385 }
 386
 387 static int decode_b_segment(char *in, char *ot, char *ep)
 388 {
 389         /* Decode in..ep, possibly in-place to ot */
 390         int c, pos = 0, acc = 0;
 391
 392         while ((c = *in++) != 0 && (in <= ep)) {
 393                 if (c == '+')
 394                         c = 62;
 395                 else if (c == '/')
 396                         c = 63;
 397                 else if ('A' <= c && c <= 'Z')
 398                         c -= 'A';
 399                 else if ('a' <= c && c <= 'z')
 400                         c -= 'a' - 26;
 401                 else if ('0' <= c && c <= '9')
 402                         c -= '0' - 52;
 403                 else if (c == '=') {
 404                         /* padding is almost like (c == 0), except we do
 405                          * not output NUL resulting only from it;
 406                          * for now we just trust the data.
 407                          */
 408                         c = 0;
 409                 }
 410                 else
 411                         continue; /* garbage */
 412                 switch (pos++) {
 413                 case 0:
 414                         acc = (c << 2);
 415                         break;
 416                 case 1:
 417                         *ot++ = (acc | (c >> 4));
 418                         acc = (c & 15) << 4;
 419                         break;
 420                 case 2:
 421                         *ot++ = (acc | (c >> 2));
 422                         acc = (c & 3) << 6;
 423                         break;
 424                 case 3:
 425                         *ot++ = (acc | c);
 426                         acc = pos = 0;
 427                         break;
 428                 }
 429         }
 430         *ot = 0;
 431         return 0;
 432 }
 433
 434 static void convert_to_utf8(char *line, char *charset)
 435 {
 436         if (*charset) {
 437                 char *in, *out;
 438                 size_t insize, outsize, nrc;
 439                 char outbuf[4096]; /* cheat */
 440                 iconv_t conv = iconv_open("utf-8", charset);
 441
 442                 if (conv == (iconv_t) -1) {
 443                         fprintf(stderr, "cannot convert from %s to utf-8\n",
 444                                 charset);
 445                         *charset = 0;
 446                         return;
 447                 }
 448                 in = line;
 449                 insize = strlen(in);
 450                 out = outbuf;
 451                 outsize = sizeof(outbuf);
 452                 nrc = iconv(conv, &in, &insize, &out, &outsize);
 453                 iconv_close(conv);
 454                 if (nrc == (size_t) -1)
 455                         return;
 456                 *out = 0;
 457                 strcpy(line, outbuf);
 458         }
 459 }
 460
 461 static void decode_header_bq(char *it)
 462 {
 463         char *in, *out, *ep, *cp, *sp;
 464         char outbuf[1000];
 465
 466         in = it;
 467         out = outbuf;
 468         while ((ep = strstr(in, "=?")) != NULL) {
 469                 int sz, encoding;
 470                 char charset_q[256], piecebuf[256];
 471                 if (in != ep) {
 472                         sz = ep - in;
 473                         memcpy(out, in, sz);
 474                         out += sz;
 475                         in += sz;
 476                 }
 477                 /* E.g.
 478                  * ep : "=?iso-2022-jp?B?GyR...?= foo"
 479                  * ep : "=?ISO-8859-1?Q?Foo=FCbar?= baz"
 480                  */
 481                 ep += 2;
 482                 cp = strchr(ep, '?');
 483                 if (!cp)
 484                         return; /* no munging */
 485                 for (sp = ep; sp < cp; sp++)
 486                         charset_q[sp - ep] = tolower(*sp);
 487                 charset_q[cp - ep] = 0;
 488                 encoding = cp[1];
 489                 if (!encoding || cp[2] != '?')
 490                         return; /* no munging */
 491                 ep = strstr(cp + 3, "?=");
 492                 if (!ep)
 493                         return; /* no munging */
 494                 switch (tolower(encoding)) {
 495                 default:
 496                         return; /* no munging */
 497                 case 'b':
 498                         sz = decode_b_segment(cp + 3, piecebuf, ep);
 499                         break;
 500                 case 'q':
 501                         sz = decode_q_segment(cp + 3, piecebuf, ep);
 502                         break;
 503                 }
 504                 if (sz < 0)
 505                         return;
 506                 if (metainfo_utf8)
 507                         convert_to_utf8(piecebuf, charset_q);
 508                 strcpy(out, piecebuf);
 509                 out += strlen(out);
 510                 in = ep + 2;
 511         }
 512         strcpy(out, in);
 513         strcpy(it, outbuf);
 514 }
 515
 516 static void decode_transfer_encoding(char *line)
 517 {
 518         char *ep;
 519
 520         switch (transfer_encoding) {
 521         case TE_QP:
 522                 ep = line + strlen(line);
 523                 decode_q_segment(line, line, ep);
 524                 break;
 525         case TE_BASE64:
 526                 ep = line + strlen(line);
 527                 decode_b_segment(line, line, ep);
 528                 break;
 529         case TE_DONTCARE:
 530                 break;
 531         }
 532 }
 533
 534 static void handle_info(void)
 535 {
 536         char *sub;
 537         static int done_info = 0;
 538
 539         if (done_info)
 540                 return;
 541
 542         done_info = 1;
 543         sub = cleanup_subject(subject);
 544         cleanup_space(name);
 545         cleanup_space(date);
 546         cleanup_space(email);
 547         cleanup_space(sub);
 548
 549         /* Unwrap inline B and Q encoding, and optionally
 550          * normalize the meta information to utf8.
 551          */
 552         decode_header_bq(name);
 553         decode_header_bq(date);
 554         decode_header_bq(email);
 555         decode_header_bq(sub);
 556         printf("Author: %s\nEmail: %s\nSubject: %s\nDate: %s\n\n",
 557                name, email, sub, date);
 558 }
 559
 560 /* We are inside message body and have read line[] already.
 561  * Spit out the commit log.
 562  */
 563 static int handle_commit_msg(void)
 564 {
 565         if (!cmitmsg)
 566                 return 0;
 567         do {
 568                 if (!memcmp("diff -", line, 6) ||
 569                     !memcmp("---", line, 3) ||
 570                     !memcmp("Index: ", line, 7))
 571                         break;
 572                 if ((multipart_boundary[0] && is_multipart_boundary(line))) {
 573                         /* We come here when the first part had only
 574                          * the commit message without any patch.  We
 575                          * pretend we have not seen this line yet, and
 576                          * go back to the loop.
 577                          */
 578                         return 1;
 579                 }
 580
 581                 /* Unwrap transfer encoding and optionally
 582                  * normalize the log message to UTF-8.
 583                  */
 584                 decode_transfer_encoding(line);
 585                 if (metainfo_utf8)
 586                         convert_to_utf8(line, charset);
 587                 fputs(line, cmitmsg);
 588         } while (fgets(line, sizeof(line), stdin) != NULL);
 589         fclose(cmitmsg);
 590         cmitmsg = NULL;
 591         return 0;
 592 }
 593
 594 /* We have done the commit message and have the first
 595  * line of the patch in line[].
 596  */
 597 static void handle_patch(void)
 598 {
 599         do {
 600                 if (multipart_boundary[0] && is_multipart_boundary(line))
 601                         break;
 602                 /* Only unwrap transfer encoding but otherwise do not
 603                  * do anything.  We do *NOT* want UTF-8 conversion
 604                  * here; we are dealing with the user payload.
 605                  */
 606                 decode_transfer_encoding(line);
 607                 fputs(line, patchfile);
 608                 patch_lines++;
 609         } while (fgets(line, sizeof(line), stdin) != NULL);
 610 }
 611
 612 /* multipart boundary and transfer encoding are set up for us, and we
 613  * are at the end of the sub header.  do equivalent of handle_body up
 614  * to the next boundary without closing patchfile --- we will expect
 615  * that the first part to contain commit message and a patch, and
 616  * handle other parts as pure patches.
 617  */
 618 static int handle_multipart_one_part(void)
 619 {
 620         int seen = 0;
 621         int n = 0;
 622         int len;
 623
 624         while (fgets(line, sizeof(line), stdin) != NULL) {
 625         again:
 626                 len = eatspace(line);
 627                 n++;
 628                 if (!len)
 629                         continue;
 630                 if (is_multipart_boundary(line))
 631                         break;
 632                 if (0 <= seen && handle_inbody_header(&seen, line))
 633                         continue;
 634                 seen = -1; /* no more inbody headers */
 635                 line[len] = '\n';
 636                 handle_info();
 637                 if (handle_commit_msg())
 638                         goto again;
 639                 handle_patch();
 640                 break;
 641         }
 642         if (n == 0)
 643                 return -1;
 644         return 0;
 645 }
 646
 647 static void handle_multipart_body(void)
 648 {
 649         int part_num = 0;
 650
 651         /* Skip up to the first boundary */
 652         while (fgets(line, sizeof(line), stdin) != NULL)
 653                 if (is_multipart_boundary(line)) {
 654                         part_num = 1;
 655                         break;
 656                 }
 657         if (!part_num)
 658                 return;
 659         /* We are on boundary line.  Start slurping the subhead. */
 660         while (1) {
 661                 int len = read_one_header_line(line, sizeof(line), stdin);
 662                 if (!len) {
 663                         if (handle_multipart_one_part() < 0)
 664                                 return;
 665                 }
 666                 else
 667                         check_subheader_line(line, len);
 668         }
 669         fclose(patchfile);
 670         if (!patch_lines) {
 671                 fprintf(stderr, "No patch found\n");
 672                 exit(1);
 673         }
 674 }
 675
 676 /* Non multipart message */
 677 static void handle_body(void)
 678 {
 679         int seen = 0;
 680
 681         while (fgets(line, sizeof(line), stdin) != NULL) {
 682                 int len = eatspace(line);
 683                 if (!len)
 684                         continue;
 685                 if (0 <= seen && handle_inbody_header(&seen, line))
 686                         continue;
 687                 seen = -1; /* no more inbody headers */
 688                 line[len] = '\n';
 689                 handle_info();
 690                 handle_commit_msg();
 691                 handle_patch();
 692                 break;
 693         }
 694         fclose(patchfile);
 695         if (!patch_lines) {
 696                 fprintf(stderr, "No patch found\n");
 697                 exit(1);
 698         }
 699 }
 700
 701 static const char mailinfo_usage[] =
 702         "git-mailinfo [-k] [-u] msg patch <mail >info";
 703
 704 static void usage(void) {
 705         fprintf(stderr, "%s\n", mailinfo_usage);
 706         exit(1);
 707 }
 708
 709 int main(int argc, char **argv)
 710 {
 711         while (1 < argc && argv[1][0] == '-') {
 712                 if (!strcmp(argv[1], "-k"))
 713                         keep_subject = 1;
 714                 else if (!strcmp(argv[1], "-u"))
 715                         metainfo_utf8 = 1;
 716                 else
 717                         usage();
 718                 argc--; argv++;
 719         }
 720
 721         if (argc != 3)
 722                 usage();
 723         cmitmsg = fopen(argv[1], "w");
 724         if (!cmitmsg) {
 725                 perror(argv[1]);
 726                 exit(1);
 727         }
 728         patchfile = fopen(argv[2], "w");
 729         if (!patchfile) {
 730                 perror(argv[2]);
 731                 exit(1);
 732         }
 733         while (1) {
 734                 int len = read_one_header_line(line, sizeof(line), stdin);
 735                 if (!len) {
 736                         if (multipart_boundary[0])
 737                                 handle_multipart_body();
 738                         else
 739                                 handle_body();
 740                         break;
 741                 }
 742                 check_header_line(line, len);
 743         }
 744         return 0;
 745 }