pith/url.c

   1 #if !defined(lint) && !defined(DOS)
   2 static char rcsid[] = "$Id: url.c 769 2007-10-24 00:15:40Z hubert@u.washington.edu $";
   3 #endif
   4
   5 /*
   6  * ========================================================================
   7  * Copyright 2006-2007 University of Washington
   8  * Copyright 2013-2014 Eduardo Chappa
   9  *
  10  * Licensed under the Apache License, Version 2.0 (the "License");
  11  * you may not use this file except in compliance with the License.
  12  * You may obtain a copy of the License at
  13  *
  14  *     http://www.apache.org/licenses/LICENSE-2.0
  15  *
  16  * ========================================================================
  17  */
  18
  19 #include "../pith/headers.h"
  20 #include "../pith/url.h"
  21 #include "../pith/mailview.h"
  22 #include "../pith/string.h"
  23
  24 /*
  25  * Internal prototypes
  26  */
  27 char *rfc1738_scheme_part(char *);
  28 int   rfc1738uchar(char *);
  29 int   rfc1738xchar(char *);
  30
  31
  32 /*
  33  *  * * * * * * * *      RFC 1738 support routines      * * * * * * * *
  34  */
  35
  36
  37 /*
  38  * Various helpful definitions
  39  */
  40 #define RFC1738_SAFE    "$-_.+"                 /* "safe" */
  41 #define RFC1738_EXTRA   "!*'(),"                /* "extra" */
  42 #define RFC1738_RSVP    ";/?:@&="               /* "reserved" */
  43 #define RFC1738_NEWS    "-.+_"                  /* valid for "news:" URL */
  44 #define RFC1738_FUDGE   "#{}|\\^~[]"            /* Unsafe, but popular */
  45 #define RFC1738_ESC(S)  (*(S) == '%' && isxpair((S) + 1))
  46
  47
  48 /*
  49  * rfc1738_scan -- Scan the given line for possible URLs as defined
  50  *                 in RFC1738
  51  */
  52 char *
  53 rfc1738_scan(char *line, int *len)
  54 {
  55     char *colon, *start, *end;
  56     int   n;
  57
  58     /* process each : in the line */
  59     for(; (colon = strindex(line, ':')) != NULL; line = end){
  60         end = colon + 1;
  61         if(colon == line)               /* zero length scheme? */
  62           continue;
  63
  64         /*
  65          * Valid URL (ala RFC1738 BNF)?  First, first look to the
  66          * left to make sure there are valid "scheme" chars...
  67          */
  68         start = colon - 1;
  69         while(1)
  70           if(!(isdigit((unsigned char) *start)
  71                || isalpha((unsigned char) *start)
  72                || strchr("+-.", *start))){
  73               start++;                  /* advance over bogus char */
  74               break;
  75           }
  76           else if(start > line)
  77             start--;
  78           else
  79             break;
  80
  81         /*
  82          * Make sure everyhing up to the colon is a known scheme...
  83          */
  84         if(start && (n = colon - start) && !isdigit((unsigned char) *start)
  85            && (((n == 2
  86                  && (*start == 'w' || *start == 'W')
  87                  && (*(start+1) == 's' || *(start+1) == 'S'))
  88                 || (n == 3
  89                  && (((*start == 'F' || *start == 'f')
  90                         && !struncmp(start+1, "tp", 2))
  91                      ||
  92                     ((*start == 'w' || *start == 'W')
  93                         && !struncmp(start+1, "ss", 2))))
  94                 || (n == 4
  95                     && (((*start == 'H' || *start == 'h')
  96                          && !struncmp(start + 1, "ttp", 3))
  97                         || ((*start == 'N' || *start == 'n')
  98                             && !struncmp(start + 1, "ews", 3))
  99                         || ((*start == 'N' || *start == 'n')
 100                             && !struncmp(start + 1, "ntp", 3))
 101                         || ((*start == 'W' || *start == 'w')
 102                             && !struncmp(start + 1, "ais", 3))
 103 #ifdef  ENABLE_LDAP
 104                         || ((*start == 'L' || *start == 'l')
 105                             && !struncmp(start + 1, "dap", 3))
 106 #endif
 107                         || ((*start == 'I' || *start == 'i')
 108                             && !struncmp(start + 1, "map", 3))
 109                         || ((*start == 'F' || *start == 'f')
 110                             && !struncmp(start + 1, "ile", 3))))
 111                 || (n == 5
 112                     && (*start == 'H' || *start == 'h')
 113                     && !struncmp(start+1, "ttps", 4))
 114                 || (n == 6
 115                     && (((*start == 'G' || *start == 'g')
 116                          && !struncmp(start+1, "opher", 5))
 117                         || ((*start == 'M' || *start == 'm')
 118                             && !struncmp(start + 1, "ailto", 5))
 119                         || ((*start == 'T' || *start == 't')
 120                             && !struncmp(start + 1, "elnet", 5))))
 121                 || (n == 8
 122                     && (*start == 'P' || *start == 'p')
 123                     && !struncmp(start + 1, "rospero", 7))
 124                 || (n == 11
 125                     && (*start == 'x' || *start == 'X')
 126                     && !struncmp(start + 1, "-pine-help", 10))
 127                 || (n == 13
 128                     && (*start == 'x' || *start == 'X')
 129                     && !struncmp(start + 1, "-alpine-help", 12)))
 130                || url_external_specific_handler(start, n))){
 131                 /*
 132                  * Second, make sure that everything to the right of the
 133                  * colon is valid for a "schemepart"...
 134                  */
 135
 136             if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){
 137                 int i, j;
 138
 139                 /* make sure something useful follows colon */
 140                 for(i = 0, j = end - colon; i < j; i++)
 141                   if(!strchr(RFC1738_RSVP, colon[i]))
 142                     break;
 143
 144                 if(i != j){
 145                     *len = end - start;
 146
 147                     /*
 148                      * Special case handling for comma.
 149                      * See the problem is comma's valid, but if it's the
 150                      * last character in the url, it's likely intended
 151                      * as a delimiter in the text rather part of the URL.
 152                      * In most cases any way, that's why we have the
 153                      * exception.
 154                      */
 155                     if(*(end - 1) == ','
 156                        || (*(end - 1) == '.' && (!*end  || *end == ' ')))
 157                       (*len)--;
 158
 159                     if(*len - (colon - start) > 0)
 160                       return(start);
 161                 }
 162             }
 163         }
 164     }
 165
 166     return(NULL);
 167 }
 168
 169
 170 /*
 171  * rfc1738_scheme_part - make sure what's to the right of the
 172  *                       colon is valid
 173  *
 174  * NOTE: we have a problem matching closing parens when users
 175  *       bracket the url in parens.  So, lets try terminating our
 176  *       match on any closing paren that doesn't have a coresponding
 177  *       open-paren.
 178  */
 179 char *
 180 rfc1738_scheme_part(char *s)
 181 {
 182     int n, paren = 0, bracket = 0;
 183
 184     while(1)
 185       switch(*s){
 186         default :
 187           if((n = rfc1738xchar(s)) != 0){
 188               s += n;
 189               break;
 190           }
 191
 192         case '\0' :
 193           return(s);
 194
 195         case '[' :
 196           bracket++;
 197           s++;
 198           break;
 199
 200         case ']' :
 201           if(bracket--){
 202               s++;
 203               break;
 204           }
 205
 206           return(s);
 207
 208         case '(' :
 209           paren++;
 210           s++;
 211           break;
 212
 213         case ')' :
 214           if(paren--){
 215               s++;
 216               break;
 217           }
 218
 219           return(s);
 220       }
 221 }
 222
 223
 224
 225 /*
 226  * rfc1738_str - convert rfc1738 escaped octets in place
 227  */
 228 char *
 229 rfc1738_str(char *s)
 230 {
 231     register char *p = s, *q = s;
 232
 233     while(1)
 234       switch(*q = *p++){
 235         case '%' :
 236           if(isxpair(p)){
 237               *q = X2C(p);
 238               p += 2;
 239           }
 240
 241         default :
 242           q++;
 243           break;
 244
 245         case '\0':
 246           return(s);
 247       }
 248 }
 249
 250
 251 /*
 252  * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
 253  */
 254 int
 255 rfc1738uchar(char *s)
 256 {
 257     int valid = (RFC1738_ESC(s))                /* "escape" */
 258              ? 2
 259              : (isalnum((unsigned char) *s)     /* alphanumeric */
 260                 || strchr(RFC1738_SAFE, *s)     /* other special stuff */
 261                 || strchr(RFC1738_EXTRA, *s));
 262
 263     if(!valid){
 264         char *t;
 265         UCS ucs;
 266         CBUF_S cbuf;
 267
 268         cbuf.cbuf[0] = '\0';
 269         cbuf.cbufp = cbuf.cbuf;
 270         cbuf.cbufend = cbuf.cbuf;
 271
 272         for(t = s; t && *t; t++){
 273            if(utf8_to_ucs4_oneatatime((unsigned char) *t & 0xff, &cbuf, &ucs, NULL)){
 274              if ((ucs >= 0x00A0 && ucs <= 0xD7FF)
 275                 || (ucs >= 0xE000 && ucs <= 0xFDCF)
 276                 || (ucs >= 0xFDF0 && ucs <= 0xFFEF)
 277                 || (ucs >= 0x10000 && ucs <= 0x1FFFD)
 278                 || (ucs >= 0x20000 && ucs <= 0x2FFFD)
 279                 || (ucs >= 0x30000 && ucs <= 0x3FFFD)
 280                 || (ucs >= 0x40000 && ucs <= 0x4FFFD)
 281                 || (ucs >= 0x50000 && ucs <= 0x5FFFD)
 282                 || (ucs >= 0x60000 && ucs <= 0x6FFFD)
 283                 || (ucs >= 0x70000 && ucs <= 0x7FFFD)
 284                 || (ucs >= 0x80000 && ucs <= 0x8FFFD)
 285                 || (ucs >= 0x90000 && ucs <= 0x9FFFD)
 286                 || (ucs >= 0xA0000 && ucs <= 0xAFFFD)
 287                 || (ucs >= 0xB0000 && ucs <= 0xBFFFD)
 288                 || (ucs >= 0xC0000 && ucs <= 0xCFFFD)
 289                 || (ucs >= 0xD0000 && ucs <= 0xDFFFD)
 290                 || (ucs >= 0xE0000 && ucs <= 0xEFFFD)
 291                 || (ucs >= 0xF0000 && ucs <= 0xFFFFD)
 292                 || (ucs >= 0x100000 && ucs <= 0x10FFFD))
 293                    valid = t-s+1;
 294                 break;
 295            }
 296         }
 297     }
 298     return valid;
 299 }
 300
 301
 302 /*
 303  * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
 304  */
 305 int
 306 rfc1738xchar(char *s)
 307 {
 308     int n;
 309
 310     return((n = rfc1738uchar(s))
 311             ? n
 312             : (strchr(RFC1738_RSVP, *s) != NULL
 313                || strchr(RFC1738_FUDGE, *s)));
 314 }
 315
 316
 317 /*
 318  * rfc1738_num - return long value of a string of digits, possibly escaped
 319  */
 320 unsigned long
 321 rfc1738_num(char **s)
 322 {
 323     register char *p = *s;
 324     unsigned long n = 0L;
 325
 326     for(; *p; p++)
 327       if(*p == '%' && isxpair(p+1)){
 328           int c = X2C(p+1);
 329           if(isdigit((unsigned char) c)){
 330               n = (c - '0') + (n * 10);
 331               p += 2;
 332           }
 333           else
 334             break;
 335       }
 336       else if(isdigit((unsigned char) *p))
 337         n = (*p - '0') + (n * 10);
 338       else
 339         break;
 340
 341     *s = p;
 342     return(n);
 343 }
 344
 345
 346 int
 347 rfc1738_group(char *s)
 348 {
 349     return(isalnum((unsigned char) *s)
 350            || RFC1738_ESC(s)
 351            || strchr(RFC1738_NEWS, *s));
 352 }
 353
 354
 355 /*
 356  * Encode (hexify) a mailto url.
 357  *
 358  * Args  s -- src url
 359  *
 360  * Returns  An allocated string which is suitably encoded.
 361  *          Result should be freed by caller.
 362  *
 363  * Since we don't know here which characters are reserved characters (? and &)
 364  * for use in delimiting the pieces of the url and which are just those
 365  * characters contained in the data that should be encoded, we always encode
 366  * them. That's because we know we don't use those as reserved characters.
 367  * If you do use those as reserved characters you have to encode each part
 368  * separately.
 369  */
 370 char *
 371 rfc1738_encode_mailto(char *s)
 372 {
 373     char *d, *ret = NULL;
 374
 375     if(s){
 376         /* Worst case, encode every character */
 377         ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char));
 378         while(*s){
 379             if(isalnum((unsigned char)*s)
 380                || strchr(RFC1738_SAFE, *s)
 381                || strchr(RFC1738_EXTRA, *s))
 382               *d++ = *s++;
 383             else{
 384                 *d++ = '%';
 385                 C2XPAIR(*s, d);
 386                 s++;
 387             }
 388         }
 389
 390         *d = '\0';
 391     }
 392
 393     return(ret);
 394 }
 395
 396
 397 /*
 398  *  * * * * * * * *      RFC 1808 support routines      * * * * * * * *
 399  */
 400
 401
 402 int
 403 rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path,
 404                char **parms, char **query, char **frag)
 405 {
 406     char *p, *q, *start, *tmp = cpystr(url);
 407
 408     start = tmp;
 409     if((p = strchr(start, '#')) != NULL){       /* fragment spec? */
 410         *p++ = '\0';
 411         if(*p)
 412           *frag = cpystr(p);
 413     }
 414
 415     if((p = strchr(start, ':')) && p != start){ /* scheme part? */
 416         for(q = start; q < p; q++)
 417           if(!(isdigit((unsigned char) *q)
 418                || isalpha((unsigned char) *q)
 419                || strchr("+-.", *q)))
 420             break;
 421
 422         if(p == q){
 423             *p++ = '\0';
 424             *scheme = cpystr(start);
 425             start = p;
 426         }
 427     }
 428
 429     if(*start == '/' && *(start+1) == '/'){ /* net_loc */
 430         if((p = strchr(start+2, '/')) != NULL)
 431           *p++ = '\0';
 432
 433         *net_loc = cpystr(start+2);
 434         if(p)
 435           start = p;
 436         else *start = '\0';             /* End of parse */
 437     }
 438
 439     if((p = strchr(start, '?')) != NULL){
 440         *p++ = '\0';
 441         *query = cpystr(p);
 442     }
 443
 444     if((p = strchr(start, ';')) != NULL){
 445         *p++ = '\0';
 446         *parms = cpystr(p);
 447     }
 448
 449     if(*start)
 450       *path = cpystr(start);
 451
 452     fs_give((void **) &tmp);
 453
 454     return(1);
 455 }
 456
 457
 458
 459 /*
 460  * web_host_scan -- Scan the given line for possible web host names
 461  *
 462  * NOTE: scan below is limited to DNS names ala RFC1034
 463  */
 464 char *
 465 web_host_scan(char *line, int *len)
 466 {
 467     char *end, last = '\0';
 468
 469     for(; *line; last = *line++)
 470       if((*line == 'w' || *line == 'W')
 471          && (!last || !(isalnum((unsigned char) last)
 472                         || last == '.' || last == '-' || last == '/'))
 473          && (((*(line + 1) == 'w' || *(line + 1) == 'W')        /* "www." */
 474               && (*(line + 2) == 'w' || *(line + 2) == 'W'))
 475              || ((*(line + 1) == 'e' || *(line + 1) == 'E')     /* "web." */
 476                  && (*(line + 2) == 'b' || *(line + 2) == 'B')))
 477          && (*(line + 3) == '.')){
 478           end = rfc1738_scheme_part(line + 3);
 479           if((*len = end - line) > ((*(line+3) == '.') ? 4 : 3)){
 480               /* Dread comma exception, see note in rfc1738_scan */
 481               if(strchr(",:", *(line + (*len) - 1))
 482                  || (*(line + (*len) - 1) == '.'
 483                      && (!*(line + (*len)) || *(line + (*len)) == ' ')))
 484                 (*len)--;
 485
 486               return(line);
 487           }
 488           else
 489             line += 3;
 490       }
 491
 492     return(NULL);
 493 }
 494
 495
 496 /*
 497  * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
 498  *
 499  * NOTE: Well, OK, not strictly addr-specs since there's alot of junk
 500  *       we're tying to sift thru and we'd like to minimize false-pos
 501  *       matches.
 502  */
 503 char *
 504 mail_addr_scan(char *line, int *len)
 505 {
 506     char *amp, *start, *end;
 507 /*
 508  * This list is not the whole standards-based list, this is just a list
 509  * of likely email address characters. We don't want to include everything
 510  * because punctuation in the text might get mixed in with the address.
 511  */
 512 #define NONALPHANUMOK ".-_+%/="
 513
 514     /* process each : in the line */
 515     for(; (amp = strindex(line, '@')) != NULL; line = end){
 516         end = amp + 1;
 517         /* zero length addr? */
 518         if(amp == line || !(isalnum((unsigned char) *(start = amp - 1))
 519                             || strchr(NONALPHANUMOK, *start)))
 520           continue;
 521
 522         /*
 523          * Valid address (ala RFC822 BNF)?  First, first look to the
 524          * left to make sure there are valid "scheme" chars...
 525          */
 526         while(1)
 527           /* NOTE: we're not doing quoted-strings */
 528           if(!(isalnum((unsigned char) *start) || strchr(NONALPHANUMOK, *start))){
 529               /* advance over bogus char, and erase leading punctuation */
 530               for(start++; *start && strchr(NONALPHANUMOK, *start); start++)
 531                 ;
 532
 533               break;
 534           }
 535           else if(start > line)
 536             start--;
 537           else
 538             break;
 539
 540         /*
 541          * Make sure everyhing up to the colon is a known scheme...
 542          */
 543         if(start && (amp - start) > 0){
 544             /*
 545              * Second, make sure that everything to the right of
 546              * amp is valid for a "domain"...
 547              */
 548             if(*(end = amp + 1) == '['){ /* domain literal */
 549                 int dots = 3;
 550
 551                 for(++end; *end ; end++)
 552                   if(*end == ']'){
 553                       if(!dots){
 554                           *len = end - start + 1;
 555                           return(start);
 556                       }
 557                       else
 558                         break;          /* bogus */
 559                   }
 560                   else if(*end == '.'){
 561                       if(--dots < 0)
 562                         break;          /* bogus */
 563                   }
 564                   else if(!isdigit((unsigned char) *end))
 565                     break;              /* bogus */
 566             }
 567             else if(isalnum((unsigned char) *end)){ /* domain name? */
 568                 for(++end; ; end++)
 569                   if(!(*end && (isalnum((unsigned char) *end)
 570                                 || *end == '-'
 571                                 || *end == '.'
 572                                 || *end == '_'))){
 573                       /* can't end with dash, dot or underscore */
 574                       while(!isalnum((unsigned char) *(end - 1)))
 575                         end--;
 576
 577                       *len = end - start;
 578                       return(start);
 579                   }
 580             }
 581         }
 582     }
 583
 584     return(NULL);
 585 }