pith/url.c

   1 /*
   2  * ========================================================================
   3  * Copyright 2006-2007 University of Washington
   4  * Copyright 2013-2022 Eduardo Chappa
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * ========================================================================
  13  */
  14
  15 #include "../pith/headers.h"
  16 #include "../pith/url.h"
  17 #include "../pith/mailview.h"
  18 #include "../pith/string.h"
  19
/*
 * Internal prototypes
 *
 * These helpers are defined later in this file.
 */
char *rfc1738_scheme_part(char *);      /* end of the text valid after "scheme:" */
int   rfc1738uchar(char *);             /* non-zero if text fits "uchar" */
int   rfc1738xchar(char *);             /* non-zero if text fits "xchar" */
  26
  27
  28 /*
  29  *  * * * * * * * *      RFC 1738 support routines      * * * * * * * *
  30  */
  31
  32
/*
 * Various helpful definitions
 *
 * Each string below is a set of characters; the routines in this file
 * test membership with strchr().  Note that strchr() also matches the
 * terminating NUL of the set, so callers must not pass '\0' expecting
 * a "not found" answer.
 */
#define RFC1738_SAFE    "$-_.+"                 /* "safe" */
#define RFC1738_EXTRA   "!*'(),"                /* "extra" */
#define RFC1738_RSVP    ";/?:@&="               /* "reserved" */
#define RFC1738_NEWS    "-.+_"                  /* valid for "news:" URL */
#define RFC1738_FUDGE   "#{}|\\^~[]"            /* Unsafe, but popular */
/*
 * TRUE when S points at a '%' that is followed by a pair accepted by
 * isxpair() (presumably two hex digits -- confirm against its
 * definition).  S is evaluated more than once, so it must not have
 * side effects.
 */
#define RFC1738_ESC(S)  (*(S) == '%' && isxpair((S) + 1))
  42
  43
  44 /*
  45  * rfc1738_scan -- Scan the given line for possible URLs as defined
  46  *                 in RFC1738
  47  */
  48 char *
  49 rfc1738_scan(char *line, int *len)
  50 {
  51     char *colon, *start, *end;
  52     int   n;
  53
  54     /* process each : in the line */
  55     for(; (colon = strindex(line, ':')) != NULL; line = end){
  56         end = colon + 1;
  57         if(colon == line)               /* zero length scheme? */
  58           continue;
  59
  60         /*
  61          * Valid URL (ala RFC1738 BNF)?  First, first look to the
  62          * left to make sure there are valid "scheme" chars...
  63          */
  64         start = colon - 1;
  65         while(1)
  66           if(!(isdigit((unsigned char) *start)
  67                || isalpha((unsigned char) *start)
  68                || strchr("+-.", *start))){
  69               start++;                  /* advance over bogus char */
  70               break;
  71           }
  72           else if(start > line)
  73             start--;
  74           else
  75             break;
  76
  77         /*
  78          * Make sure everything up to the colon is a known scheme...
  79          */
  80         if(start && (n = colon - start) && !isdigit((unsigned char) *start)
  81            && (((n == 2
  82                  && (*start == 'w' || *start == 'W')
  83                  && (*(start+1) == 's' || *(start+1) == 'S'))
  84                 || (n == 3
  85                  && (((*start == 'F' || *start == 'f')
  86                         && !struncmp(start+1, "tp", 2))
  87                      ||
  88                     ((*start == 'w' || *start == 'W')
  89                         && !struncmp(start+1, "ss", 2))))
  90                 || (n == 4
  91                     && (((*start == 'H' || *start == 'h')
  92                          && !struncmp(start + 1, "ttp", 3))
  93                         || ((*start == 'N' || *start == 'n')
  94                             && !struncmp(start + 1, "ews", 3))
  95                         || ((*start == 'N' || *start == 'n')
  96                             && !struncmp(start + 1, "ntp", 3))
  97                         || ((*start == 'W' || *start == 'w')
  98                             && !struncmp(start + 1, "ais", 3))
  99 #ifdef  ENABLE_LDAP
 100                         || ((*start == 'L' || *start == 'l')
 101                             && !struncmp(start + 1, "dap", 3))
 102 #endif
 103                         || ((*start == 'I' || *start == 'i')
 104                             && !struncmp(start + 1, "map", 3))
 105                         || ((*start == 'F' || *start == 'f')
 106                             && !struncmp(start + 1, "ile", 3))))
 107                 || (n == 5
 108                     && (*start == 'H' || *start == 'h')
 109                     && !struncmp(start+1, "ttps", 4))
 110                 || (n == 6
 111                     && (((*start == 'G' || *start == 'g')
 112                          && !struncmp(start+1, "opher", 5))
 113                         || ((*start == 'M' || *start == 'm')
 114                             && !struncmp(start + 1, "ailto", 5))
 115                         || ((*start == 'T' || *start == 't')
 116                             && !struncmp(start + 1, "elnet", 5))))
 117                 || (n == 8
 118                     && (*start == 'P' || *start == 'p')
 119                     && !struncmp(start + 1, "rospero", 7))
 120                 || (n == 11
 121                     && (*start == 'x' || *start == 'X')
 122                     && !struncmp(start + 1, "-pine-help", 10))
 123                 || (n == 13
 124                     && (*start == 'x' || *start == 'X')
 125                     && !struncmp(start + 1, "-alpine-help", 12)))
 126                || url_external_specific_handler(start, n))){
 127                 /*
 128                  * Second, make sure that everything to the right of the
 129                  * colon is valid for a "schemepart"...
 130                  */
 131
 132             if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){
 133                 int i, j;
 134
 135                 /* make sure something useful follows colon */
 136                 for(i = 0, j = end - colon; i < j; i++)
 137                   if(!strchr(RFC1738_RSVP, colon[i]))
 138                     break;
 139
 140                 if(i != j){
 141                     *len = end - start;
 142
 143                     /*
 144                      * Special case handling for comma.
 145                      * See the problem is comma's valid, but if it's the
 146                      * last character in the url, it's likely intended
 147                      * as a delimiter in the text rather part of the URL.
 148                      * In most cases any way, that's why we have the
 149                      * exception.
 150                      */
 151                     if(*(end - 1) == ','
 152                        || (*(end - 1) == '.' && (!*end  || *end == ' ')))
 153                       (*len)--;
 154
 155                     if(*len - (colon - start) > 0)
 156                       return(start);
 157                 }
 158             }
 159         }
 160     }
 161
 162     return(NULL);
 163 }
 164
 165
 166 /*
 167  * rfc1738_scheme_part - make sure what's to the right of the
 168  *                       colon is valid
 169  *
 170  * NOTE: we have a problem matching closing parens when users
 171  *       bracket the url in parens.  So, lets try terminating our
 172  *       match on any closing paren that doesn't have a corresponding
 173  *       open-paren.
 174  */
 175 char *
 176 rfc1738_scheme_part(char *s)
 177 {
 178     int n, paren = 0, bracket = 0;
 179
 180     while(1)
 181       switch(*s){
 182         default :
 183           if((n = rfc1738xchar(s)) != 0){
 184               s += n;
 185               break;
 186           }
 187
 188         case '\0' :
 189           return(s);
 190
 191         case '[' :
 192           bracket++;
 193           s++;
 194           break;
 195
 196         case ']' :
 197           if(bracket--){
 198               s++;
 199               break;
 200           }
 201
 202           return(s);
 203
 204         case '(' :
 205           paren++;
 206           s++;
 207           break;
 208
 209         case ')' :
 210           if(paren--){
 211               s++;
 212               break;
 213           }
 214
 215           return(s);
 216       }
 217 }
 218
 219
 220
 221 /*
 222  * rfc1738_str - convert rfc1738 escaped octets in place
 223  */
 224 char *
 225 rfc1738_str(char *s)
 226 {
 227     register char *p = s, *q = s;
 228
 229     while(1)
 230       switch(*q = *p++){
 231         case '%' :
 232           if(isxpair(p)){
 233               *q = X2C(p);
 234               p += 2;
 235           }
 236
 237         default :
 238           q++;
 239           break;
 240
 241         case '\0':
 242           return(s);
 243       }
 244 }
 245
 246
 247 /*
 248  * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
 249  */
 250 int
 251 rfc1738uchar(char *s)
 252 {
 253     int valid = (RFC1738_ESC(s))                /* "escape" */
 254              ? 2
 255              : (isalnum((unsigned char) *s)     /* alphanumeric */
 256                 || strchr(RFC1738_SAFE, *s)     /* other special stuff */
 257                 || strchr(RFC1738_EXTRA, *s));
 258
 259     if(!valid){
 260         char *t;
 261         UCS ucs;
 262         CBUF_S cbuf;
 263
 264         cbuf.cbuf[0] = '\0';
 265         cbuf.cbufp = cbuf.cbuf;
 266         cbuf.cbufend = cbuf.cbuf;
 267
 268         for(t = s; t && *t; t++){
 269            if(utf8_to_ucs4_oneatatime((unsigned char) *t & 0xff, &cbuf, &ucs, NULL)){
 270              if ((ucs >= 0x00A0 && ucs <= 0xD7FF)
 271                 || (ucs >= 0xE000 && ucs <= 0xFDCF)
 272                 || (ucs >= 0xFDF0 && ucs <= 0xFFEF)
 273                 || (ucs >= 0x10000 && ucs <= 0x1FFFD)
 274                 || (ucs >= 0x20000 && ucs <= 0x2FFFD)
 275                 || (ucs >= 0x30000 && ucs <= 0x3FFFD)
 276                 || (ucs >= 0x40000 && ucs <= 0x4FFFD)
 277                 || (ucs >= 0x50000 && ucs <= 0x5FFFD)
 278                 || (ucs >= 0x60000 && ucs <= 0x6FFFD)
 279                 || (ucs >= 0x70000 && ucs <= 0x7FFFD)
 280                 || (ucs >= 0x80000 && ucs <= 0x8FFFD)
 281                 || (ucs >= 0x90000 && ucs <= 0x9FFFD)
 282                 || (ucs >= 0xA0000 && ucs <= 0xAFFFD)
 283                 || (ucs >= 0xB0000 && ucs <= 0xBFFFD)
 284                 || (ucs >= 0xC0000 && ucs <= 0xCFFFD)
 285                 || (ucs >= 0xD0000 && ucs <= 0xDFFFD)
 286                 || (ucs >= 0xE0000 && ucs <= 0xEFFFD)
 287                 || (ucs >= 0xF0000 && ucs <= 0xFFFFD)
 288                 || (ucs >= 0x100000 && ucs <= 0x10FFFD))
 289                    valid = t-s+1;
 290                 break;
 291            }
 292         }
 293     }
 294     return valid;
 295 }
 296
 297
 298 /*
 299  * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
 300  */
 301 int
 302 rfc1738xchar(char *s)
 303 {
 304     int n;
 305
 306     return((n = rfc1738uchar(s))
 307             ? n
 308             : (strchr(RFC1738_RSVP, *s) != NULL
 309                || strchr(RFC1738_FUDGE, *s)));
 310 }
 311
 312
 313 /*
 314  * rfc1738_num - return long value of a string of digits, possibly escaped
 315  */
 316 unsigned long
 317 rfc1738_num(char **s)
 318 {
 319     register char *p = *s;
 320     unsigned long n = 0L;
 321
 322     for(; *p; p++)
 323       if(*p == '%' && isxpair(p+1)){
 324           int c = X2C(p+1);
 325           if(isdigit((unsigned char) c)){
 326               n = (c - '0') + (n * 10);
 327               p += 2;
 328           }
 329           else
 330             break;
 331       }
 332       else if(isdigit((unsigned char) *p))
 333         n = (*p - '0') + (n * 10);
 334       else
 335         break;
 336
 337     *s = p;
 338     return(n);
 339 }
 340
 341
 342 int
 343 rfc1738_group(char *s)
 344 {
 345     return(isalnum((unsigned char) *s)
 346            || RFC1738_ESC(s)
 347            || strchr(RFC1738_NEWS, *s));
 348 }
 349
 350
 351 /*
 352  * Encode (hexify) a mailto url.
 353  *
 354  * Args  s -- src url
 355  *
 356  * Returns  An allocated string which is suitably encoded.
 357  *          Result should be freed by caller.
 358  *
 359  * Since we don't know here which characters are reserved characters (? and &)
 360  * for use in delimiting the pieces of the url and which are just those
 361  * characters contained in the data that should be encoded, we always encode
 362  * them. That's because we know we don't use those as reserved characters.
 363  * If you do use those as reserved characters you have to encode each part
 364  * separately.
 365  */
 366 char *
 367 rfc1738_encode_mailto(char *s)
 368 {
 369     char *d, *ret = NULL;
 370
 371     if(s){
 372         /* Worst case, encode every character */
 373         ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char));
 374         while(*s){
 375             if(isalnum((unsigned char)*s)
 376                || strchr(RFC1738_SAFE, *s)
 377                || strchr(RFC1738_EXTRA, *s))
 378               *d++ = *s++;
 379             else{
 380                 *d++ = '%';
 381                 C2XPAIR(*s, d);
 382                 s++;
 383             }
 384         }
 385
 386         *d = '\0';
 387     }
 388
 389     return(ret);
 390 }
 391
 392
 393 /*
 394  *  * * * * * * * *      RFC 1808 support routines      * * * * * * * *
 395  */
 396
 397
 398 int
 399 rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path,
 400                char **parms, char **query, char **frag)
 401 {
 402     char *p, *q, *start, *tmp = cpystr(url);
 403
 404     start = tmp;
 405     if((p = strchr(start, '#')) != NULL){       /* fragment spec? */
 406         *p++ = '\0';
 407         if(*p)
 408           *frag = cpystr(p);
 409     }
 410
 411     if((p = strchr(start, ':')) && p != start){ /* scheme part? */
 412         for(q = start; q < p; q++)
 413           if(!(isdigit((unsigned char) *q)
 414                || isalpha((unsigned char) *q)
 415                || strchr("+-.", *q)))
 416             break;
 417
 418         if(p == q){
 419             *p++ = '\0';
 420             *scheme = cpystr(start);
 421             start = p;
 422         }
 423     }
 424
 425     if(*start == '/' && *(start+1) == '/'){ /* net_loc */
 426         if((p = strchr(start+2, '/')) != NULL)
 427           *p++ = '\0';
 428
 429         *net_loc = cpystr(start+2);
 430         if(p)
 431           start = p;
 432         else *start = '\0';             /* End of parse */
 433     }
 434
 435     if((p = strchr(start, '?')) != NULL){
 436         *p++ = '\0';
 437         *query = cpystr(p);
 438     }
 439
 440     if((p = strchr(start, ';')) != NULL){
 441         *p++ = '\0';
 442         *parms = cpystr(p);
 443     }
 444
 445     if(*start)
 446       *path = cpystr(start);
 447
 448     fs_give((void **) &tmp);
 449
 450     return(1);
 451 }
 452
 453
 454
 455 /*
 456  * web_host_scan -- Scan the given line for possible web host names
 457  *
 458  * NOTE: scan below is limited to DNS names ala RFC1034
 459  */
 460 char *
 461 web_host_scan(char *line, int *len)
 462 {
 463     char *end, last = '\0';
 464
 465     for(; *line; last = *line++)
 466       if((*line == 'w' || *line == 'W')
 467          && (!last || !(isalnum((unsigned char) last)
 468                         || last == '.' || last == '-' || last == '/'))
 469          && (((*(line + 1) == 'w' || *(line + 1) == 'W')        /* "www." */
 470               && (*(line + 2) == 'w' || *(line + 2) == 'W'))
 471              || ((*(line + 1) == 'e' || *(line + 1) == 'E')     /* "web." */
 472                  && (*(line + 2) == 'b' || *(line + 2) == 'B')))
 473          && (*(line + 3) == '.')){
 474           end = rfc1738_scheme_part(line + 3);
 475           if((*len = end - line) > ((*(line+3) == '.') ? 4 : 3)){
 476               /* Dread comma exception, see note in rfc1738_scan */
 477               if(strchr(",:", *(line + (*len) - 1))
 478                  || (*(line + (*len) - 1) == '.'
 479                      && (!*(line + (*len)) || *(line + (*len)) == ' ')))
 480                 (*len)--;
 481
 482               return(line);
 483           }
 484           else
 485             line += 3;
 486       }
 487
 488     return(NULL);
 489 }
 490
 491
 492 /*
 493  * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
 494  *
 495  * NOTE: Well, OK, not strictly addr-specs since there's a lot of junk
 496  *       we're tying to sift thru and we'd like to minimize false-pos
 497  *       matches.
 498  */
 499 char *
 500 mail_addr_scan(char *line, int *len)
 501 {
 502     char *amp, *start, *end;
 503 /*
 504  * This list is not the whole standards-based list, this is just a list
 505  * of likely email address characters. We don't want to include everything
 506  * because punctuation in the text might get mixed in with the address.
 507  */
 508 #define NONALPHANUMOK ".-_+%/="
 509
 510     /* process each : in the line */
 511     for(; (amp = strindex(line, '@')) != NULL; line = end){
 512         end = amp + 1;
 513         /* zero length addr? */
 514         if(amp == line || !(isalnum((unsigned char) *(start = amp - 1))
 515                             || strchr(NONALPHANUMOK, *start)))
 516           continue;
 517
 518         /*
 519          * Valid address (ala RFC822 BNF)?  First, first look to the
 520          * left to make sure there are valid "scheme" chars...
 521          */
 522         while(1)
 523           /* NOTE: we're not doing quoted-strings */
 524           if(!(isalnum((unsigned char) *start) || strchr(NONALPHANUMOK, *start))){
 525               /* advance over bogus char, and erase leading punctuation */
 526               for(start++; *start && strchr(NONALPHANUMOK, *start); start++)
 527                 ;
 528
 529               break;
 530           }
 531           else if(start > line)
 532             start--;
 533           else
 534             break;
 535
 536         /*
 537          * Make sure everything up to the colon is a known scheme...
 538          */
 539         if(start && (amp - start) > 0){
 540             /*
 541              * Second, make sure that everything to the right of
 542              * amp is valid for a "domain"...
 543              */
 544             if(*(end = amp + 1) == '['){ /* domain literal */
 545                 int dots = 3;
 546
 547                 for(++end; *end ; end++)
 548                   if(*end == ']'){
 549                       if(!dots){
 550                           *len = end - start + 1;
 551                           return(start);
 552                       }
 553                       else
 554                         break;          /* bogus */
 555                   }
 556                   else if(*end == '.'){
 557                       if(--dots < 0)
 558                         break;          /* bogus */
 559                   }
 560                   else if(!isdigit((unsigned char) *end))
 561                     break;              /* bogus */
 562             }
 563             else if(isalnum((unsigned char) *end)){ /* domain name? */
 564                 for(++end; ; end++)
 565                   if(!(*end && (isalnum((unsigned char) *end)
 566                                 || *end == '-'
 567                                 || *end == '.'
 568                                 || *end == '_'))){
 569                       /* can't end with dash, dot or underscore */
 570                       while(!isalnum((unsigned char) *(end - 1)))
 571                         end--;
 572
 573                       *len = end - start;
 574                       return(start);
 575                   }
 576             }
 577         }
 578     }
 579
 580     return(NULL);
 581 }