* Update to version 2.19.5
[alpine.git] / pith / url.c
blob6735b29f16788dd5bbaeef76e90654ef213c92e6
1 #if !defined(lint) && !defined(DOS)
2 static char rcsid[] = "$Id: url.c 769 2007-10-24 00:15:40Z hubert@u.washington.edu $";
3 #endif
/*
 * ========================================================================
 * Copyright 2006-2007 University of Washington
 * Copyright 2013-2014 Eduardo Chappa
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * ========================================================================
 */
19 #include "../pith/headers.h"
20 #include "../pith/url.h"
21 #include "../pith/mailview.h"
22 #include "../pith/string.h"
/*
 * Internal prototypes
 */
char *rfc1738_scheme_part(char *s);
int   rfc1738uchar(char *s);
int   rfc1738xchar(char *s);
/*
 * * * * * * * * * RFC 1738 support routines * * * * * * * *
 */
/*
 * Character classes and helpers shared by the RFC 1738 routines.
 * Each string lists the members of one class; they are searched
 * with strchr().
 */
#define	RFC1738_SAFE	"$-_.+"			/* "safe" */
#define	RFC1738_EXTRA	"!*'(),"		/* "extra" */
#define	RFC1738_RSVP	";/?:@&="		/* "reserved" */
#define	RFC1738_NEWS	"-.+_"			/* valid for "news:" URL */
#define	RFC1738_FUDGE	"#{}|\\^~[]"		/* Unsafe, but popular */

/* True when S points at a '%' followed by a hex pair */
#define	RFC1738_ESC(S)	(*(S) == '%' && isxpair((S) + 1))
49 * rfc1738_scan -- Scan the given line for possible URLs as defined
50 * in RFC1738
52 char *
53 rfc1738_scan(char *line, int *len)
55 char *colon, *start, *end;
56 int n;
58 /* process each : in the line */
59 for(; (colon = strindex(line, ':')) != NULL; line = end){
60 end = colon + 1;
61 if(colon == line) /* zero length scheme? */
62 continue;
65 * Valid URL (ala RFC1738 BNF)? First, first look to the
66 * left to make sure there are valid "scheme" chars...
68 start = colon - 1;
69 while(1)
70 if(!(isdigit((unsigned char) *start)
71 || isalpha((unsigned char) *start)
72 || strchr("+-.", *start))){
73 start++; /* advance over bogus char */
74 break;
76 else if(start > line)
77 start--;
78 else
79 break;
82 * Make sure everyhing up to the colon is a known scheme...
84 if(start && (n = colon - start) && !isdigit((unsigned char) *start)
85 && (((n == 2
86 && (*start == 'w' || *start == 'W')
87 && (*(start+1) == 's' || *(start+1) == 'S'))
88 || (n == 3
89 && (((*start == 'F' || *start == 'f')
90 && !struncmp(start+1, "tp", 2))
92 ((*start == 'w' || *start == 'W')
93 && !struncmp(start+1, "ss", 2))))
94 || (n == 4
95 && (((*start == 'H' || *start == 'h')
96 && !struncmp(start + 1, "ttp", 3))
97 || ((*start == 'N' || *start == 'n')
98 && !struncmp(start + 1, "ews", 3))
99 || ((*start == 'N' || *start == 'n')
100 && !struncmp(start + 1, "ntp", 3))
101 || ((*start == 'W' || *start == 'w')
102 && !struncmp(start + 1, "ais", 3))
103 #ifdef ENABLE_LDAP
104 || ((*start == 'L' || *start == 'l')
105 && !struncmp(start + 1, "dap", 3))
106 #endif
107 || ((*start == 'I' || *start == 'i')
108 && !struncmp(start + 1, "map", 3))
109 || ((*start == 'F' || *start == 'f')
110 && !struncmp(start + 1, "ile", 3))))
111 || (n == 5
112 && (*start == 'H' || *start == 'h')
113 && !struncmp(start+1, "ttps", 4))
114 || (n == 6
115 && (((*start == 'G' || *start == 'g')
116 && !struncmp(start+1, "opher", 5))
117 || ((*start == 'M' || *start == 'm')
118 && !struncmp(start + 1, "ailto", 5))
119 || ((*start == 'T' || *start == 't')
120 && !struncmp(start + 1, "elnet", 5))))
121 || (n == 8
122 && (*start == 'P' || *start == 'p')
123 && !struncmp(start + 1, "rospero", 7))
124 || (n == 11
125 && (*start == 'x' || *start == 'X')
126 && !struncmp(start + 1, "-pine-help", 10))
127 || (n == 13
128 && (*start == 'x' || *start == 'X')
129 && !struncmp(start + 1, "-alpine-help", 12)))
130 || url_external_specific_handler(start, n))){
132 * Second, make sure that everything to the right of the
133 * colon is valid for a "schemepart"...
136 if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){
137 int i, j;
139 /* make sure something useful follows colon */
140 for(i = 0, j = end - colon; i < j; i++)
141 if(!strchr(RFC1738_RSVP, colon[i]))
142 break;
144 if(i != j){
145 *len = end - start;
148 * Special case handling for comma.
149 * See the problem is comma's valid, but if it's the
150 * last character in the url, it's likely intended
151 * as a delimiter in the text rather part of the URL.
152 * In most cases any way, that's why we have the
153 * exception.
155 if(*(end - 1) == ','
156 || (*(end - 1) == '.' && (!*end || *end == ' ')))
157 (*len)--;
159 if(*len - (colon - start) > 0)
160 return(start);
166 return(NULL);
/*
 * rfc1738_scheme_part - find where the valid text to the right of the
 *                       colon ends
 *
 * Returns a pointer to the first character that cannot belong to the
 * scheme part (possibly the terminating NUL).
 *
 * NOTE: users often wrap a URL in parens or brackets, so a closing
 *       paren/bracket with no matching opener inside the URL ends the
 *       match rather than being swallowed by it.
 */
char *
rfc1738_scheme_part(char *s)
{
    int open_parens = 0, open_brackets = 0, step;

    for(;;){
        if(*s == '\0'){
            return(s);
        }
        else if(*s == '['){
            open_brackets++;
            s++;
        }
        else if(*s == ']'){
            if(open_brackets == 0)      /* unmatched closer ends the URL */
              return(s);

            open_brackets--;
            s++;
        }
        else if(*s == '('){
            open_parens++;
            s++;
        }
        else if(*s == ')'){
            if(open_parens == 0)        /* unmatched closer ends the URL */
              return(s);

            open_parens--;
            s++;
        }
        else{
            /* rfc1738xchar() reports how far to advance, 0 if invalid */
            step = rfc1738xchar(s);
            if(step == 0)
              return(s);

            s += step;
        }
    }
}
/*
 * rfc1738_str - convert rfc1738 escaped octets in place
 *
 * Every '%' followed by a hex pair is replaced by the octet it names;
 * all other characters, including a '%' not followed by a hex pair,
 * are copied through unchanged.  Returns s.
 */
char *
rfc1738_str(char *s)
{
    char *src = s, *dst = s;

    for(;;){
        char c = *src++;

        *dst = c;
        if(c == '\0')
          return(s);

        if(c == '%' && isxpair(src)){
            *dst = X2C(src);            /* overwrite the '%' just copied */
            src += 2;
        }

        dst++;
    }
}
252 * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
255 rfc1738uchar(char *s)
257 int valid = (RFC1738_ESC(s)) /* "escape" */
259 : (isalnum((unsigned char) *s) /* alphanumeric */
260 || strchr(RFC1738_SAFE, *s) /* other special stuff */
261 || strchr(RFC1738_EXTRA, *s));
263 if(!valid){
264 char *t;
265 UCS ucs;
266 CBUF_S cbuf;
268 cbuf.cbuf[0] = '\0';
269 cbuf.cbufp = cbuf.cbuf;
270 cbuf.cbufend = cbuf.cbuf;
272 for(t = s; t && *t; t++){
273 if(utf8_to_ucs4_oneatatime((unsigned char) *t & 0xff, &cbuf, &ucs, NULL)){
274 if ((ucs >= 0x00A0 && ucs <= 0xD7FF)
275 || (ucs >= 0xE000 && ucs <= 0xFDCF)
276 || (ucs >= 0xFDF0 && ucs <= 0xFFEF)
277 || (ucs >= 0x10000 && ucs <= 0x1FFFD)
278 || (ucs >= 0x20000 && ucs <= 0x2FFFD)
279 || (ucs >= 0x30000 && ucs <= 0x3FFFD)
280 || (ucs >= 0x40000 && ucs <= 0x4FFFD)
281 || (ucs >= 0x50000 && ucs <= 0x5FFFD)
282 || (ucs >= 0x60000 && ucs <= 0x6FFFD)
283 || (ucs >= 0x70000 && ucs <= 0x7FFFD)
284 || (ucs >= 0x80000 && ucs <= 0x8FFFD)
285 || (ucs >= 0x90000 && ucs <= 0x9FFFD)
286 || (ucs >= 0xA0000 && ucs <= 0xAFFFD)
287 || (ucs >= 0xB0000 && ucs <= 0xBFFFD)
288 || (ucs >= 0xC0000 && ucs <= 0xCFFFD)
289 || (ucs >= 0xD0000 && ucs <= 0xDFFFD)
290 || (ucs >= 0xE0000 && ucs <= 0xEFFFD)
291 || (ucs >= 0xF0000 && ucs <= 0xFFFFD)
292 || (ucs >= 0x100000 && ucs <= 0x10FFFD))
293 valid = t-s+1;
294 break;
298 return valid;
303 * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
306 rfc1738xchar(char *s)
308 int n;
310 return((n = rfc1738uchar(s))
312 : (strchr(RFC1738_RSVP, *s) != NULL
313 || strchr(RFC1738_FUDGE, *s)));
/*
 * rfc1738_num - return long value of a string of digits, possibly escaped
 *
 * Args  s -- address of the pointer to the text; on return it is left
 *            pointing at the first character that is not a digit or
 *            an escaped (%XX) digit
 *
 * Returns the accumulated decimal value, 0 if there were no digits.
 */
unsigned long
rfc1738_num(char **s)
{
    char          *p;
    unsigned long  value = 0L;

    for(p = *s; *p; p++){
        int digit;

        if(*p == '%' && isxpair(p+1)){
            digit = X2C(p+1);
            if(!isdigit((unsigned char) digit))
              break;

            p += 2;             /* with the loop's p++, skips all of %XX */
        }
        else if(isdigit((unsigned char) *p))
          digit = *p;
        else
          break;

        value = (digit - '0') + (value * 10);
    }

    *s = p;
    return(value);
}
347 rfc1738_group(char *s)
349 return(isalnum((unsigned char) *s)
350 || RFC1738_ESC(s)
351 || strchr(RFC1738_NEWS, *s));
356 * Encode (hexify) a mailto url.
358 * Args s -- src url
360 * Returns An allocated string which is suitably encoded.
361 * Result should be freed by caller.
363 * Since we don't know here which characters are reserved characters (? and &)
364 * for use in delimiting the pieces of the url and which are just those
365 * characters contained in the data that should be encoded, we always encode
366 * them. That's because we know we don't use those as reserved characters.
367 * If you do use those as reserved characters you have to encode each part
368 * separately.
370 char *
371 rfc1738_encode_mailto(char *s)
373 char *d, *ret = NULL;
375 if(s){
376 /* Worst case, encode every character */
377 ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char));
378 while(*s){
379 if(isalnum((unsigned char)*s)
380 || strchr(RFC1738_SAFE, *s)
381 || strchr(RFC1738_EXTRA, *s))
382 *d++ = *s++;
383 else{
384 *d++ = '%';
385 C2XPAIR(*s, d);
386 s++;
390 *d = '\0';
393 return(ret);
/*
 * * * * * * * * * RFC 1808 support routines * * * * * * * *
 */
/*
 * rfc1808_tokens - split a url into its component strings
 *
 * Args  url     -- the url to take apart; it is not modified
 *       scheme, net_loc, path, parms, query, frag
 *               -- each receives a cpystr()'d copy of the matching
 *                  component, and is left untouched when the component
 *                  is absent (presumably the caller initializes them
 *                  and later frees what was stored -- verify)
 *
 * Always returns 1.
 */
int
rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path,
               char **parms, char **query, char **frag)
{
    char *copy = cpystr(url);   /* scratch copy that gets chopped up */
    char *rest = copy;          /* start of the still unparsed text */
    char *sep, *scan;

    /* the fragment is everything after the first '#' */
    sep = strchr(rest, '#');
    if(sep != NULL){
        *sep++ = '\0';
        if(*sep)
          *frag = cpystr(sep);
    }

    /* a scheme is a non-empty run of scheme characters before a ':' */
    sep = strchr(rest, ':');
    if(sep != NULL && sep != rest){
        scan = rest;
        while(scan < sep
              && (isalnum((unsigned char) *scan) || strchr("+-.", *scan)))
          scan++;

        if(scan == sep){
            *sep++ = '\0';
            *scheme = cpystr(rest);
            rest = sep;
        }
    }

    /* net_loc follows "//" and runs to the next '/' or the end */
    if(rest[0] == '/' && rest[1] == '/'){
        sep = strchr(rest + 2, '/');
        if(sep != NULL)
          *sep++ = '\0';

        *net_loc = cpystr(rest + 2);
        if(sep != NULL)
          rest = sep;           /* resumes just past that '/' */
        else
          *rest = '\0';         /* End of parse */
    }

    sep = strchr(rest, '?');
    if(sep != NULL){
        *sep++ = '\0';
        *query = cpystr(sep);
    }

    sep = strchr(rest, ';');
    if(sep != NULL){
        *sep++ = '\0';
        *parms = cpystr(sep);
    }

    /* whatever remains is the path */
    if(*rest)
      *path = cpystr(rest);

    fs_give((void **) &copy);

    return(1);
}
/*
 * web_host_scan -- Scan the given line for possible web host names
 *
 * Looks for "www." or "web." (either case) that is not directly
 * preceded by an alphanumeric, '.', '-' or '/', and takes everything
 * rfc1738_scheme_part() accepts after it.
 *
 * Returns a pointer to the start of the name with its length in *len,
 * or NULL if there is none.  *len can be written for a rejected
 * candidate as well.
 *
 * NOTE: scan below is limited to DNS names ala RFC1034
 */
char *
web_host_scan(char *line, int *len)
{
    char prev = '\0';           /* character ahead of *line, if any */

    while(*line){
        if((line[0] == 'w' || line[0] == 'W')
           && !(isalnum((unsigned char) prev)
                || prev == '.' || prev == '-' || prev == '/')
           && (((line[1] == 'w' || line[1] == 'W')      /* "www." */
                && (line[2] == 'w' || line[2] == 'W'))
               || ((line[1] == 'e' || line[1] == 'E')   /* "web." */
                   && (line[2] == 'b' || line[2] == 'B')))
           && line[3] == '.'){
            char *tail = rfc1738_scheme_part(line + 3);

            *len = tail - line;
            if(*len > 4){               /* more than just the prefix */
                char final = line[*len - 1];

                /* Dread comma exception, see note in rfc1738_scan */
                if(strchr(",:", final)
                   || (final == '.'
                       && (line[*len] == '\0' || line[*len] == ' ')))
                  (*len)--;

                return(line);
            }

            line += 3;                  /* step over the bare prefix */
        }

        prev = *line++;
    }

    return(NULL);
}
/*
 * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
 *
 * Returns a pointer to the start of the address with its length in
 * *len, or NULL if the line holds nothing that looks like one.
 *
 * NOTE: Well, OK, not strictly addr-specs since there's a lot of junk
 *       we're trying to sift through and we'd like to minimize
 *       false-positive matches.
 */
char *
mail_addr_scan(char *line, int *len)
{
    char *at, *start, *end;

    /*
     * This is not the whole standards-based list, just the characters
     * likely to show up in an email address.  We don't want everything
     * because punctuation in the text might get mixed in with the
     * address.
     */
#define NONALPHANUMOK   ".-_+%/="

    /* process each '@' in the line */
    for(; (at = strindex(line, '@')) != NULL; line = end){
        end = at + 1;

        /* zero length local part? */
        if(at == line)
          continue;

        start = at - 1;
        if(!(isalnum((unsigned char) *start)
             || strchr(NONALPHANUMOK, *start)))
          continue;

        /*
         * Look to the left for the start of the local part.
         * NOTE: we're not doing quoted-strings
         */
        for(;; start--){
            if(!(isalnum((unsigned char) *start)
                 || strchr(NONALPHANUMOK, *start))){
                /* advance over bogus char, and erase leading punctuation */
                start++;
                while(*start && strchr(NONALPHANUMOK, *start))
                  start++;

                break;
            }

            if(start <= line)
              break;
        }

        /* anything left between there and the '@'? */
        if(at - start <= 0)
          continue;

        /* now make sure what follows the '@' is valid for a "domain" */
        if(*end == '['){                        /* domain literal */
            int dots = 3;

            for(++end; *end; end++){
                if(*end == ']'){
                    if(dots)
                      break;                    /* bogus */

                    *len = end - start + 1;
                    return(start);
                }

                if(*end == '.'){
                    if(--dots < 0)
                      break;                    /* bogus */
                }
                else if(!isdigit((unsigned char) *end))
                  break;                        /* bogus */
            }
        }
        else if(isalnum((unsigned char) *end)){ /* domain name? */
            do
              end++;
            while(*end && (isalnum((unsigned char) *end)
                           || *end == '-'
                           || *end == '.'
                           || *end == '_'));

            /* can't end with dash, dot or underscore */
            while(!isalnum((unsigned char) *(end - 1)))
              end--;

            *len = end - start;
            return(start);
        }
    }

    return(NULL);
}