1 #if !defined(lint) && !defined(DOS)
2 static char rcsid
[] = "$Id: url.c 769 2007-10-24 00:15:40Z hubert@u.washington.edu $";
6 * ========================================================================
7 * Copyright 2006-2007 University of Washington
8 * Copyright 2013-2014 Eduardo Chappa
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
14 * http://www.apache.org/licenses/LICENSE-2.0
16 * ========================================================================
19 #include "../pith/headers.h"
20 #include "../pith/url.h"
21 #include "../pith/mailview.h"
22 #include "../pith/string.h"
27 char *rfc1738_scheme_part(char *);
28 int rfc1738uchar(char *);
29 int rfc1738xchar(char *);
33 * * * * * * * * * RFC 1738 support routines * * * * * * * *
38 * Various helpful definitions
40 #define RFC1738_SAFE "$-_.+" /* "safe" */
41 #define RFC1738_EXTRA "!*'()," /* "extra" */
42 #define RFC1738_RSVP ";/?:@&=" /* "reserved" */
43 #define RFC1738_NEWS "-.+_" /* valid for "news:" URL */
44 #define RFC1738_FUDGE "#{}|\\^~[]" /* Unsafe, but popular */
45 #define RFC1738_ESC(S) (*(S) == '%' && isxpair((S) + 1))
49 * rfc1738_scan -- Scan the given line for possible URLs as defined
53 rfc1738_scan(char *line
, int *len
)
55 char *colon
, *start
, *end
;
58 /* process each : in the line */
59 for(; (colon
= strindex(line
, ':')) != NULL
; line
= end
){
61 if(colon
== line
) /* zero length scheme? */
65 * Valid URL (ala RFC1738 BNF)? First, look to the
66 * left to make sure there are valid "scheme" chars...
70 if(!(isdigit((unsigned char) *start
)
71 || isalpha((unsigned char) *start
)
72 || strchr("+-.", *start
))){
73 start
++; /* advance over bogus char */
82 * Make sure everything up to the colon is a known scheme...
84 if(start
&& (n
= colon
- start
) && !isdigit((unsigned char) *start
)
86 && (*start
== 'w' || *start
== 'W')
87 && (*(start
+1) == 's' || *(start
+1) == 'S'))
89 && (((*start
== 'F' || *start
== 'f')
90 && !struncmp(start
+1, "tp", 2))
92 ((*start
== 'w' || *start
== 'W')
93 && !struncmp(start
+1, "ss", 2))))
95 && (((*start
== 'H' || *start
== 'h')
96 && !struncmp(start
+ 1, "ttp", 3))
97 || ((*start
== 'N' || *start
== 'n')
98 && !struncmp(start
+ 1, "ews", 3))
99 || ((*start
== 'N' || *start
== 'n')
100 && !struncmp(start
+ 1, "ntp", 3))
101 || ((*start
== 'W' || *start
== 'w')
102 && !struncmp(start
+ 1, "ais", 3))
104 || ((*start
== 'L' || *start
== 'l')
105 && !struncmp(start
+ 1, "dap", 3))
107 || ((*start
== 'I' || *start
== 'i')
108 && !struncmp(start
+ 1, "map", 3))
109 || ((*start
== 'F' || *start
== 'f')
110 && !struncmp(start
+ 1, "ile", 3))))
112 && (*start
== 'H' || *start
== 'h')
113 && !struncmp(start
+1, "ttps", 4))
115 && (((*start
== 'G' || *start
== 'g')
116 && !struncmp(start
+1, "opher", 5))
117 || ((*start
== 'M' || *start
== 'm')
118 && !struncmp(start
+ 1, "ailto", 5))
119 || ((*start
== 'T' || *start
== 't')
120 && !struncmp(start
+ 1, "elnet", 5))))
122 && (*start
== 'P' || *start
== 'p')
123 && !struncmp(start
+ 1, "rospero", 7))
125 && (*start
== 'x' || *start
== 'X')
126 && !struncmp(start
+ 1, "-pine-help", 10))
128 && (*start
== 'x' || *start
== 'X')
129 && !struncmp(start
+ 1, "-alpine-help", 12)))
130 || url_external_specific_handler(start
, n
))){
132 * Second, make sure that everything to the right of the
133 * colon is valid for a "schemepart"...
136 if((end
= rfc1738_scheme_part(colon
+ 1)) - colon
> 1){
139 /* make sure something useful follows colon */
140 for(i
= 0, j
= end
- colon
; i
< j
; i
++)
141 if(!strchr(RFC1738_RSVP
, colon
[i
]))
148 * Special case handling for comma.
149 * See the problem is comma's valid, but if it's the
150 * last character in the url, it's likely intended
151 * as a delimiter in the text rather than part of the URL.
152 * In most cases any way, that's why we have the
156 || (*(end
- 1) == '.' && (!*end
|| *end
== ' ')))
159 if(*len
- (colon
- start
) > 0)
171 * rfc1738_scheme_part - make sure what's to the right of the
174 * NOTE: we have a problem matching closing parens when users
175 * bracket the url in parens. So, let's try terminating our
176 * match on any closing paren that doesn't have a corresponding
180 rfc1738_scheme_part(char *s
)
182 int n
, paren
= 0, bracket
= 0;
187 if((n
= rfc1738xchar(s
)) != 0){
226 * rfc1738_str - convert rfc1738 escaped octets in place
231 register char *p
= s
, *q
= s
;
252 * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
255 rfc1738uchar(char *s
)
257 int valid
= (RFC1738_ESC(s
)) /* "escape" */
259 : (isalnum((unsigned char) *s
) /* alphanumeric */
260 || strchr(RFC1738_SAFE
, *s
) /* other special stuff */
261 || strchr(RFC1738_EXTRA
, *s
));
269 cbuf
.cbufp
= cbuf
.cbuf
;
270 cbuf
.cbufend
= cbuf
.cbuf
;
272 for(t
= s
; t
&& *t
; t
++){
273 if(utf8_to_ucs4_oneatatime((unsigned char) *t
& 0xff, &cbuf
, &ucs
, NULL
)){
274 if ((ucs
>= 0x00A0 && ucs
<= 0xD7FF)
275 || (ucs
>= 0xE000 && ucs
<= 0xFDCF)
276 || (ucs
>= 0xFDF0 && ucs
<= 0xFFEF)
277 || (ucs
>= 0x10000 && ucs
<= 0x1FFFD)
278 || (ucs
>= 0x20000 && ucs
<= 0x2FFFD)
279 || (ucs
>= 0x30000 && ucs
<= 0x3FFFD)
280 || (ucs
>= 0x40000 && ucs
<= 0x4FFFD)
281 || (ucs
>= 0x50000 && ucs
<= 0x5FFFD)
282 || (ucs
>= 0x60000 && ucs
<= 0x6FFFD)
283 || (ucs
>= 0x70000 && ucs
<= 0x7FFFD)
284 || (ucs
>= 0x80000 && ucs
<= 0x8FFFD)
285 || (ucs
>= 0x90000 && ucs
<= 0x9FFFD)
286 || (ucs
>= 0xA0000 && ucs
<= 0xAFFFD)
287 || (ucs
>= 0xB0000 && ucs
<= 0xBFFFD)
288 || (ucs
>= 0xC0000 && ucs
<= 0xCFFFD)
289 || (ucs
>= 0xD0000 && ucs
<= 0xDFFFD)
290 || (ucs
>= 0xE0000 && ucs
<= 0xEFFFD)
291 || (ucs
>= 0xF0000 && ucs
<= 0xFFFFD)
292 || (ucs
>= 0x100000 && ucs
<= 0x10FFFD))
303 * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
306 rfc1738xchar(char *s
)
310 return((n
= rfc1738uchar(s
))
312 : (strchr(RFC1738_RSVP
, *s
) != NULL
313 || strchr(RFC1738_FUDGE
, *s
)));
318 * rfc1738_num - return long value of a string of digits, possibly escaped
321 rfc1738_num(char **s
)
323 register char *p
= *s
;
324 unsigned long n
= 0L;
327 if(*p
== '%' && isxpair(p
+1)){
329 if(isdigit((unsigned char) c
)){
330 n
= (c
- '0') + (n
* 10);
336 else if(isdigit((unsigned char) *p
))
337 n
= (*p
- '0') + (n
* 10);
347 rfc1738_group(char *s
)
349 return(isalnum((unsigned char) *s
)
351 || strchr(RFC1738_NEWS
, *s
));
356 * Encode (hexify) a mailto url.
360 * Returns An allocated string which is suitably encoded.
361 * Result should be freed by caller.
363 * Since we don't know here which characters are reserved characters (? and &)
364 * for use in delimiting the pieces of the url and which are just those
365 * characters contained in the data that should be encoded, we always encode
366 * them. That's because we know we don't use those as reserved characters.
367 * If you do use those as reserved characters you have to encode each part
371 rfc1738_encode_mailto(char *s
)
373 char *d
, *ret
= NULL
;
376 /* Worst case, encode every character */
377 ret
= d
= (char *)fs_get((3*strlen(s
) + 1) * sizeof(char));
379 if(isalnum((unsigned char)*s
)
380 || strchr(RFC1738_SAFE
, *s
)
381 || strchr(RFC1738_EXTRA
, *s
))
398 * * * * * * * * * RFC 1808 support routines * * * * * * * *
403 rfc1808_tokens(char *url
, char **scheme
, char **net_loc
, char **path
,
404 char **parms
, char **query
, char **frag
)
406 char *p
, *q
, *start
, *tmp
= cpystr(url
);
409 if((p
= strchr(start
, '#')) != NULL
){ /* fragment spec? */
415 if((p
= strchr(start
, ':')) && p
!= start
){ /* scheme part? */
416 for(q
= start
; q
< p
; q
++)
417 if(!(isdigit((unsigned char) *q
)
418 || isalpha((unsigned char) *q
)
419 || strchr("+-.", *q
)))
424 *scheme
= cpystr(start
);
429 if(*start
== '/' && *(start
+1) == '/'){ /* net_loc */
430 if((p
= strchr(start
+2, '/')) != NULL
)
433 *net_loc
= cpystr(start
+2);
436 else *start
= '\0'; /* End of parse */
439 if((p
= strchr(start
, '?')) != NULL
){
444 if((p
= strchr(start
, ';')) != NULL
){
450 *path
= cpystr(start
);
452 fs_give((void **) &tmp
);
460 * web_host_scan -- Scan the given line for possible web host names
462 * NOTE: scan below is limited to DNS names ala RFC1034
465 web_host_scan(char *line
, int *len
)
467 char *end
, last
= '\0';
469 for(; *line
; last
= *line
++)
470 if((*line
== 'w' || *line
== 'W')
471 && (!last
|| !(isalnum((unsigned char) last
)
472 || last
== '.' || last
== '-' || last
== '/'))
473 && (((*(line
+ 1) == 'w' || *(line
+ 1) == 'W') /* "www." */
474 && (*(line
+ 2) == 'w' || *(line
+ 2) == 'W'))
475 || ((*(line
+ 1) == 'e' || *(line
+ 1) == 'E') /* "web." */
476 && (*(line
+ 2) == 'b' || *(line
+ 2) == 'B')))
477 && (*(line
+ 3) == '.')){
478 end
= rfc1738_scheme_part(line
+ 3);
479 if((*len
= end
- line
) > ((*(line
+3) == '.') ? 4 : 3)){
480 /* Dread comma exception, see note in rfc1738_scan */
481 if(strchr(",:", *(line
+ (*len
) - 1))
482 || (*(line
+ (*len
) - 1) == '.'
483 && (!*(line
+ (*len
)) || *(line
+ (*len
)) == ' ')))
497 * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
499 * NOTE: Well, OK, not strictly addr-specs since there's a lot of junk
500 * we're trying to sift thru and we'd like to minimize false-pos
504 mail_addr_scan(char *line
, int *len
)
506 char *amp
, *start
, *end
;
508 * This list is not the whole standards-based list, this is just a list
509 * of likely email address characters. We don't want to include everything
510 * because punctuation in the text might get mixed in with the address.
512 #define NONALPHANUMOK ".-_+%/="
514 /* process each @ in the line */
515 for(; (amp
= strindex(line
, '@')) != NULL
; line
= end
){
517 /* zero length addr? */
518 if(amp
== line
|| !(isalnum((unsigned char) *(start
= amp
- 1))
519 || strchr(NONALPHANUMOK
, *start
)))
523 * Valid address (ala RFC822 BNF)? First, look to the
524 * left to make sure there are valid address chars...
527 /* NOTE: we're not doing quoted-strings */
528 if(!(isalnum((unsigned char) *start
) || strchr(NONALPHANUMOK
, *start
))){
529 /* advance over bogus char, and erase leading punctuation */
530 for(start
++; *start
&& strchr(NONALPHANUMOK
, *start
); start
++)
535 else if(start
> line
)
541 * Make sure there is something to the left of the '@'...
543 if(start
&& (amp
- start
) > 0){
545 * Second, make sure that everything to the right of
546 * amp is valid for a "domain"...
548 if(*(end
= amp
+ 1) == '['){ /* domain literal */
551 for(++end
; *end
; end
++)
554 *len
= end
- start
+ 1;
560 else if(*end
== '.'){
564 else if(!isdigit((unsigned char) *end
))
567 else if(isalnum((unsigned char) *end
)){ /* domain name? */
569 if(!(*end
&& (isalnum((unsigned char) *end
)
573 /* can't end with dash, dot or underscore */
574 while(!isalnum((unsigned char) *(end
- 1)))