2 * ========================================================================
3 * Copyright 2006-2007 University of Washington
4 * Copyright 2013-2022 Eduardo Chappa
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * ========================================================================
15 #include "../pith/headers.h"
16 #include "../pith/url.h"
17 #include "../pith/mailview.h"
18 #include "../pith/string.h"
23 char *rfc1738_scheme_part(char *);
24 int rfc1738uchar(char *);
25 int rfc1738xchar(char *);
29 * * * * * * * * * RFC 1738 support routines * * * * * * * *
34 * Various helpful definitions
36 #define RFC1738_SAFE "$-_.+" /* "safe" */
37 #define RFC1738_EXTRA "!*'()," /* "extra" */
38 #define RFC1738_RSVP ";/?:@&=" /* "reserved" */
39 #define RFC1738_NEWS "-.+_" /* valid for "news:" URL */
40 #define RFC1738_FUDGE "#{}|\\^~[]" /* Unsafe, but popular */
41 #define RFC1738_ESC(S) (*(S) == '%' && isxpair((S) + 1))
45 * rfc1738_scan -- Scan the given line for possible URLs as defined
49 rfc1738_scan(char *line
, int *len
)
51 char *colon
, *start
, *end
;
54 /* process each : in the line */
55 for(; (colon
= strindex(line
, ':')) != NULL
; line
= end
){
57 if(colon
== line
) /* zero length scheme? */
61 * Valid URL (ala RFC1738 BNF)? First, look to the
62 * left to make sure there are valid "scheme" chars...
66 if(!(isdigit((unsigned char) *start
)
67 || isalpha((unsigned char) *start
)
68 || strchr("+-.", *start
))){
69 start
++; /* advance over bogus char */
78 * Make sure everything up to the colon is a known scheme...
80 if(start
&& (n
= colon
- start
) && !isdigit((unsigned char) *start
)
82 && (*start
== 'w' || *start
== 'W')
83 && (*(start
+1) == 's' || *(start
+1) == 'S'))
85 && (((*start
== 'F' || *start
== 'f')
86 && !struncmp(start
+1, "tp", 2))
88 ((*start
== 'w' || *start
== 'W')
89 && !struncmp(start
+1, "ss", 2))))
91 && (((*start
== 'H' || *start
== 'h')
92 && !struncmp(start
+ 1, "ttp", 3))
93 || ((*start
== 'N' || *start
== 'n')
94 && !struncmp(start
+ 1, "ews", 3))
95 || ((*start
== 'N' || *start
== 'n')
96 && !struncmp(start
+ 1, "ntp", 3))
97 || ((*start
== 'W' || *start
== 'w')
98 && !struncmp(start
+ 1, "ais", 3))
100 || ((*start
== 'L' || *start
== 'l')
101 && !struncmp(start
+ 1, "dap", 3))
103 || ((*start
== 'I' || *start
== 'i')
104 && !struncmp(start
+ 1, "map", 3))
105 || ((*start
== 'F' || *start
== 'f')
106 && !struncmp(start
+ 1, "ile", 3))))
108 && (*start
== 'H' || *start
== 'h')
109 && !struncmp(start
+1, "ttps", 4))
111 && (((*start
== 'G' || *start
== 'g')
112 && !struncmp(start
+1, "opher", 5))
113 || ((*start
== 'M' || *start
== 'm')
114 && !struncmp(start
+ 1, "ailto", 5))
115 || ((*start
== 'T' || *start
== 't')
116 && !struncmp(start
+ 1, "elnet", 5))))
118 && (*start
== 'P' || *start
== 'p')
119 && !struncmp(start
+ 1, "rospero", 7))
121 && (*start
== 'x' || *start
== 'X')
122 && !struncmp(start
+ 1, "-pine-help", 10))
124 && (*start
== 'x' || *start
== 'X')
125 && !struncmp(start
+ 1, "-alpine-help", 12)))
126 || url_external_specific_handler(start
, n
))){
128 * Second, make sure that everything to the right of the
129 * colon is valid for a "schemepart"...
132 if((end
= rfc1738_scheme_part(colon
+ 1)) - colon
> 1){
135 /* make sure something useful follows colon */
136 for(i
= 0, j
= end
- colon
; i
< j
; i
++)
137 if(!strchr(RFC1738_RSVP
, colon
[i
]))
144 * Special case handling for comma.
145 * See the problem is comma's valid, but if it's the
146 * last character in the url, it's likely intended
147 * as a delimiter in the text rather than part of the URL.
148 * In most cases any way, that's why we have the
152 || (*(end
- 1) == '.' && (!*end
|| *end
== ' ')))
155 if(*len
- (colon
- start
) > 0)
167 * rfc1738_scheme_part - make sure what's to the right of the
170 * NOTE: we have a problem matching closing parens when users
171 * bracket the url in parens. So, let's try terminating our
172 * match on any closing paren that doesn't have a corresponding
176 rfc1738_scheme_part(char *s
)
178 int n
, paren
= 0, bracket
= 0;
183 if((n
= rfc1738xchar(s
)) != 0){
222 * rfc1738_str - convert rfc1738 escaped octets in place
227 register char *p
= s
, *q
= s
;
248 * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
251 rfc1738uchar(char *s
)
253 int valid
= (RFC1738_ESC(s
)) /* "escape" */
255 : (isalnum((unsigned char) *s
) /* alphanumeric */
256 || strchr(RFC1738_SAFE
, *s
) /* other special stuff */
257 || strchr(RFC1738_EXTRA
, *s
));
265 cbuf
.cbufp
= cbuf
.cbuf
;
266 cbuf
.cbufend
= cbuf
.cbuf
;
268 for(t
= s
; t
&& *t
; t
++){
269 if(utf8_to_ucs4_oneatatime((unsigned char) *t
& 0xff, &cbuf
, &ucs
, NULL
)){
270 if ((ucs
>= 0x00A0 && ucs
<= 0xD7FF)
271 || (ucs
>= 0xE000 && ucs
<= 0xFDCF)
272 || (ucs
>= 0xFDF0 && ucs
<= 0xFFEF)
273 || (ucs
>= 0x10000 && ucs
<= 0x1FFFD)
274 || (ucs
>= 0x20000 && ucs
<= 0x2FFFD)
275 || (ucs
>= 0x30000 && ucs
<= 0x3FFFD)
276 || (ucs
>= 0x40000 && ucs
<= 0x4FFFD)
277 || (ucs
>= 0x50000 && ucs
<= 0x5FFFD)
278 || (ucs
>= 0x60000 && ucs
<= 0x6FFFD)
279 || (ucs
>= 0x70000 && ucs
<= 0x7FFFD)
280 || (ucs
>= 0x80000 && ucs
<= 0x8FFFD)
281 || (ucs
>= 0x90000 && ucs
<= 0x9FFFD)
282 || (ucs
>= 0xA0000 && ucs
<= 0xAFFFD)
283 || (ucs
>= 0xB0000 && ucs
<= 0xBFFFD)
284 || (ucs
>= 0xC0000 && ucs
<= 0xCFFFD)
285 || (ucs
>= 0xD0000 && ucs
<= 0xDFFFD)
286 || (ucs
>= 0xE0000 && ucs
<= 0xEFFFD)
287 || (ucs
>= 0xF0000 && ucs
<= 0xFFFFD)
288 || (ucs
>= 0x100000 && ucs
<= 0x10FFFD))
299 * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
302 rfc1738xchar(char *s
)
306 return((n
= rfc1738uchar(s
))
308 : (strchr(RFC1738_RSVP
, *s
) != NULL
309 || strchr(RFC1738_FUDGE
, *s
)));
314 * rfc1738_num - return long value of a string of digits, possibly escaped
317 rfc1738_num(char **s
)
319 register char *p
= *s
;
320 unsigned long n
= 0L;
323 if(*p
== '%' && isxpair(p
+1)){
325 if(isdigit((unsigned char) c
)){
326 n
= (c
- '0') + (n
* 10);
332 else if(isdigit((unsigned char) *p
))
333 n
= (*p
- '0') + (n
* 10);
343 rfc1738_group(char *s
)
345 return(isalnum((unsigned char) *s
)
347 || strchr(RFC1738_NEWS
, *s
));
352 * Encode (hexify) a mailto url.
356 * Returns An allocated string which is suitably encoded.
357 * Result should be freed by caller.
359 * Since we don't know here which characters are reserved characters (? and &)
360 * for use in delimiting the pieces of the url and which are just those
361 * characters contained in the data that should be encoded, we always encode
362 * them. That's because we know we don't use those as reserved characters.
363 * If you do use those as reserved characters you have to encode each part
367 rfc1738_encode_mailto(char *s
)
369 char *d
, *ret
= NULL
;
372 /* Worst case, encode every character */
373 ret
= d
= (char *)fs_get((3*strlen(s
) + 1) * sizeof(char));
375 if(isalnum((unsigned char)*s
)
376 || strchr(RFC1738_SAFE
, *s
)
377 || strchr(RFC1738_EXTRA
, *s
))
394 * * * * * * * * * RFC 1808 support routines * * * * * * * *
399 rfc1808_tokens(char *url
, char **scheme
, char **net_loc
, char **path
,
400 char **parms
, char **query
, char **frag
)
402 char *p
, *q
, *start
, *tmp
= cpystr(url
);
405 if((p
= strchr(start
, '#')) != NULL
){ /* fragment spec? */
411 if((p
= strchr(start
, ':')) && p
!= start
){ /* scheme part? */
412 for(q
= start
; q
< p
; q
++)
413 if(!(isdigit((unsigned char) *q
)
414 || isalpha((unsigned char) *q
)
415 || strchr("+-.", *q
)))
420 *scheme
= cpystr(start
);
425 if(*start
== '/' && *(start
+1) == '/'){ /* net_loc */
426 if((p
= strchr(start
+2, '/')) != NULL
)
429 *net_loc
= cpystr(start
+2);
432 else *start
= '\0'; /* End of parse */
435 if((p
= strchr(start
, '?')) != NULL
){
440 if((p
= strchr(start
, ';')) != NULL
){
446 *path
= cpystr(start
);
448 fs_give((void **) &tmp
);
456 * web_host_scan -- Scan the given line for possible web host names
458 * NOTE: scan below is limited to DNS names ala RFC1034
461 web_host_scan(char *line
, int *len
)
463 char *end
, last
= '\0';
465 for(; *line
; last
= *line
++)
466 if((*line
== 'w' || *line
== 'W')
467 && (!last
|| !(isalnum((unsigned char) last
)
468 || last
== '.' || last
== '-' || last
== '/'))
469 && (((*(line
+ 1) == 'w' || *(line
+ 1) == 'W') /* "www." */
470 && (*(line
+ 2) == 'w' || *(line
+ 2) == 'W'))
471 || ((*(line
+ 1) == 'e' || *(line
+ 1) == 'E') /* "web." */
472 && (*(line
+ 2) == 'b' || *(line
+ 2) == 'B')))
473 && (*(line
+ 3) == '.')){
474 end
= rfc1738_scheme_part(line
+ 3);
475 if((*len
= end
- line
) > ((*(line
+3) == '.') ? 4 : 3)){
476 /* Dread comma exception, see note in rfc1738_scan */
477 if(strchr(",:", *(line
+ (*len
) - 1))
478 || (*(line
+ (*len
) - 1) == '.'
479 && (!*(line
+ (*len
)) || *(line
+ (*len
)) == ' ')))
493 * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
495 * NOTE: Well, OK, not strictly addr-specs since there's a lot of junk
496 * we're trying to sift through and we'd like to minimize false-pos
500 mail_addr_scan(char *line
, int *len
)
502 char *amp
, *start
, *end
;
504 * This list is not the whole standards-based list, this is just a list
505 * of likely email address characters. We don't want to include everything
506 * because punctuation in the text might get mixed in with the address.
508 #define NONALPHANUMOK ".-_+%/="
510 /* process each @ in the line */
511 for(; (amp
= strindex(line
, '@')) != NULL
; line
= end
){
513 /* zero length addr? */
514 if(amp
== line
|| !(isalnum((unsigned char) *(start
= amp
- 1))
515 || strchr(NONALPHANUMOK
, *start
)))
519 * Valid address (ala RFC822 BNF)? First, look to the
520 * left to make sure there are valid local-part chars...
523 /* NOTE: we're not doing quoted-strings */
524 if(!(isalnum((unsigned char) *start
) || strchr(NONALPHANUMOK
, *start
))){
525 /* advance over bogus char, and erase leading punctuation */
526 for(start
++; *start
&& strchr(NONALPHANUMOK
, *start
); start
++)
531 else if(start
> line
)
537 * Make sure there is a non-empty local part to the left of the '@'...
539 if(start
&& (amp
- start
) > 0){
541 * Second, make sure that everything to the right of
542 * amp is valid for a "domain"...
544 if(*(end
= amp
+ 1) == '['){ /* domain literal */
547 for(++end
; *end
; end
++)
550 *len
= end
- start
+ 1;
556 else if(*end
== '.'){
560 else if(!isdigit((unsigned char) *end
))
563 else if(isalnum((unsigned char) *end
)){ /* domain name? */
565 if(!(*end
&& (isalnum((unsigned char) *end
)
569 /* can't end with dash, dot or underscore */
570 while(!isalnum((unsigned char) *(end
- 1)))