Add support for tab-completion when selecting by rule
[alpine.git] / pith / url.c
blobd7c34fda0d17567d475c93f655f2f09cd623d6ac
1 /*
2 * ========================================================================
3 * Copyright 2006-2007 University of Washington
4 * Copyright 2013-2022 Eduardo Chappa
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * ========================================================================
15 #include "../pith/headers.h"
16 #include "../pith/url.h"
17 #include "../pith/mailview.h"
18 #include "../pith/string.h"
/*
 * Internal prototypes
 */
char	*rfc1738_scheme_part(char *s);
int	 rfc1738uchar(char *s);
int	 rfc1738xchar(char *s);


/*
 *  * * * * * * * *      RFC 1738 support routines     * * * * * * * *
 */

/*
 * Character classes used by the URL scanners below.  Each string macro
 * is a set of characters meant to be searched with strchr().
 */
#define	RFC1738_SAFE	"$-_.+"			/* "safe" */
#define	RFC1738_EXTRA	"!*'(),"		/* "extra" */
#define	RFC1738_RSVP	";/?:@&="		/* "reserved" */
#define	RFC1738_NEWS	"-.+_"			/* valid for "news:" URL */
#define	RFC1738_FUDGE	"#{}|\\^~[]"		/* Unsafe, but popular */

/*
 * True when S points at a '%' whose following characters satisfy
 * isxpair() (presumably a pair of hex digits -- isxpair is defined
 * elsewhere).
 */
#define	RFC1738_ESC(S)	(*(S) == '%' && isxpair((S) + 1))
45 * rfc1738_scan -- Scan the given line for possible URLs as defined
46 * in RFC1738
48 char *
49 rfc1738_scan(char *line, int *len)
51 char *colon, *start, *end;
52 int n;
54 /* process each : in the line */
55 for(; (colon = strindex(line, ':')) != NULL; line = end){
56 end = colon + 1;
57 if(colon == line) /* zero length scheme? */
58 continue;
61 * Valid URL (ala RFC1738 BNF)? First, first look to the
62 * left to make sure there are valid "scheme" chars...
64 start = colon - 1;
65 while(1)
66 if(!(isdigit((unsigned char) *start)
67 || isalpha((unsigned char) *start)
68 || strchr("+-.", *start))){
69 start++; /* advance over bogus char */
70 break;
72 else if(start > line)
73 start--;
74 else
75 break;
78 * Make sure everything up to the colon is a known scheme...
80 if(start && (n = colon - start) && !isdigit((unsigned char) *start)
81 && (((n == 2
82 && (*start == 'w' || *start == 'W')
83 && (*(start+1) == 's' || *(start+1) == 'S'))
84 || (n == 3
85 && (((*start == 'F' || *start == 'f')
86 && !struncmp(start+1, "tp", 2))
88 ((*start == 'w' || *start == 'W')
89 && !struncmp(start+1, "ss", 2))))
90 || (n == 4
91 && (((*start == 'H' || *start == 'h')
92 && !struncmp(start + 1, "ttp", 3))
93 || ((*start == 'N' || *start == 'n')
94 && !struncmp(start + 1, "ews", 3))
95 || ((*start == 'N' || *start == 'n')
96 && !struncmp(start + 1, "ntp", 3))
97 || ((*start == 'W' || *start == 'w')
98 && !struncmp(start + 1, "ais", 3))
99 #ifdef ENABLE_LDAP
100 || ((*start == 'L' || *start == 'l')
101 && !struncmp(start + 1, "dap", 3))
102 #endif
103 || ((*start == 'I' || *start == 'i')
104 && !struncmp(start + 1, "map", 3))
105 || ((*start == 'F' || *start == 'f')
106 && !struncmp(start + 1, "ile", 3))))
107 || (n == 5
108 && (*start == 'H' || *start == 'h')
109 && !struncmp(start+1, "ttps", 4))
110 || (n == 6
111 && (((*start == 'G' || *start == 'g')
112 && !struncmp(start+1, "opher", 5))
113 || ((*start == 'M' || *start == 'm')
114 && !struncmp(start + 1, "ailto", 5))
115 || ((*start == 'T' || *start == 't')
116 && !struncmp(start + 1, "elnet", 5))))
117 || (n == 8
118 && (*start == 'P' || *start == 'p')
119 && !struncmp(start + 1, "rospero", 7))
120 || (n == 11
121 && (*start == 'x' || *start == 'X')
122 && !struncmp(start + 1, "-pine-help", 10))
123 || (n == 13
124 && (*start == 'x' || *start == 'X')
125 && !struncmp(start + 1, "-alpine-help", 12)))
126 || url_external_specific_handler(start, n))){
128 * Second, make sure that everything to the right of the
129 * colon is valid for a "schemepart"...
132 if((end = rfc1738_scheme_part(colon + 1)) - colon > 1){
133 int i, j;
135 /* make sure something useful follows colon */
136 for(i = 0, j = end - colon; i < j; i++)
137 if(!strchr(RFC1738_RSVP, colon[i]))
138 break;
140 if(i != j){
141 *len = end - start;
144 * Special case handling for comma.
145 * See the problem is comma's valid, but if it's the
146 * last character in the url, it's likely intended
147 * as a delimiter in the text rather part of the URL.
148 * In most cases any way, that's why we have the
149 * exception.
151 if(*(end - 1) == ','
152 || (*(end - 1) == '.' && (!*end || *end == ' ')))
153 (*len)--;
155 if(*len - (colon - start) > 0)
156 return(start);
162 return(NULL);
/*
 * rfc1738_scheme_part - find where the text to the right of a URL's
 *			 colon stops being valid
 *
 * Args  s -- first character to examine
 *
 * Returns a pointer to the first character that cannot belong to the
 * scheme part: the terminating NUL, a character rfc1738xchar() rejects,
 * or a ')' or ']' with no matching opener seen earlier in the scan.
 *
 * NOTE: users often wrap a URL in parentheses or brackets, so a closer
 *	 is only swallowed when its opener was part of the match.
 */
char *
rfc1738_scheme_part(char *s)
{
    int open_parens = 0, open_brackets = 0, step;

    for(;;){
	if(*s == '\0')
	  return(s);

	if(*s == '['){
	    open_brackets++;
	    s++;
	}
	else if(*s == ']'){
	    if(open_brackets-- == 0)	/* unbalanced: not ours */
	      return(s);

	    s++;
	}
	else if(*s == '('){
	    open_parens++;
	    s++;
	}
	else if(*s == ')'){
	    if(open_parens-- == 0)	/* unbalanced: not ours */
	      return(s);

	    s++;
	}
	else{
	    /* rfc1738xchar() says how far to advance, or 0 to stop */
	    if((step = rfc1738xchar(s)) == 0)
	      return(s);

	    s += step;
	}
    }
}
/*
 * rfc1738_str - convert rfc1738 escaped octets in place
 *
 * Args  s -- NUL-terminated string, overwritten with the decoded text
 *
 * Every '%' followed by a pair accepted by isxpair() is replaced by the
 * single character X2C() produces for that pair; all other characters,
 * including a '%' that does not start a valid escape, are copied as is.
 *
 * Returns s.
 */
char *
rfc1738_str(char *s)
{
    char *src = s, *dst = s;

    for(;;){
	char c = *src++;

	*dst = c;
	if(c == '\0')
	  return(s);

	if(c == '%' && isxpair(src)){
	    *dst = X2C(src);		/* overwrite the '%' just stored */
	    src += 2;
	}

	dst++;
    }
}
248 * rfc1738uchar - returns TRUE if the given char fits RFC 1738 "uchar" BNF
251 rfc1738uchar(char *s)
253 int valid = (RFC1738_ESC(s)) /* "escape" */
255 : (isalnum((unsigned char) *s) /* alphanumeric */
256 || strchr(RFC1738_SAFE, *s) /* other special stuff */
257 || strchr(RFC1738_EXTRA, *s));
259 if(!valid){
260 char *t;
261 UCS ucs;
262 CBUF_S cbuf;
264 cbuf.cbuf[0] = '\0';
265 cbuf.cbufp = cbuf.cbuf;
266 cbuf.cbufend = cbuf.cbuf;
268 for(t = s; t && *t; t++){
269 if(utf8_to_ucs4_oneatatime((unsigned char) *t & 0xff, &cbuf, &ucs, NULL)){
270 if ((ucs >= 0x00A0 && ucs <= 0xD7FF)
271 || (ucs >= 0xE000 && ucs <= 0xFDCF)
272 || (ucs >= 0xFDF0 && ucs <= 0xFFEF)
273 || (ucs >= 0x10000 && ucs <= 0x1FFFD)
274 || (ucs >= 0x20000 && ucs <= 0x2FFFD)
275 || (ucs >= 0x30000 && ucs <= 0x3FFFD)
276 || (ucs >= 0x40000 && ucs <= 0x4FFFD)
277 || (ucs >= 0x50000 && ucs <= 0x5FFFD)
278 || (ucs >= 0x60000 && ucs <= 0x6FFFD)
279 || (ucs >= 0x70000 && ucs <= 0x7FFFD)
280 || (ucs >= 0x80000 && ucs <= 0x8FFFD)
281 || (ucs >= 0x90000 && ucs <= 0x9FFFD)
282 || (ucs >= 0xA0000 && ucs <= 0xAFFFD)
283 || (ucs >= 0xB0000 && ucs <= 0xBFFFD)
284 || (ucs >= 0xC0000 && ucs <= 0xCFFFD)
285 || (ucs >= 0xD0000 && ucs <= 0xDFFFD)
286 || (ucs >= 0xE0000 && ucs <= 0xEFFFD)
287 || (ucs >= 0xF0000 && ucs <= 0xFFFFD)
288 || (ucs >= 0x100000 && ucs <= 0x10FFFD))
289 valid = t-s+1;
290 break;
294 return valid;
299 * rfc1738xchar - returns TRUE if the given char fits RFC 1738 "xchar" BNF
302 rfc1738xchar(char *s)
304 int n;
306 return((n = rfc1738uchar(s))
308 : (strchr(RFC1738_RSVP, *s) != NULL
309 || strchr(RFC1738_FUDGE, *s)));
/*
 * rfc1738_num - return long value of a string of digits, possibly escaped
 *
 * Args  s -- in: address of a pointer to the first character to read
 *	      out: the pointer is advanced to the first character that
 *		   was not consumed
 *
 * A digit may appear literally or as a %-escape that X2C() turns into
 * a digit.  Conversion stops at the first character, literal or
 * escaped, that is not a decimal digit.
 *
 * Returns the accumulated value, 0 if no digit was read.
 */
unsigned long
rfc1738_num(char **s)
{
    char	  *cur = *s;
    unsigned long  total = 0L;

    while(*cur){
	int ch, width;

	if(*cur == '%' && isxpair(cur + 1)){
	    ch    = X2C(cur + 1);
	    width = 3;			/* '%' plus the pair */
	}
	else{
	    ch    = *cur;
	    width = 1;
	}

	if(!isdigit((unsigned char) ch))
	  break;

	total = (ch - '0') + (total * 10);
	cur  += width;
    }

    *s = cur;
    return(total);
}
343 rfc1738_group(char *s)
345 return(isalnum((unsigned char) *s)
346 || RFC1738_ESC(s)
347 || strchr(RFC1738_NEWS, *s));
352 * Encode (hexify) a mailto url.
354 * Args s -- src url
356 * Returns An allocated string which is suitably encoded.
357 * Result should be freed by caller.
359 * Since we don't know here which characters are reserved characters (? and &)
360 * for use in delimiting the pieces of the url and which are just those
361 * characters contained in the data that should be encoded, we always encode
362 * them. That's because we know we don't use those as reserved characters.
363 * If you do use those as reserved characters you have to encode each part
364 * separately.
366 char *
367 rfc1738_encode_mailto(char *s)
369 char *d, *ret = NULL;
371 if(s){
372 /* Worst case, encode every character */
373 ret = d = (char *)fs_get((3*strlen(s) + 1) * sizeof(char));
374 while(*s){
375 if(isalnum((unsigned char)*s)
376 || strchr(RFC1738_SAFE, *s)
377 || strchr(RFC1738_EXTRA, *s))
378 *d++ = *s++;
379 else{
380 *d++ = '%';
381 C2XPAIR(*s, d);
382 s++;
386 *d = '\0';
389 return(ret);
394 * * * * * * * * * RFC 1808 support routines * * * * * * * *
/*
 * rfc1808_tokens - split a URL into its RFC 1808 components
 *
 * Args  url     -- the URL to split; it is not modified
 *	 scheme, net_loc, path, parms, query, frag
 *		 -- each receives a cpystr() copy of the corresponding
 *		    component when that component is present
 *
 * An output is only written when its component is found, so the caller
 * presumably has to initialize all six beforehand -- confirm against
 * callers.  Details worth knowing:
 *   - frag is set only when something follows the '#';
 *   - query and parms are set whenever their delimiter is present,
 *     possibly to an empty string;
 *   - when a net_loc is followed by a path, the '/' that separates
 *     them is consumed and does not appear in path.
 *
 * Returns 1 always.
 */
int
rfc1808_tokens(char *url, char **scheme, char **net_loc, char **path,
	       char **parms, char **query, char **frag)
{
    char *copy = cpystr(url);		/* scratch copy, chopped up below */
    char *rest = copy;			/* what is still unparsed */
    char *mark, *scan;

    /* fragment spec? */
    mark = strchr(rest, '#');
    if(mark != NULL){
	*mark++ = '\0';
	if(*mark)
	  *frag = cpystr(mark);
    }

    /* scheme part?  only if all that precedes the ':' is scheme chars */
    mark = strchr(rest, ':');
    if(mark && mark != rest){
	scan = rest;
	while(scan < mark
	      && (isalnum((unsigned char) *scan) || strchr("+-.", *scan)))
	  scan++;

	if(scan == mark){
	    *mark++ = '\0';
	    *scheme = cpystr(rest);
	    rest = mark;
	}
    }

    /* net_loc */
    if(rest[0] == '/' && rest[1] == '/'){
	mark = strchr(rest + 2, '/');
	if(mark != NULL)
	  *mark++ = '\0';

	*net_loc = cpystr(rest + 2);
	if(mark)
	  rest = mark;
	else
	  *rest = '\0';			/* End of parse */
    }

    mark = strchr(rest, '?');
    if(mark != NULL){
	*mark++ = '\0';
	*query = cpystr(mark);
    }

    mark = strchr(rest, ';');
    if(mark != NULL){
	*mark++ = '\0';
	*parms = cpystr(mark);
    }

    if(*rest)
      *path = cpystr(rest);

    fs_give((void **) &copy);

    return(1);
}
/*
 * web_host_scan -- Scan the given line for possible web host names
 *
 * Args  line -- NUL-terminated text to search
 *	 len  -- set to the length of the match when one is returned
 *		 (it is also overwritten for candidates that are then
 *		 rejected as too short)
 *
 * A candidate is "www." or "web." in any letter case, not immediately
 * preceded by an alphanumeric, '.', '-' or '/'.  It extends as far as
 * rfc1738_scheme_part() accepts and must be longer than the four
 * character prefix itself.
 *
 * Returns a pointer into line at the start of the host name, or NULL.
 *
 * NOTE: scan below is limited to DNS names ala RFC1034
 */
char *
web_host_scan(char *line, int *len)
{
    char *tail, prev = '\0';

    for(; *line; prev = *line++){
	if(*line != 'w' && *line != 'W')
	  continue;

	/* must not be glued onto the end of a longer token */
	if(prev && (isalnum((unsigned char) prev)
		    || prev == '.' || prev == '-' || prev == '/'))
	  continue;

	if(!(((line[1] == 'w' || line[1] == 'W')	/* "www." */
	      && (line[2] == 'w' || line[2] == 'W'))
	     || ((line[1] == 'e' || line[1] == 'E')	/* "web." */
		 && (line[2] == 'b' || line[2] == 'B'))))
	  continue;

	if(line[3] != '.')
	  continue;

	tail = rfc1738_scheme_part(line + 3);
	*len = tail - line;
	if(*len > 4){
	    char final = line[*len - 1];

	    /*
	     * Drop a trailing ',' or ':', or a trailing '.' that ends
	     * the line or is followed by a space: that is punctuation
	     * from the surrounding text, not part of the name.
	     */
	    if(strchr(",:", final)
	       || (final == '.'
		   && (line[*len] == '\0' || line[*len] == ' ')))
	      (*len)--;

	    return(line);
	}

	line += 3;			/* nothing after the prefix */
    }

    return(NULL);
}
/*
 * mail_addr_scan -- Scan the given line for possible RFC822 addr-spec's
 *
 * Args  line -- NUL-terminated text to search
 *	 len  -- set to the length of the address when one is returned
 *
 * Returns a pointer into line at the first character of the address,
 * or NULL if no '@' in the line has an acceptable local part on its
 * left and a domain name or bracketed dotted-quad on its right.
 *
 * NOTE: Well, OK, not strictly addr-specs since there's a lot of junk
 *	 we're trying to sift thru and we'd like to minimize false-pos
 *	 matches.  Quoted-strings are not handled.
 */
char *
mail_addr_scan(char *line, int *len)
{
    char *at, *first, *stop;

    /*
     * This list is not the whole standards-based list, this is just a list
     * of likely email address characters. We don't want to include everything
     * because punctuation in the text might get mixed in with the address.
     */
#define NONALPHANUMOK ".-_+%/="

    /* process each @ in the line */
    for(; (at = strindex(line, '@')) != NULL; line = stop){
	stop = at + 1;			/* default resume point */

	/* zero length local part? */
	if(at == line)
	  continue;

	first = at - 1;
	if(!(isalnum((unsigned char) *first)
	     || strchr(NONALPHANUMOK, *first)))
	  continue;

	/*
	 * Walk left across likely local-part characters.  When the walk
	 * is stopped by some other character, step back over it and
	 * over any punctuation that follows it.  When the walk reaches
	 * the start of the line instead, nothing is trimmed.
	 */
	for(;;){
	    if(!(isalnum((unsigned char) *first)
		 || strchr(NONALPHANUMOK, *first))){
		do
		  first++;
		while(*first && strchr(NONALPHANUMOK, *first));

		break;
	    }

	    if(first == line)
	      break;

	    first--;
	}

	/* trimming may have left nothing in front of the '@' */
	if(at - first <= 0)
	  continue;

	if(*stop == '['){		/* domain literal */
	    int dots_left = 3;

	    /* digits and exactly three dots, closed by ']' */
	    for(++stop; *stop; stop++){
		if(*stop == ']'){
		    if(dots_left == 0){
			*len = stop - first + 1;
			return(first);
		    }

		    break;		/* bogus */
		}

		if(*stop == '.'){
		    if(--dots_left < 0)
		      break;		/* bogus */
		}
		else if(!isdigit((unsigned char) *stop))
		  break;		/* bogus */
	    }
	}
	else if(isalnum((unsigned char) *stop)){ /* domain name? */
	    do
	      stop++;
	    while(*stop && (isalnum((unsigned char) *stop)
			    || *stop == '-'
			    || *stop == '.'
			    || *stop == '_'));

	    /* can't end with dash, dot or underscore */
	    while(!isalnum((unsigned char) stop[-1]))
	      stop--;

	    *len = stop - first;
	    return(first);
	}
    }

    return(NULL);
}