expand() filenames given to the ~@ tilde escape..
[s-mailx.git] / junk.c
blob4921231665188aa73acdb9c857bdb32aa5e88d2c
1 /*
2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 */
6 /*
7 * Copyright (c) 2004
8 * Gunnar Ritter. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by Gunnar Ritter
21 * and his contributors.
22 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
39 #ifndef lint
40 #ifdef DOSCCS
41 static char sccsid[] = "@(#)junk.c 1.75 (gritter) 9/14/08";
42 #endif
43 #endif /* not lint */
45 #include "config.h"
46 #include "rcv.h"
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #include <limits.h>
51 #include <time.h>
52 #include <unistd.h>
53 #include <utime.h>
55 #ifdef HAVE_MMAP
56 #include <sys/mman.h>
57 #else /* !HAVE_MMAP */
58 #define mmap(a, b, c, d, e, f) MAP_FAILED
59 #define munmap(a, b)
60 #endif /* !HAVE_MMAP */
61 #ifndef HAVE_MREMAP
62 #define mremap(a, b, c, d) MAP_FAILED
63 #endif /* !HAVE_MREMAP */
65 #ifndef MAP_FAILED
66 #define MAP_FAILED ((void *)-1)
67 #endif /* !MAP_FAILED */
69 #include "extern.h"
70 #include "md5.h"
73 * Mail -- a mail program
75 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
76 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
77 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
79 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
80 * Classification using Chained Tokens", February 2004,
81 * <http://www.nuclearelephant.com/papers/chained.html>.
84 #define DFL .40
85 #define THR .9
86 #define MID .5
88 #define MAX2 0x0000ffff
89 #define MAX3 0x00ffffffUL
90 #define MAX4 0xffffffffUL
93 * The dictionary consists of two files forming a hash table. The hash
94 * consists of the first 56 bits of the result of applying MD5 to the
95 * input word. This scheme ensures that collisions are unlikely enough
96 * to make junk detection work; according to the birthday paradox, a
97 * 50 % probability for one single collision is reached at 2^28 entries.
99 * To make the chain structure independent from input, the MD5 input is
100 * xor'ed with a random number. This makes it impossible that someone uses
101 * a carefully crafted message for a denial-of-service attack against the
102 * database.
104 #define SIZEOF_node 17
105 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
106 #define OF_node_next 4 /* bit-negated table index of next node */
107 #define OF_node_good 8 /* number of times this appeared in good msgs */
108 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
109 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
110 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
111 static char *nodes;
113 #define SIZEOF_super 262164
114 #define OF_super_size 0 /* allocated nodes in the chain file */
115 #define OF_super_used 4 /* used nodes in the chain file */
116 #define OF_super_ngood 8 /* number of good messages scanned so far */
117 #define OF_super_nbad 12 /* number of bad messages scanned so far */
118 #define OF_super_mangle 16 /* used to mangle the MD5 input */
119 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
120 #define SIZEOF_entry 4
121 static char *super;
123 static size_t super_mmapped;
124 static size_t nodes_mmapped;
125 static int rw_map;
126 static int chained_tokens;
129 * Version history
130 * ---------------
131 * 0 Initial version
132 * 1 Fixed the mangling; it was ineffective in version 0.
133 * Hash extended to 56 bits.
135 static int table_version;
136 #define current_table_version 1
138 #define get(e) \
139 ((unsigned)(((char *)(e))[0]&0377) + \
140 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
141 ((unsigned)(((char *)(e))[2]&0377) << 16))
143 #define put(e, n) \
144 (((char *)(e))[0] = (n) & 0x0000ff, \
145 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
146 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
148 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
150 #define s2f(s) ((float)(s) / MAX3)
152 #define getn(p) \
153 ((unsigned long)(((char *)(p))[0]&0377) + \
154 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
155 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
156 ((unsigned long)(((char *)(p))[3]&0377) << 24))
158 #define putn(p, n) \
159 (((char *)(p))[0] = (n) & 0x000000ffUL, \
160 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
161 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
162 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
164 struct lexstat {
165 char *save;
166 int price;
167 int url;
168 int lastc;
169 int hadamp;
170 enum loc {
171 FROM_LINE = 0,
172 HEADER = 1,
173 BODY = 2
174 } loc;
175 enum html {
176 HTML_NONE = 0,
177 HTML_TEXT = 1,
178 HTML_TAG = 2,
179 HTML_SKIP = 3
180 } html;
181 char tag[8];
182 char *tagp;
183 char field[LINESIZE];
186 #define constituent(c, b, i, price, hadamp) \
187 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
188 (c) == '$' || (c) == '!' || (c) == '_' || \
189 (c) == '#' || (c) == '%' || (c) == '&' || \
190 ((c) == ';' && hadamp) || \
191 ((c) == '-' && !(price)) || \
192 (((c) == '.' || (c) == ',' || (c) == '/') && \
193 (i) > 0 && digitchar((b)[(i)-1]&0377)))
195 #define url_xchar(c) \
196 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
197 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
198 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
199 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
200 (c) != '#' && (c) != '"')
202 enum db {
203 SUPER = 0,
204 NODES = 1
207 enum entry {
208 GOOD = 0,
209 BAD = 1
212 static const char README1[] = "\
213 This is a junk mail database maintained by mailx(1). It does not contain any\n\
214 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
215 used for lookup. It is thus possible to tell if some given word was likely\n\
216 contained in your mail from examining this data, at best.\n";
217 static const char README2[] = "\n\
218 The database files are stored in compress(1) format by default. This saves\n\
219 some space, but leads to higher processor usage when the database is read\n\
220 or updated. You can use uncompress(1) on these files if you prefer to store\n\
221 them in flat form.\n";
223 static int verbose;
224 static int _debug;
225 static FILE *sfp, *nfp;
226 static char *sname, *nname;
228 static enum okay getdb(int rw);
229 static void putdb(void);
230 static void relsedb(void);
231 static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
232 static char *lookup(unsigned long h1, unsigned long h2, int create);
233 static unsigned long grow(unsigned long size);
234 static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
235 struct lexstat *sp, int *stop);
236 static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
237 static void add(const char *word, enum entry entry, struct lexstat *sp,
238 int incr);
239 static enum okay scan(struct message *m, enum entry entry,
240 void (*func)(const char *, enum entry, struct lexstat *, int),
241 int arg);
242 static void recompute(void);
243 static float getprob(char *n);
244 static int insert(int *msgvec, enum entry entry, int incr);
245 static void clsf(struct message *m);
246 static void rate(const char *word, enum entry entry, struct lexstat *sp,
247 int unused);
248 static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
249 static void mkmangle(void);
251 static enum okay
252 getdb(int rw)
254 void *zp = NULL;
255 long n;
256 int compressed;
258 chained_tokens = value("chained-junk-tokens") != NULL;
259 if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
260 return STOP;
261 if (sfp && !compressed) {
262 super = mmap(NULL, SIZEOF_super,
263 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
264 MAP_SHARED, fileno(sfp), 0);
265 if (super != MAP_FAILED) {
266 super_mmapped = SIZEOF_super;
267 goto skip;
270 super_mmapped = 0;
271 super = smalloc(SIZEOF_super);
272 if (sfp) {
273 if (compressed)
274 zp = zalloc(sfp);
275 if ((compressed ? zread(zp, super, SIZEOF_super)
276 != SIZEOF_super :
277 fread(super, 1, SIZEOF_super, sfp)
278 != SIZEOF_super) ||
279 ferror(sfp)) {
280 fprintf(stderr, "Error reading junk mail database.\n");
281 memset(super, 0, SIZEOF_super);
282 mkmangle();
283 if (compressed)
284 zfree(zp);
285 Fclose(sfp);
286 sfp = NULL;
287 } else if (compressed)
288 zfree(zp);
289 } else {
290 memset(super, 0, SIZEOF_super);
291 mkmangle();
293 skip: if ((n = getn(&super[OF_super_size])) == 0) {
294 n = 1;
295 putn(&super[OF_super_size], 1);
297 if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
298 if (nfp == (FILE *)-1) {
299 relsedb();
300 return STOP;
303 rw_map = rw;
304 if (sfp && nfp && !compressed) {
305 nodes = mmap(NULL, n * SIZEOF_node,
306 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
307 MAP_SHARED, fileno(nfp), 0);
308 if (nodes != MAP_FAILED) {
309 nodes_mmapped = n * SIZEOF_node;
310 return OKAY;
313 nodes_mmapped = 0;
314 nodes = smalloc(n * SIZEOF_node);
315 if (sfp && nfp) {
316 if (compressed)
317 zp = zalloc(nfp);
318 if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
319 != n * SIZEOF_node :
320 fread(nodes, 1, n * SIZEOF_node, nfp)
321 != n * SIZEOF_node) ||
322 ferror(nfp)) {
323 fprintf(stderr, "Error reading junk mail database.\n");
324 memset(nodes, 0, n * SIZEOF_node);
325 memset(super, 0, SIZEOF_super);
326 mkmangle();
327 putn(&super[OF_super_size], n);
329 if (compressed)
330 zfree(zp);
331 Fclose(nfp);
332 nfp = NULL;
333 } else
334 memset(nodes, 0, n * SIZEOF_node);
335 if (sfp) {
336 Fclose(sfp);
337 sfp = NULL;
339 return OKAY;
342 static void
343 putdb(void)
345 void *zp;
346 int scomp, ncomp;
348 if (!super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
349 == NULL || sfp == (FILE *)-1)
350 return;
351 if (!nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
352 == NULL || nfp == (FILE *)-1)
353 return;
354 if (super_mmapped == 0 || nodes_mmapped == 0)
355 holdint();
357 * Use utime() with mmap() since Linux does not update st_mtime
358 * reliably otherwise.
360 if (super_mmapped)
361 utime(sname, NULL);
362 else if (scomp) {
363 zp = zalloc(sfp);
364 zwrite(zp, super, SIZEOF_super);
365 zfree(zp);
366 trunc(sfp);
367 } else
368 fwrite(super, 1, SIZEOF_super, sfp);
369 if (nodes_mmapped)
370 utime(nname, NULL);
371 else if (ncomp) {
372 zp = zalloc(nfp);
373 zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
374 zfree(zp);
375 trunc(nfp);
376 } else
377 fwrite(nodes, 1,
378 getn(&super[OF_super_size]) * SIZEOF_node, nfp);
379 if (super_mmapped == 0 || nodes_mmapped == 0)
380 relseint();
383 static void
384 relsedb(void)
386 if (super_mmapped) {
387 munmap(super, super_mmapped);
388 super_mmapped = 0;
389 } else
390 free(super);
391 if (nodes_mmapped) {
392 munmap(nodes, nodes_mmapped);
393 nodes_mmapped = 0;
394 } else
395 free(nodes);
396 if (sfp && sfp != (FILE *)-1) {
397 Fclose(sfp);
398 sfp = NULL;
400 if (nfp && nfp != (FILE *)-1) {
401 Fclose(nfp);
402 nfp = NULL;
406 static FILE *
407 dbfp(enum db db, int rw, int *compressed, char **fn)
409 FILE *fp, *rp;
410 char *dir;
411 struct flock flp;
412 char *sfx[][2] = {
413 { "super", "nodes" },
414 { "super1", "nodes1" }
416 char **sf;
417 char *zfx[][2] = {
418 { "super.Z", "nodes.Z" },
419 { "super1.Z", "nodes1.Z" }
421 char **zf;
422 int n;
424 if ((dir = value("junkdb")) == NULL) {
425 fprintf(stderr, "No junk mail database specified. "
426 "Set the junkdb variable.\n");
427 return (FILE *)-1;
429 dir = expand(dir);
430 if (makedir(dir) == STOP) {
431 fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
432 return (FILE *)-1;
434 if (rw!=O_WRONLY)
435 table_version = current_table_version;
436 loop: sf = sfx[table_version];
437 zf = zfx[table_version];
438 *fn = salloc((n = strlen(dir)) + 40);
439 strcpy(*fn, dir);
440 (*fn)[n] = '/';
441 *compressed = 0;
442 strcpy(&(*fn)[n+1], sf[db]);
443 if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
444 goto okay;
445 *compressed = 1;
446 strcpy(&(*fn)[n+1], zf[db]);
447 if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
448 rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
449 fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
450 return NULL;
452 if (rw==O_WRONLY) {
453 strcpy(&(*fn)[n+1], "README");
454 if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
455 fputs(README1, rp);
456 fputs(README2, rp);
457 Fclose(rp);
459 } else if (fp == NULL) {
460 if (table_version > 0) {
461 table_version--;
462 goto loop;
463 } else
464 table_version = current_table_version;
466 okay: if (fp) {
467 flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
468 flp.l_start = 0;
469 flp.l_len = 0;
470 flp.l_whence = SEEK_SET;
471 fcntl(fileno(fp), F_SETLKW, &flp);
473 return fp;
476 static char *
477 lookup(unsigned long h1, unsigned long h2, int create)
479 char *n, *lastn = NULL;
480 unsigned long c, lastc = MAX4, used, size;
482 used = getn(&super[OF_super_used]);
483 size = getn(&super[OF_super_size]);
484 c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
485 n = &nodes[c*SIZEOF_node];
486 while (c < used) {
487 if (getn(&n[OF_node_hash]) == h1 &&
488 (table_version < 1 ? 1 :
489 get(&n[OF_node_hash2]) == h2))
490 return n;
491 lastc = c;
492 lastn = n;
493 c = ~getn(&n[OF_node_next]);
494 n = &nodes[c*SIZEOF_node];
496 if (create) {
497 if (used >= size) {
498 if ((size = grow(size)) == 0)
499 return NULL;
500 lastn = &nodes[lastc*SIZEOF_node];
502 putn(&super[OF_super_used], used+1);
503 n = &nodes[used*SIZEOF_node];
504 putn(&n[OF_node_hash], h1);
505 put(&n[OF_node_hash2], h2);
506 if (lastc < used)
507 putn(&lastn[OF_node_next], ~used);
508 else
509 putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
510 ~used);
511 return n;
512 } else
513 return NULL;
516 static unsigned long
517 grow(unsigned long size)
519 unsigned long incr, newsize;
520 void *onodes;
522 incr = size > MAX2 ? MAX2 : size;
523 newsize = size + incr;
524 if (newsize > MAX4-MAX2) {
525 oflo: fprintf(stderr, "Junk mail database overflow.\n");
526 return 0;
528 if (nodes_mmapped) {
529 if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
530 == (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
531 goto oflo;
532 onodes = nodes;
533 if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
534 MREMAP_MAYMOVE)) == MAP_FAILED) {
535 if ((nodes = mmap(NULL, newsize*SIZEOF_node,
536 rw_map!=O_RDONLY ?
537 PROT_READ|PROT_WRITE :
538 PROT_READ,
539 MAP_SHARED, fileno(nfp), 0))
540 == MAP_FAILED) {
541 nodes = onodes;
542 goto oflo;
544 munmap(onodes, nodes_mmapped);
546 nodes_mmapped = newsize*SIZEOF_node;
547 } else {
548 nodes = srealloc(nodes, newsize*SIZEOF_node);
549 memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
551 size = newsize;
552 putn(&super[OF_super_size], size);
553 return size;
556 #define SAVE(c) { \
557 if (i+j >= (long)*bufsize-4) \
558 *buf = srealloc(*buf, *bufsize += 32); \
559 (*buf)[j+i] = (c); \
560 i += (*buf)[j+i] != '\0'; \
563 static char *
564 nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
565 struct lexstat *sp, int *stop)
567 int c, i, j, k;
568 char *cp, *cq;
570 loop: *stop = 0;
571 sp->hadamp = 0;
572 if (sp->save) {
573 i = j = 0;
574 for (cp = sp->save; *cp; cp++) {
575 SAVE(*cp&0377)
577 SAVE('\0')
578 free(sp->save);
579 sp->save = NULL;
580 goto out;
582 if (sp->loc == FROM_LINE)
583 while (*count > 0 && (c = getc(fp)) != EOF) {
584 sp->lastc = c;
585 if (c == '\n') {
586 sp->loc = HEADER;
587 break;
590 i = 0;
591 j = 0;
592 if (sp->loc == HEADER && sp->field[0]) {
593 field: cp = sp->field;
594 do {
595 c = *cp&0377;
596 SAVE(c)
597 cp++;
598 } while (*cp);
599 j = i;
600 i = 0;
602 if (sp->price) {
603 sp->price = 0;
604 SAVE('$')
606 while (*count > 0 && (c = getc(fp)) != EOF) {
607 (*count)--;
608 if (c == '\0' && table_version >= 1) {
609 sp->loc = HEADER;
610 sp->lastc = '\n';
611 *stop = 1;
612 continue;
614 if (c == '\b' && table_version >= 1) {
615 sp->html = HTML_TEXT;
616 continue;
618 if (c == '<' && sp->html == HTML_TEXT) {
619 sp->html = HTML_TAG;
620 sp->tagp = sp->tag;
621 continue;
623 if (sp->html == HTML_TAG) {
624 if (spacechar(c)) {
625 *sp->tagp = '\0';
626 if (!asccasecmp(sp->tag, "a") ||
627 !asccasecmp(sp->tag, "img") ||
628 !asccasecmp(sp->tag, "font") ||
629 !asccasecmp(sp->tag, "span") ||
630 !asccasecmp(sp->tag, "meta") ||
631 !asccasecmp(sp->tag, "table") ||
632 !asccasecmp(sp->tag, "tr") ||
633 !asccasecmp(sp->tag, "td") ||
634 !asccasecmp(sp->tag, "p"))
635 sp->html = HTML_TEXT;
636 else
637 sp->html = HTML_SKIP;
638 } else if (c == '>') {
639 sp->html = HTML_TEXT;
640 continue;
641 } else {
642 if (sp->tagp - sp->tag < sizeof sp->tag - 1)
643 *sp->tagp++ = c;
644 continue;
647 if (sp->html == HTML_SKIP) {
648 if (c == '>')
649 sp->html = HTML_TEXT;
650 continue;
652 if (c == '$' && i == 0)
653 sp->price = 1;
654 if (sp->loc == HEADER && sp->lastc == '\n') {
655 if (!spacechar(c)) {
656 k = 0;
657 while (k < sizeof sp->field - 3) {
658 sp->field[k++] = c;
659 if (*count <= 0 ||
660 (c = getc(fp)) == EOF)
661 break;
662 if (spacechar(c) || c == ':') {
663 ungetc(c, fp);
664 break;
666 sp->lastc = c;
667 (*count)--;
669 sp->field[k++] = '*';
670 sp->field[k] = '\0';
671 j = 0;
672 *stop = 1;
673 goto field;
674 } else if (c == '\n') {
675 j = 0;
676 sp->loc = BODY;
677 sp->html = HTML_NONE;
678 *stop = 1;
681 if (sp->url) {
682 if (!url_xchar(c)) {
683 sp->url = 0;
684 cp = sp->save = smalloc(i+6);
685 for (cq = "HOST*"; *cq; cq++)
686 *cp++ = *cq;
687 for (cq = &(*buf)[j]; *cq != ':'; cq++);
688 cq += 3; /* skip "://" */
689 while (cq < &(*buf)[i+j] &&
690 (alnumchar(*cq&0377) ||
691 *cq == '.' || *cq == '-'))
692 *cp++ = *cq++;
693 *cp = '\0';
694 *stop = 1;
695 break;
697 SAVE(c)
698 } else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
699 sp->loc == HEADER && c == '.' &&
700 asccasecmp(sp->field, "subject*")) {
701 if (c == '&')
702 sp->hadamp = 1;
703 SAVE(c)
704 } else if (i > 0 && c == ':' && *count > 2) {
705 if ((c = getc(fp)) != '/') {
706 ungetc(c, fp);
707 break;
709 (*count)--;
710 if ((c = getc(fp)) != '/') {
711 ungetc(c, fp);
712 break;
714 (*count)--;
715 sp->url = 1;
716 SAVE('\0')
717 cp = savestr(*buf);
718 j = i = 0;
719 for (cq = "URL*"; *cq; cq++) {
720 SAVE(*cq&0377)
722 j = i;
723 i = 0;
724 do {
725 if (alnumchar(*cp&0377)) {
726 SAVE(*cp&0377)
727 } else
728 i = 0;
729 } while (*++cp);
730 for (cq = "://"; *cq; cq++) {
731 SAVE(*cq&0377)
733 } else if (i > 1 && ((*buf)[i+j-1] == ',' ||
734 (*buf)[i+j-1] == '.') && !digitchar(c)) {
735 i--;
736 ungetc(c, fp);
737 (*count)++;
738 break;
739 } else if (i > 0) {
740 sp->lastc = c;
741 break;
743 sp->lastc = c;
745 out: if (i > 0) {
746 SAVE('\0')
747 c = 0;
748 for (k = 0; k < i; k++)
749 if (digitchar((*buf)[k+j]&0377))
750 c++;
751 else if (!alphachar((*buf)[k+j]&0377) &&
752 (*buf)[k+j] != '$') {
753 c = 0;
754 break;
756 if (c == i)
757 goto loop;
759 * Including the results of other filtering software (the
760 * 'X-Spam' fields) might seem tempting, but will also rate
761 * their false negatives good with this filter. Therefore
762 * these fields are ignored.
764 * Handling 'Received' fields is difficult since they include
765 * lots of both useless and interesting words for our purposes.
767 if (sp->loc == HEADER &&
768 (asccasecmp(sp->field, "message-id*") == 0 ||
769 asccasecmp(sp->field, "references*") == 0 ||
770 asccasecmp(sp->field, "in-reply-to*") == 0 ||
771 asccasecmp(sp->field, "status*") == 0 ||
772 asccasecmp(sp->field, "x-status*") == 0 ||
773 asccasecmp(sp->field, "date*") == 0 ||
774 asccasecmp(sp->field, "delivery-date*") == 0 ||
775 ascncasecmp(sp->field, "x-spam", 6) == 0 ||
776 ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
777 ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
778 asccasecmp(sp->field, "received*") == 0 &&
779 ((2*c > i) || i < 4 ||
780 asccasestr(*buf, "localhost") != NULL)))
781 goto loop;
782 return *buf;
784 return NULL;
787 #define JOINCHECK if (i >= *bufsize) \
788 *buf = srealloc(*buf, *bufsize += 32)
789 static void
790 join(char **buf, size_t *bufsize, const char *s1, const char *s2)
792 int i = 0;
794 while (*s1) {
795 JOINCHECK;
796 (*buf)[i++] = *s1++;
798 JOINCHECK;
799 (*buf)[i++] = ' ';
800 do {
801 JOINCHECK;
802 (*buf)[i++] = *s2;
803 } while (*s2++);
806 /*ARGSUSED3*/
807 static void
808 add(const char *word, enum entry entry, struct lexstat *sp, int incr)
810 unsigned c;
811 unsigned long h1, h2;
812 char *n;
814 dbhash(word, &h1, &h2);
815 if ((n = lookup(h1, h2, 1)) != NULL) {
816 switch (entry) {
817 case GOOD:
818 c = get(&n[OF_node_good]);
819 if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
820 c += incr;
821 put(&n[OF_node_good], c);
823 break;
824 case BAD:
825 c = get(&n[OF_node_bad]);
826 if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
827 c += incr;
828 put(&n[OF_node_bad], c);
830 break;
835 static enum okay
836 scan(struct message *m, enum entry entry,
837 void (*func)(const char *, enum entry, struct lexstat *, int),
838 int arg)
840 FILE *fp;
841 char *buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
842 size_t bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
843 struct lexstat *sp;
844 int stop;
846 if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
847 perror("tempfile");
848 return STOP;
850 rm(cp);
851 Ftfree(&cp);
852 if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
853 Fclose(fp);
854 return STOP;
856 fflush(fp);
857 rewind(fp);
858 sp = scalloc(1, sizeof *sp);
859 count = fsize(fp);
860 bp = &buf0;
861 zp = &bufsize0;
862 while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
863 (*func)(*bp, entry, sp, arg);
864 if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
865 join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
866 (*func)(buf2, entry, sp, arg);
868 bp = bp == &buf1 ? &buf0 : &buf1;
869 zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
871 free(buf0);
872 free(buf1);
873 free(buf2);
874 free(sp);
875 Fclose(fp);
876 return OKAY;
879 static void
880 recompute(void)
882 unsigned long used, i;
883 unsigned s;
884 char *n;
885 float p;
887 used = getn(&super[OF_super_used]);
888 for (i = 0; i < used; i++) {
889 n = &nodes[i*SIZEOF_node];
890 p = getprob(n);
891 s = f2s(p);
892 put(&n[OF_node_prob_O], s);
896 static float
897 getprob(char *n)
899 unsigned long ngood, nbad;
900 unsigned g, b;
901 float p, BOT, TOP;
903 ngood = getn(&super[OF_super_ngood]);
904 nbad = getn(&super[OF_super_nbad]);
905 if (ngood + nbad >= 18000) {
906 BOT = .0001;
907 TOP = .9999;
908 } else if (ngood + nbad >= 9000) {
909 BOT = .001;
910 TOP = .999;
911 } else {
912 BOT = .01;
913 TOP = .99;
915 g = get(&n[OF_node_good]) * 2;
916 b = get(&n[OF_node_bad]);
917 if (g + b >= 5) {
918 p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
919 (smin(1.0, ngood ? (float)g/ngood : 0.0) +
920 smin(1.0, nbad ? (float)b/nbad : 0.0));
921 p = smin(TOP, p);
922 p = smax(BOT, p);
923 if (p == TOP && b <= 10 && g == 0)
924 p -= BOT;
925 else if (p == BOT && g <= 10 && b == 0)
926 p += BOT;
927 } else if (g == 0 && b == 0)
928 p = DFL;
929 else
930 p = 0;
931 return p;
934 static int
935 insert(int *msgvec, enum entry entry, int incr)
937 int *ip;
938 unsigned long u = 0;
940 verbose = value("verbose") != NULL;
941 if (getdb(O_RDWR) != OKAY)
942 return 1;
943 switch (entry) {
944 case GOOD:
945 u = getn(&super[OF_super_ngood]);
946 break;
947 case BAD:
948 u = getn(&super[OF_super_nbad]);
949 break;
951 for (ip = msgvec; *ip; ip++) {
952 setdot(&message[*ip-1]);
953 if (incr > 0 && u == MAX4-incr+1) {
954 fprintf(stderr, "Junk mail database overflow.\n");
955 break;
956 } else if (incr < 0 && -incr > u) {
957 fprintf(stderr, "Junk mail database underflow.\n");
958 break;
960 u += incr;
961 if (entry == GOOD && incr > 0 || entry == BAD && incr < 0)
962 message[*ip-1].m_flag &= ~MJUNK;
963 else
964 message[*ip-1].m_flag |= MJUNK;
965 scan(&message[*ip-1], entry, add, incr);
967 switch (entry) {
968 case GOOD:
969 putn(&super[OF_super_ngood], u);
970 break;
971 case BAD:
972 putn(&super[OF_super_nbad], u);
973 break;
975 if (table_version < 1)
976 recompute();
977 putdb();
978 relsedb();
979 return 0;
983 cgood(void *v)
985 return insert(v, GOOD, 1);
989 cjunk(void *v)
991 return insert(v, BAD, 1);
995 cungood(void *v)
997 return insert(v, GOOD, -1);
1001 cunjunk(void *v)
1003 return insert(v, BAD, -1);
1006 int
1007 cclassify(void *v)
1009 int *msgvec = v, *ip;
1011 verbose = value("verbose") != NULL;
1012 _debug = debug || value("debug") != NULL;
1013 if (getdb(O_RDONLY) != OKAY)
1014 return 1;
1015 for (ip = msgvec; *ip; ip++) {
1016 setdot(&message[*ip-1]);
1017 clsf(&message[*ip-1]);
1019 relsedb();
1020 return 0;
1023 #define BEST 15
1024 static struct {
1025 float dist;
1026 float prob;
1027 char *word;
1028 unsigned long hash1;
1029 unsigned long hash2;
1030 enum loc loc;
1031 } best[BEST];
1033 static void
1034 clsf(struct message *m)
1036 int i;
1037 float a = 1, b = 1, r;
1039 if (verbose)
1040 fprintf(stderr, "Examining message %d\n",
1041 (int)(m - &message[0] + 1));
1042 for (i = 0; i < BEST; i++) {
1043 best[i].dist = 0;
1044 best[i].prob = -1;
1046 if (scan(m, -1, rate, 0) != OKAY)
1047 return;
1048 if (best[0].prob == -1) {
1049 if (verbose)
1050 fprintf(stderr, "No information found.\n");
1051 m->m_flag &= ~MJUNK;
1052 return;
1054 for (i = 0; i < BEST; i++) {
1055 if (best[i].prob == -1)
1056 break;
1057 if (verbose)
1058 fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1059 "prob=%.4g dist=%.4g\n",
1060 i+1, prstr(best[i].word),
1061 best[i].hash1, best[i].hash2,
1062 best[i].prob, best[i].dist);
1063 a *= best[i].prob;
1064 b *= 1 - best[i].prob;
1066 r = a+b > 0 ? a / (a+b) : 0;
1067 if (verbose)
1068 fprintf(stderr, "Junk probability of message %d: %g\n",
1069 (int)(m - &message[0] + 1), r);
1070 if (r > THR)
1071 m->m_flag |= MJUNK;
1072 else
1073 m->m_flag &= ~MJUNK;
1076 /*ARGSUSED4*/
1077 static void
1078 rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1080 char *n;
1081 unsigned long h1, h2;
1082 float p, d;
1083 int i, j;
1085 dbhash(word, &h1, &h2);
1086 if ((n = lookup(h1, h2, 0)) != NULL) {
1087 p = getprob(n);
1088 } else
1089 p = DFL;
1090 if (_debug)
1091 fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1092 n ? get(&n[OF_node_good]) : 0,
1093 n ? get(&n[OF_node_bad]) : 0,
1094 p, prstr(word));
1095 if (p == 0)
1096 return;
1097 d = p >= MID ? p - MID : MID - p;
1098 if (d >= best[BEST-1].dist)
1099 for (i = 0; i < BEST; i++) {
1100 if (h1 == best[i].hash1 && h2 == best[i].hash2)
1101 break;
1103 * For equal distance, this selection prefers
1104 * words with a low probability, since a false
1105 * negative is better than a false positive,
1106 * and since experience has shown that false
1107 * positives are more likely otherwise. Then,
1108 * words from the end of the header and from
1109 * the start of the body are preferred. This
1110 * gives the most interesting verbose output.
1112 if (d > best[i].dist ||
1113 d == best[i].dist &&
1114 p < best[i].prob ||
1115 best[i].loc == HEADER &&
1116 d == best[i].dist) {
1117 for (j = BEST-2; j >= i; j--)
1118 best[j+1] = best[j];
1119 best[i].dist = d;
1120 best[i].prob = p;
1121 best[i].word = savestr(word);
1122 best[i].hash1 = h1;
1123 best[i].hash2 = h2;
1124 best[i].loc = sp->loc;
1125 break;
1130 static void
1131 dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1133 unsigned char digest[16];
1134 MD5_CTX ctx;
1136 MD5Init(&ctx);
1137 MD5Update(&ctx, (unsigned char *)word, strlen(word));
1138 if (table_version >= 1)
1139 MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1140 MD5Final(digest, &ctx);
1141 *h1 = getn(digest);
1142 if (table_version < 1) {
1143 *h1 ^= getn(&super[OF_super_mangle]);
1144 *h2 = 0;
1145 } else
1146 *h2 = get(&digest[4]);
1150 * The selection of the value for mangling is not critical. It is practically
1151 * impossible for any person to determine the exact time when the database
1152 * was created first (without looking at the database, which would reveal the
1153 * value anyway), so we just use this. The MD5 hash here ensures that each
1154 * single second gives a completely different mangling value (which is not
1155 * necessary anymore if table_version>=1, but does not hurt).
1157 static void
1158 mkmangle(void)
1160 union {
1161 time_t t;
1162 char c[16];
1163 } u;
1164 unsigned long s;
1165 unsigned char digest[16];
1166 MD5_CTX ctx;
1168 memset(&u, 0, sizeof u);
1169 time(&u.t);
1170 MD5Init(&ctx);
1171 MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1172 MD5Final(digest, &ctx);
1173 s = getn(digest);
1174 putn(&super[OF_super_mangle], s);
1178 cprobability(void *v)
1180 char **args = v;
1181 unsigned long used, ngood, nbad;
1182 unsigned long h1, h2;
1183 unsigned g, b;
1184 float p, d;
1185 char *n;
1187 if (*args == NULL) {
1188 fprintf(stderr, "No words given.\n");
1189 return 1;
1191 if (getdb(O_RDONLY) != OKAY)
1192 return 1;
1193 used = getn(&super[OF_super_used]);
1194 ngood = getn(&super[OF_super_ngood]);
1195 nbad = getn(&super[OF_super_nbad]);
1196 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1197 used, ngood, nbad);
1198 do {
1199 dbhash(*args, &h1, &h2);
1200 printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1201 if ((n = lookup(h1, h2, 0)) != NULL) {
1202 g = get(&n[OF_node_good]);
1203 b = get(&n[OF_node_bad]);
1204 printf("good=%u bad=%u ", g, b);
1205 p = getprob(n);
1206 if (p != 0) {
1207 d = p >= MID ? p - MID : MID - p;
1208 printf("prob=%.4g dist=%.4g", p, d);
1209 } else
1210 printf("too infrequent");
1211 } else
1212 printf("not in database");
1213 putchar('\n');
1214 } while (*++args);
1215 relsedb();
1216 return 0;