Tweak *rfc822-show-all* manual text
[s-mailx.git] / junk.c
blob617b56600ec8b0b893e05e9c2f059e34c68d0fd0
1 /*
2 * S-nail - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 * Copyright (c) 2012 Steffen "Daode" Nurpmeso.
6 */
7 /*
8 * Copyright (c) 2004
9 * Gunnar Ritter. All rights reserved.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by Gunnar Ritter
22 * and his contributors.
23 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
40 #include "config.h"
41 #include "rcv.h"
43 #ifdef USE_JUNK
44 #include <sys/stat.h>
45 #include <fcntl.h>
46 #include <limits.h>
47 #include <time.h>
48 #include <unistd.h>
49 #include <utime.h>
51 #ifdef HAVE_MMAP
52 #include <sys/mman.h>
53 #else /* !HAVE_MMAP */
54 #define mmap(a, b, c, d, e, f) MAP_FAILED
55 #define munmap(a, b)
56 #endif /* !HAVE_MMAP */
57 #ifndef HAVE_MREMAP
58 #define mremap(a, b, c, d) MAP_FAILED
59 #endif /* !HAVE_MREMAP */
61 #ifndef MAP_FAILED
62 #define MAP_FAILED ((void *)-1)
63 #endif /* !MAP_FAILED */
65 #include "extern.h"
66 #include "md5.h"
69 * Mail -- a mail program
71 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
72 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
73 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
75 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
76 * Classification using Chained Tokens", February 2004,
77 * <http://www.nuclearelephant.com/papers/chained.html>.
80 #define DFL .40
81 #define THR .9
82 #define MID .5
84 #define MAX2 0x0000ffff
85 #define MAX3 0x00ffffffUL
86 #define MAX4 0xffffffffUL
89 * The dictionary consists of two files forming a hash table. The hash
90 * consists of the first 56 bits of the result of applying MD5 to the
91 * input word. This scheme ensures that collisions are unlikely enough
92 * to make junk detection work; according to the birthday paradox, a
93 * 50 % probability for one single collision is reached at 2^28 entries.
95 * To make the chain structure independent from input, the MD5 input is
96 * xor'ed with a random number. This makes it impossible that someone uses
97 * a carefully crafted message for a denial-of-service attack against the
98 * database.
100 #define SIZEOF_node 17
101 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
102 #define OF_node_next 4 /* bit-negated table index of next node */
103 #define OF_node_good 8 /* number of times this appeared in good msgs */
104 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
105 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
106 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
107 static char *nodes;
109 #define SIZEOF_super 262164
110 #define OF_super_size 0 /* allocated nodes in the chain file */
111 #define OF_super_used 4 /* used nodes in the chain file */
112 #define OF_super_ngood 8 /* number of good messages scanned so far */
113 #define OF_super_nbad 12 /* number of bad messages scanned so far */
114 #define OF_super_mangle 16 /* used to mangle the MD5 input */
115 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
116 #define SIZEOF_entry 4
117 static char *super;
119 static size_t super_mmapped;
120 static size_t nodes_mmapped;
121 static int rw_map;
122 static int chained_tokens;
125 * Version history
126 * ---------------
127 * 0 Initial version
128 * 1 Fixed the mangling; it was ineffective in version 0.
129 * Hash extended to 56 bits.
131 static int table_version;
132 #define current_table_version 1
134 #define get(e) \
135 ((unsigned)(((char *)(e))[0]&0377) + \
136 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
137 ((unsigned)(((char *)(e))[2]&0377) << 16))
139 #define put(e, n) \
140 (((char *)(e))[0] = (n) & 0x0000ff, \
141 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
142 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
144 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
146 #define s2f(s) ((float)(s) / MAX3)
148 #define getn(p) \
149 ((unsigned long)(((char *)(p))[0]&0377) + \
150 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
151 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
152 ((unsigned long)(((char *)(p))[3]&0377) << 24))
154 #define putn(p, n) \
155 (((char *)(p))[0] = (n) & 0x000000ffUL, \
156 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
157 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
158 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
160 struct lexstat {
161 char *save;
162 int price;
163 int url;
164 int lastc;
165 int hadamp;
166 enum loc {
167 FROM_LINE = 0,
168 HEADER = 1,
169 BODY = 2
170 } loc;
171 enum html {
172 HTML_NONE = 0,
173 HTML_TEXT = 1,
174 HTML_TAG = 2,
175 HTML_SKIP = 3
176 } html;
177 char tag[8];
178 char *tagp;
179 char field[LINESIZE];
182 #define constituent(c, b, i, price, hadamp) \
183 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
184 (c) == '$' || (c) == '!' || (c) == '_' || \
185 (c) == '#' || (c) == '%' || (c) == '&' || \
186 ((c) == ';' && hadamp) || \
187 ((c) == '-' && !(price)) || \
188 (((c) == '.' || (c) == ',' || (c) == '/') && \
189 (i) > 0 && digitchar((b)[(i)-1]&0377)))
191 #define url_xchar(c) \
192 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
193 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
194 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
195 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
196 (c) != '#' && (c) != '"')
198 enum db {
199 SUPER = 0,
200 NODES = 1
203 enum entry {
204 GOOD = 0,
205 BAD = 1
208 static const char README1[] = "\
209 This is a junk mail database maintained by mailx(1). It does not contain any\n\
210 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
211 used for lookup. It is thus possible to tell if some given word was likely\n\
212 contained in your mail from examining this data, at best.\n";
213 static const char README2[] = "\n\
214 The database files are stored in compress(1) format by default. This saves\n\
215 some space, but leads to higher processor usage when the database is read\n\
216 or updated. You can use uncompress(1) on these files if you prefer to store\n\
217 them in flat form.\n";
219 static int verbose;
220 static int _debug;
221 static FILE *sfp, *nfp;
222 static char *sname, *nname;
224 static enum okay getdb(int rw);
225 static void putdb(void);
226 static void relsedb(void);
227 static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
228 static char *lookup(unsigned long h1, unsigned long h2, int create);
229 static unsigned long grow(unsigned long size);
230 static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
231 struct lexstat *sp, int *stop);
232 static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
233 static void add(const char *word, enum entry entry, struct lexstat *sp,
234 int incr);
235 static enum okay scan(struct message *m, enum entry entry,
236 void (*func)(const char *, enum entry, struct lexstat *, int),
237 int arg);
238 static void recompute(void);
239 static float getprob(char *n);
240 static int insert(int *msgvec, enum entry entry, int incr);
241 static void clsf(struct message *m);
242 static void rate(const char *word, enum entry entry, struct lexstat *sp,
243 int unused);
244 static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
245 static void mkmangle(void);
247 static enum okay
248 getdb(int rw)
250 void *zp = NULL;
251 long n;
252 int compressed;
254 chained_tokens = value("chained-junk-tokens") != NULL;
255 if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
256 return STOP;
257 if (sfp && !compressed) {
258 super = mmap(NULL, SIZEOF_super,
259 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
260 MAP_SHARED, fileno(sfp), 0);
261 if (super != MAP_FAILED) {
262 super_mmapped = SIZEOF_super;
263 goto skip;
266 super_mmapped = 0;
267 super = smalloc(SIZEOF_super);
268 if (sfp) {
269 if (compressed)
270 zp = zalloc(sfp);
271 if ((compressed ? zread(zp, super, SIZEOF_super)
272 != SIZEOF_super :
273 fread(super, 1, SIZEOF_super, sfp)
274 != SIZEOF_super) ||
275 ferror(sfp)) {
276 fprintf(stderr, "Error reading junk mail database.\n");
277 memset(super, 0, SIZEOF_super);
278 mkmangle();
279 if (compressed)
280 zfree(zp);
281 Fclose(sfp);
282 sfp = NULL;
283 } else if (compressed)
284 zfree(zp);
285 } else {
286 memset(super, 0, SIZEOF_super);
287 mkmangle();
289 skip: if ((n = getn(&super[OF_super_size])) == 0) {
290 n = 1;
291 putn(&super[OF_super_size], 1);
293 if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
294 if (nfp == (FILE *)-1) {
295 relsedb();
296 return STOP;
299 rw_map = rw;
300 if (sfp && nfp && !compressed) {
301 nodes = mmap(NULL, n * SIZEOF_node,
302 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
303 MAP_SHARED, fileno(nfp), 0);
304 if (nodes != MAP_FAILED) {
305 nodes_mmapped = n * SIZEOF_node;
306 return OKAY;
309 nodes_mmapped = 0;
310 nodes = smalloc(n * SIZEOF_node);
311 if (sfp && nfp) {
312 if (compressed)
313 zp = zalloc(nfp);
314 if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
315 != n * SIZEOF_node :
316 fread(nodes, 1, n * SIZEOF_node, nfp)
317 != (unsigned long)n * SIZEOF_node) ||
318 ferror(nfp)) {
319 fprintf(stderr, "Error reading junk mail database.\n");
320 memset(nodes, 0, n * SIZEOF_node);
321 memset(super, 0, SIZEOF_super);
322 mkmangle();
323 putn(&super[OF_super_size], n);
325 if (compressed)
326 zfree(zp);
327 Fclose(nfp);
328 nfp = NULL;
329 } else
330 memset(nodes, 0, n * SIZEOF_node);
331 if (sfp) {
332 Fclose(sfp);
333 sfp = NULL;
335 return OKAY;
338 static void
339 putdb(void)
341 void *zp;
342 int scomp, ncomp;
344 if ((! super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
345 == NULL) || sfp == (FILE *)-1)
346 return;
347 if ((! nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
348 == NULL) || nfp == (FILE *)-1)
349 return;
350 if (super_mmapped == 0 || nodes_mmapped == 0)
351 holdint();
353 * Use utime() with mmap() since Linux does not update st_mtime
354 * reliably otherwise.
356 if (super_mmapped)
357 utime(sname, NULL);
358 else if (scomp) {
359 zp = zalloc(sfp);
360 zwrite(zp, super, SIZEOF_super);
361 zfree(zp);
362 trunc(sfp);
363 } else
364 fwrite(super, 1, SIZEOF_super, sfp);
365 if (nodes_mmapped)
366 utime(nname, NULL);
367 else if (ncomp) {
368 zp = zalloc(nfp);
369 zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
370 zfree(zp);
371 trunc(nfp);
372 } else
373 fwrite(nodes, 1,
374 getn(&super[OF_super_size]) * SIZEOF_node, nfp);
375 if (super_mmapped == 0 || nodes_mmapped == 0)
376 relseint();
379 static void
380 relsedb(void)
382 if (super_mmapped) {
383 munmap(super, super_mmapped);
384 super_mmapped = 0;
385 } else
386 free(super);
387 if (nodes_mmapped) {
388 munmap(nodes, nodes_mmapped);
389 nodes_mmapped = 0;
390 } else
391 free(nodes);
392 if (sfp && sfp != (FILE *)-1) {
393 Fclose(sfp);
394 sfp = NULL;
396 if (nfp && nfp != (FILE *)-1) {
397 Fclose(nfp);
398 nfp = NULL;
402 static FILE *
403 dbfp(enum db db, int rw, int *compressed, char **fn)
405 FILE *fp, *rp;
406 char *dir;
407 struct flock flp;
408 char *sfx[][2] = {
409 { "super", "nodes" },
410 { "super1", "nodes1" }
412 char **sf;
413 char *zfx[][2] = {
414 { "super.Z", "nodes.Z" },
415 { "super1.Z", "nodes1.Z" }
417 char **zf;
418 int n;
420 if ((dir = value("junkdb")) == NULL) {
421 fprintf(stderr, "No junk mail database specified. "
422 "Set the junkdb variable.\n");
423 return (FILE *)-1;
425 dir = file_expand(dir);
426 if (makedir(dir) == STOP) {
427 fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
428 return (FILE *)-1;
430 if (rw!=O_WRONLY)
431 table_version = current_table_version;
432 loop: sf = sfx[table_version];
433 zf = zfx[table_version];
434 *fn = salloc((n = strlen(dir)) + 40);
435 strcpy(*fn, dir);
436 (*fn)[n] = '/';
437 *compressed = 0;
438 strcpy(&(*fn)[n+1], sf[db]);
439 if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
440 goto okay;
441 *compressed = 1;
442 strcpy(&(*fn)[n+1], zf[db]);
443 if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
444 rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
445 fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
446 return NULL;
448 if (rw==O_WRONLY) {
449 strcpy(&(*fn)[n+1], "README");
450 if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
451 fputs(README1, rp);
452 fputs(README2, rp);
453 Fclose(rp);
455 } else if (fp == NULL) {
456 if (table_version > 0) {
457 table_version--;
458 goto loop;
459 } else
460 table_version = current_table_version;
462 okay: if (fp) {
463 flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
464 flp.l_start = 0;
465 flp.l_len = 0;
466 flp.l_whence = SEEK_SET;
467 fcntl(fileno(fp), F_SETLKW, &flp);
469 return fp;
472 static char *
473 lookup(unsigned long h1, unsigned long h2, int create)
475 char *n, *lastn = NULL;
476 unsigned long c, lastc = MAX4, used, size;
478 used = getn(&super[OF_super_used]);
479 size = getn(&super[OF_super_size]);
480 c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
481 n = &nodes[c*SIZEOF_node];
482 while (c < used) {
483 if (getn(&n[OF_node_hash]) == h1 &&
484 (table_version < 1 ? 1 :
485 get(&n[OF_node_hash2]) == h2))
486 return n;
487 lastc = c;
488 lastn = n;
489 c = ~getn(&n[OF_node_next]);
490 n = &nodes[c*SIZEOF_node];
492 if (create) {
493 if (used >= size) {
494 if ((size = grow(size)) == 0)
495 return NULL;
496 lastn = &nodes[lastc*SIZEOF_node];
498 putn(&super[OF_super_used], used+1);
499 n = &nodes[used*SIZEOF_node];
500 putn(&n[OF_node_hash], h1);
501 put(&n[OF_node_hash2], h2);
502 if (lastc < used)
503 putn(&lastn[OF_node_next], ~used);
504 else
505 putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
506 ~used);
507 return n;
508 } else
509 return NULL;
512 static unsigned long
513 grow(unsigned long size)
515 unsigned long incr, newsize;
516 void *onodes;
518 incr = size > MAX2 ? MAX2 : size;
519 newsize = size + incr;
520 if (newsize > MAX4-MAX2) {
521 oflo: fprintf(stderr, "Junk mail database overflow.\n");
522 return 0;
524 if (nodes_mmapped) {
525 if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
526 == (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
527 goto oflo;
528 onodes = nodes;
529 if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
530 MREMAP_MAYMOVE)) == MAP_FAILED) {
531 if ((nodes = mmap(NULL, newsize*SIZEOF_node,
532 rw_map!=O_RDONLY ?
533 PROT_READ|PROT_WRITE :
534 PROT_READ,
535 MAP_SHARED, fileno(nfp), 0))
536 == MAP_FAILED) {
537 nodes = onodes;
538 goto oflo;
540 munmap(onodes, nodes_mmapped);
542 nodes_mmapped = newsize*SIZEOF_node;
543 } else {
544 nodes = srealloc(nodes, newsize*SIZEOF_node);
545 memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
547 size = newsize;
548 putn(&super[OF_super_size], size);
549 return size;
552 #define SAVE(c) { \
553 if (i+j >= (long)*bufsize-4) \
554 *buf = srealloc(*buf, *bufsize += 32); \
555 (*buf)[j+i] = (c); \
556 i += (*buf)[j+i] != '\0'; \
559 static char *
560 nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
561 struct lexstat *sp, int *stop)
563 int c, i, j, k;
564 char *cp, *cq;
566 loop: *stop = 0;
567 sp->hadamp = 0;
568 if (sp->save) {
569 i = j = 0;
570 for (cp = sp->save; *cp; cp++) {
571 SAVE(*cp&0377)
573 SAVE('\0')
574 free(sp->save);
575 sp->save = NULL;
576 goto out;
578 if (sp->loc == FROM_LINE)
579 while (*count > 0 && (c = getc(fp)) != EOF) {
580 sp->lastc = c;
581 if (c == '\n') {
582 sp->loc = HEADER;
583 break;
586 i = 0;
587 j = 0;
588 if (sp->loc == HEADER && sp->field[0]) {
589 field: cp = sp->field;
590 do {
591 c = *cp&0377;
592 SAVE(c)
593 cp++;
594 } while (*cp);
595 j = i;
596 i = 0;
598 if (sp->price) {
599 sp->price = 0;
600 SAVE('$')
602 while (*count > 0 && (c = getc(fp)) != EOF) {
603 (*count)--;
604 if (c == '\0' && table_version >= 1) {
605 sp->loc = HEADER;
606 sp->lastc = '\n';
607 *stop = 1;
608 continue;
610 if (c == '\b' && table_version >= 1) {
611 sp->html = HTML_TEXT;
612 continue;
614 if (c == '<' && sp->html == HTML_TEXT) {
615 sp->html = HTML_TAG;
616 sp->tagp = sp->tag;
617 continue;
619 if (sp->html == HTML_TAG) {
620 if (spacechar(c)) {
621 *sp->tagp = '\0';
622 if (!asccasecmp(sp->tag, "a") ||
623 !asccasecmp(sp->tag, "img") ||
624 !asccasecmp(sp->tag, "font") ||
625 !asccasecmp(sp->tag, "span") ||
626 !asccasecmp(sp->tag, "meta") ||
627 !asccasecmp(sp->tag, "table") ||
628 !asccasecmp(sp->tag, "tr") ||
629 !asccasecmp(sp->tag, "td") ||
630 !asccasecmp(sp->tag, "p"))
631 sp->html = HTML_TEXT;
632 else
633 sp->html = HTML_SKIP;
634 } else if (c == '>') {
635 sp->html = HTML_TEXT;
636 continue;
637 } else {
638 if ((size_t)(sp->tagp - sp->tag) <
639 sizeof sp->tag - 1)
640 *sp->tagp++ = c;
641 continue;
644 if (sp->html == HTML_SKIP) {
645 if (c == '>')
646 sp->html = HTML_TEXT;
647 continue;
649 if (c == '$' && i == 0)
650 sp->price = 1;
651 if (sp->loc == HEADER && sp->lastc == '\n') {
652 if (!spacechar(c)) {
653 k = 0;
654 while (k < (int)sizeof sp->field - 3) {
655 sp->field[k++] = c;
656 if (*count <= 0 ||
657 (c = getc(fp)) == EOF)
658 break;
659 if (spacechar(c) || c == ':') {
660 ungetc(c, fp);
661 break;
663 sp->lastc = c;
664 (*count)--;
666 sp->field[k++] = '*';
667 sp->field[k] = '\0';
668 j = 0;
669 *stop = 1;
670 goto field;
671 } else if (c == '\n') {
672 j = 0;
673 sp->loc = BODY;
674 sp->html = HTML_NONE;
675 *stop = 1;
678 if (sp->url) {
679 if (!url_xchar(c)) {
680 sp->url = 0;
681 cp = sp->save = smalloc(i+6);
682 for (cq = "HOST*"; *cq; cq++)
683 *cp++ = *cq;
684 for (cq = &(*buf)[j]; *cq != ':'; cq++);
685 cq += 3; /* skip "://" */
686 while (cq < &(*buf)[i+j] &&
687 (alnumchar(*cq&0377) ||
688 *cq == '.' || *cq == '-'))
689 *cp++ = *cq++;
690 *cp = '\0';
691 *stop = 1;
692 break;
694 SAVE(c)
695 } else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
696 (sp->loc == HEADER && c == '.' &&
697 asccasecmp(sp->field, "subject*"))) {
698 if (c == '&')
699 sp->hadamp = 1;
700 SAVE(c)
701 } else if (i > 0 && c == ':' && *count > 2) {
702 if ((c = getc(fp)) != '/') {
703 ungetc(c, fp);
704 break;
706 (*count)--;
707 if ((c = getc(fp)) != '/') {
708 ungetc(c, fp);
709 break;
711 (*count)--;
712 sp->url = 1;
713 SAVE('\0')
714 cp = savestr(*buf);
715 j = i = 0;
716 for (cq = "URL*"; *cq; cq++) {
717 SAVE(*cq&0377)
719 j = i;
720 i = 0;
721 do {
722 if (alnumchar(*cp&0377)) {
723 SAVE(*cp&0377)
724 } else
725 i = 0;
726 } while (*++cp);
727 for (cq = "://"; *cq; cq++) {
728 SAVE(*cq&0377)
730 } else if (i > 1 && ((*buf)[i+j-1] == ',' ||
731 (*buf)[i+j-1] == '.') && !digitchar(c)) {
732 i--;
733 ungetc(c, fp);
734 (*count)++;
735 break;
736 } else if (i > 0) {
737 sp->lastc = c;
738 break;
740 sp->lastc = c;
742 out: if (i > 0) {
743 SAVE('\0')
744 c = 0;
745 for (k = 0; k < i; k++)
746 if (digitchar((*buf)[k+j]&0377))
747 c++;
748 else if (!alphachar((*buf)[k+j]&0377) &&
749 (*buf)[k+j] != '$') {
750 c = 0;
751 break;
753 if (c == i)
754 goto loop;
756 * Including the results of other filtering software (the
757 * 'X-Spam' fields) might seem tempting, but will also rate
758 * their false negatives good with this filter. Therefore
759 * these fields are ignored.
761 * Handling 'Received' fields is difficult since they include
762 * lots of both useless and interesting words for our purposes.
764 if (sp->loc == HEADER &&
765 (asccasecmp(sp->field, "message-id*") == 0 ||
766 asccasecmp(sp->field, "references*") == 0 ||
767 asccasecmp(sp->field, "in-reply-to*") == 0 ||
768 asccasecmp(sp->field, "status*") == 0 ||
769 asccasecmp(sp->field, "x-status*") == 0 ||
770 asccasecmp(sp->field, "date*") == 0 ||
771 asccasecmp(sp->field, "delivery-date*") == 0 ||
772 ascncasecmp(sp->field, "x-spam", 6) == 0 ||
773 ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
774 ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
775 (asccasecmp(sp->field, "received*") == 0 &&
776 (((2*c > i) || i < 4 ||
777 asccasestr(*buf, "localhost")!=NULL)))))
778 goto loop;
779 return *buf;
781 return NULL;
784 #define JOINCHECK if ((size_t)i >= *bufsize) \
785 *buf = srealloc(*buf, *bufsize += 32)
786 static void
787 join(char **buf, size_t *bufsize, const char *s1, const char *s2)
789 int i = 0;
791 while (*s1) {
792 JOINCHECK;
793 (*buf)[i++] = *s1++;
795 JOINCHECK;
796 (*buf)[i++] = ' ';
797 do {
798 JOINCHECK;
799 (*buf)[i++] = *s2;
800 } while (*s2++);
803 /*ARGSUSED3*/
804 static void
805 add(const char *word, enum entry entry, struct lexstat *sp, int incr)
807 unsigned c;
808 unsigned long h1, h2;
809 char *n;
810 (void)sp;
812 dbhash(word, &h1, &h2);
813 if ((n = lookup(h1, h2, 1)) != NULL) {
814 switch (entry) {
815 case GOOD:
816 c = get(&n[OF_node_good]);
817 if ((incr > 0 && c < (unsigned)MAX3 - incr) ||
818 (incr < 0 && c >= (unsigned)-incr)) {
819 c += incr;
820 put(&n[OF_node_good], c);
822 break;
823 case BAD:
824 c = get(&n[OF_node_bad]);
825 if ((incr > 0 && c < (unsigned)MAX3 - incr) ||
826 (incr < 0 && c >= (unsigned)-incr)) {
827 c += incr;
828 put(&n[OF_node_bad], c);
830 break;
835 static enum okay
836 scan(struct message *m, enum entry entry,
837 void (*func)(const char *, enum entry, struct lexstat *, int),
838 int arg)
840 FILE *fp;
841 char *buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
842 size_t bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
843 struct lexstat *sp;
844 int stop;
846 if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
847 perror("tempfile");
848 return STOP;
850 rm(cp);
851 Ftfree(&cp);
852 if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
853 Fclose(fp);
854 return STOP;
856 fflush(fp);
857 rewind(fp);
858 sp = scalloc(1, sizeof *sp);
859 count = fsize(fp);
860 bp = &buf0;
861 zp = &bufsize0;
862 while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
863 (*func)(*bp, entry, sp, arg);
864 if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
865 join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
866 (*func)(buf2, entry, sp, arg);
868 bp = bp == &buf1 ? &buf0 : &buf1;
869 zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
871 free(buf0);
872 free(buf1);
873 free(buf2);
874 free(sp);
875 Fclose(fp);
876 return OKAY;
879 static void
880 recompute(void)
882 unsigned long used, i;
883 unsigned s;
884 char *n;
885 float p;
887 used = getn(&super[OF_super_used]);
888 for (i = 0; i < used; i++) {
889 n = &nodes[i*SIZEOF_node];
890 p = getprob(n);
891 s = f2s(p);
892 put(&n[OF_node_prob_O], s);
896 static float
897 getprob(char *n)
899 unsigned long ngood, nbad;
900 unsigned g, b;
901 float p, BOT, TOP;
903 ngood = getn(&super[OF_super_ngood]);
904 nbad = getn(&super[OF_super_nbad]);
905 if (ngood + nbad >= 18000) {
906 BOT = .0001;
907 TOP = .9999;
908 } else if (ngood + nbad >= 9000) {
909 BOT = .001;
910 TOP = .999;
911 } else {
912 BOT = .01;
913 TOP = .99;
915 g = get(&n[OF_node_good]) * 2;
916 b = get(&n[OF_node_bad]);
917 if (g + b >= 5) {
918 p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
919 (smin(1.0, ngood ? (float)g/ngood : 0.0) +
920 smin(1.0, nbad ? (float)b/nbad : 0.0));
921 p = smin(TOP, p);
922 p = smax(BOT, p);
923 if (p == TOP && b <= 10 && g == 0)
924 p -= BOT;
925 else if (p == BOT && g <= 10 && b == 0)
926 p += BOT;
927 } else if (g == 0 && b == 0)
928 p = DFL;
929 else
930 p = 0;
931 return p;
934 static int
935 insert(int *msgvec, enum entry entry, int incr)
937 int *ip;
938 unsigned long u = 0;
940 verbose = value("verbose") != NULL;
941 if (getdb(O_RDWR) != OKAY)
942 return 1;
943 switch (entry) {
944 case GOOD:
945 u = getn(&super[OF_super_ngood]);
946 break;
947 case BAD:
948 u = getn(&super[OF_super_nbad]);
949 break;
951 for (ip = msgvec; *ip; ip++) {
952 setdot(&message[*ip-1]);
953 if (incr > 0 && u == MAX4-incr+1) {
954 fprintf(stderr, "Junk mail database overflow.\n");
955 break;
956 } else if (incr < 0 && (unsigned long)-incr > u) {
957 fprintf(stderr, "Junk mail database underflow.\n");
958 break;
960 u += incr;
961 if ((entry == GOOD && incr > 0) || (entry == BAD && incr < 0))
962 message[*ip-1].m_flag &= ~MJUNK;
963 else
964 message[*ip-1].m_flag |= MJUNK;
965 scan(&message[*ip-1], entry, add, incr);
967 switch (entry) {
968 case GOOD:
969 putn(&super[OF_super_ngood], u);
970 break;
971 case BAD:
972 putn(&super[OF_super_nbad], u);
973 break;
975 if (table_version < 1)
976 recompute();
977 putdb();
978 relsedb();
979 return 0;
983 cgood(void *v)
985 return insert(v, GOOD, 1);
989 cjunk(void *v)
991 return insert(v, BAD, 1);
995 cungood(void *v)
997 return insert(v, GOOD, -1);
1001 cunjunk(void *v)
1003 return insert(v, BAD, -1);
1006 int
1007 cclassify(void *v)
1009 int *msgvec = v, *ip;
1011 verbose = value("verbose") != NULL;
1012 _debug = debug || value("debug") != NULL;
1013 if (getdb(O_RDONLY) != OKAY)
1014 return 1;
1015 for (ip = msgvec; *ip; ip++) {
1016 setdot(&message[*ip-1]);
1017 clsf(&message[*ip-1]);
1019 relsedb();
1020 return 0;
1023 #define BEST 15
1024 static struct {
1025 float dist;
1026 float prob;
1027 char *word;
1028 unsigned long hash1;
1029 unsigned long hash2;
1030 enum loc loc;
1031 } best[BEST];
1033 static void
1034 clsf(struct message *m)
1036 int i;
1037 float a = 1, b = 1, r;
1039 if (verbose)
1040 fprintf(stderr, "Examining message %d\n",
1041 (int)(m - &message[0] + 1));
1042 for (i = 0; i < BEST; i++) {
1043 best[i].dist = 0;
1044 best[i].prob = -1;
1046 if (scan(m, -1, rate, 0) != OKAY)
1047 return;
1048 if (best[0].prob == -1) {
1049 if (verbose)
1050 fprintf(stderr, "No information found.\n");
1051 m->m_flag &= ~MJUNK;
1052 return;
1054 for (i = 0; i < BEST; i++) {
1055 if (best[i].prob == -1)
1056 break;
1057 if (verbose)
1058 fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1059 "prob=%.4g dist=%.4g\n",
1060 i+1, prstr(best[i].word),
1061 best[i].hash1, best[i].hash2,
1062 best[i].prob, best[i].dist);
1063 a *= best[i].prob;
1064 b *= 1 - best[i].prob;
1066 r = a+b > 0 ? a / (a+b) : 0;
1067 if (verbose)
1068 fprintf(stderr, "Junk probability of message %d: %g\n",
1069 (int)(m - &message[0] + 1), r);
1070 if (r > THR)
1071 m->m_flag |= MJUNK;
1072 else
1073 m->m_flag &= ~MJUNK;
1076 /*ARGSUSED4*/
1077 static void
1078 rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1080 char *n;
1081 unsigned long h1, h2;
1082 float p, d;
1083 int i, j;
1084 (void)entry;
1085 (void)unused;
1087 dbhash(word, &h1, &h2);
1088 if ((n = lookup(h1, h2, 0)) != NULL) {
1089 p = getprob(n);
1090 } else
1091 p = DFL;
1092 if (_debug)
1093 fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1094 n ? get(&n[OF_node_good]) : 0,
1095 n ? get(&n[OF_node_bad]) : 0,
1096 p, prstr(word));
1097 if (p == 0)
1098 return;
1099 d = p >= MID ? p - MID : MID - p;
1100 if (d >= best[BEST-1].dist)
1101 for (i = 0; i < BEST; i++) {
1102 if (h1 == best[i].hash1 && h2 == best[i].hash2)
1103 break;
1105 * For equal distance, this selection prefers
1106 * words with a low probability, since a false
1107 * negative is better than a false positive,
1108 * and since experience has shown that false
1109 * positives are more likely otherwise. Then,
1110 * words from the end of the header and from
1111 * the start of the body are preferred. This
1112 * gives the most interesting verbose output.
1114 if (d > best[i].dist ||
1115 d == (best[i].dist &&
1116 p < best[i].prob) ||
1117 (best[i].loc == HEADER &&
1118 d == best[i].dist)) {
1119 for (j = BEST-2; j >= i; j--)
1120 best[j+1] = best[j];
1121 best[i].dist = d;
1122 best[i].prob = p;
1123 best[i].word = savestr(word);
1124 best[i].hash1 = h1;
1125 best[i].hash2 = h2;
1126 best[i].loc = sp->loc;
1127 break;
1132 static void
1133 dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1135 unsigned char digest[16];
1136 MD5_CTX ctx;
1138 MD5Init(&ctx);
1139 MD5Update(&ctx, (unsigned char *)word, strlen(word));
1140 if (table_version >= 1)
1141 MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1142 MD5Final(digest, &ctx);
1143 *h1 = getn(digest);
1144 if (table_version < 1) {
1145 *h1 ^= getn(&super[OF_super_mangle]);
1146 *h2 = 0;
1147 } else
1148 *h2 = get(&digest[4]);
1152 * The selection of the value for mangling is not critical. It is practically
1153 * impossible for any person to determine the exact time when the database
1154 * was created first (without looking at the database, which would reveal the
1155 * value anyway), so we just use this. The MD5 hash here ensures that each
1156 * single second gives a completely different mangling value (which is not
1157 * necessary anymore if table_version>=1, but does not hurt).
1159 static void
1160 mkmangle(void)
1162 union {
1163 time_t t;
1164 char c[16];
1165 } u;
1166 unsigned long s;
1167 unsigned char digest[16];
1168 MD5_CTX ctx;
1170 memset(&u, 0, sizeof u);
1171 time(&u.t);
1172 MD5Init(&ctx);
1173 MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1174 MD5Final(digest, &ctx);
1175 s = getn(digest);
1176 putn(&super[OF_super_mangle], s);
1180 cprobability(void *v)
1182 char **args = v;
1183 unsigned long used, ngood, nbad;
1184 unsigned long h1, h2;
1185 unsigned g, b;
1186 float p, d;
1187 char *n;
1189 if (*args == NULL) {
1190 fprintf(stderr, "No words given.\n");
1191 return 1;
1193 if (getdb(O_RDONLY) != OKAY)
1194 return 1;
1195 used = getn(&super[OF_super_used]);
1196 ngood = getn(&super[OF_super_ngood]);
1197 nbad = getn(&super[OF_super_nbad]);
1198 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1199 used, ngood, nbad);
1200 do {
1201 dbhash(*args, &h1, &h2);
1202 printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1203 if ((n = lookup(h1, h2, 0)) != NULL) {
1204 g = get(&n[OF_node_good]);
1205 b = get(&n[OF_node_bad]);
1206 printf("good=%u bad=%u ", g, b);
1207 p = getprob(n);
1208 if (p != 0) {
1209 d = p >= MID ? p - MID : MID - p;
1210 printf("prob=%.4g dist=%.4g", p, d);
1211 } else
1212 printf("too infrequent");
1213 } else
1214 printf("not in database");
1215 putchar('\n');
1216 } while (*++args);
1217 relsedb();
1218 return 0;
1221 #else /* !USE_JUNK */
1223 static int
1224 nojunk(void)
1226 fputs(catgets(catd, CATSET, 270, "No JUNK support compiled in.\n"),
1227 stderr);
1228 return (1);
1232 cgood(void *v)
1234 (void)v;
1235 return nojunk();
1239 cjunk(void *v)
1241 (void)v;
1242 return nojunk();
1246 cungood(void *v)
1248 (void)v;
1249 return nojunk();
1253 cunjunk(void *v)
1255 (void)v;
1256 return nojunk();
1260 cclassify(void *v)
1262 (void)v;
1263 return nojunk();
1267 cprobability(void *v)
1269 (void)v;
1270 return nojunk();
1273 #endif /* USE_JUNK */