Support CONFIG-sets for make(1) invocation..
[s-mailx.git] / junk.c
blob481c584b9fe60e937d04a26b5491ef5e5b8d205b
1 /*
2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 */
6 /*
7 * Copyright (c) 2004
8 * Gunnar Ritter. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by Gunnar Ritter
21 * and his contributors.
22 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
39 #ifndef lint
40 #ifdef DOSCCS
41 static char sccsid[] = "@(#)junk.c 1.75 (gritter) 9/14/08";
42 #endif
43 #endif /* not lint */
45 #include "config.h"
46 #include "rcv.h"
48 #ifdef USE_JUNK
49 #include <sys/stat.h>
50 #include <fcntl.h>
51 #include <limits.h>
52 #include <time.h>
53 #include <unistd.h>
54 #include <utime.h>
56 #ifdef HAVE_MMAP
57 #include <sys/mman.h>
58 #else /* !HAVE_MMAP */
59 #define mmap(a, b, c, d, e, f) MAP_FAILED
60 #define munmap(a, b)
61 #endif /* !HAVE_MMAP */
62 #ifndef HAVE_MREMAP
63 #define mremap(a, b, c, d) MAP_FAILED
64 #endif /* !HAVE_MREMAP */
66 #ifndef MAP_FAILED
67 #define MAP_FAILED ((void *)-1)
68 #endif /* !MAP_FAILED */
70 #include "extern.h"
71 #include "md5.h"
74 * Mail -- a mail program
76 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
77 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
78 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
80 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
81 * Classification using Chained Tokens", February 2004,
82 * <http://www.nuclearelephant.com/papers/chained.html>.
85 #define DFL .40
86 #define THR .9
87 #define MID .5
89 #define MAX2 0x0000ffff
90 #define MAX3 0x00ffffffUL
91 #define MAX4 0xffffffffUL
94 * The dictionary consists of two files forming a hash table. The hash
95 * consists of the first 56 bits of the result of applying MD5 to the
96 * input word. This scheme ensures that collisions are unlikely enough
97 * to make junk detection work; according to the birthday paradox, a
98 * 50 % probability for one single collision is reached at 2^28 entries.
100 * To make the chain structure independent from input, the MD5 input is
101 * xor'ed with a random number. This makes it impossible that someone uses
102 * a carefully crafted message for a denial-of-service attack against the
103 * database.
105 #define SIZEOF_node 17
106 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
107 #define OF_node_next 4 /* bit-negated table index of next node */
108 #define OF_node_good 8 /* number of times this appeared in good msgs */
109 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
110 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
111 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
112 static char *nodes;
114 #define SIZEOF_super 262164
115 #define OF_super_size 0 /* allocated nodes in the chain file */
116 #define OF_super_used 4 /* used nodes in the chain file */
117 #define OF_super_ngood 8 /* number of good messages scanned so far */
118 #define OF_super_nbad 12 /* number of bad messages scanned so far */
119 #define OF_super_mangle 16 /* used to mangle the MD5 input */
120 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
121 #define SIZEOF_entry 4
122 static char *super;
124 static size_t super_mmapped;
125 static size_t nodes_mmapped;
126 static int rw_map;
127 static int chained_tokens;
130 * Version history
131 * ---------------
132 * 0 Initial version
133 * 1 Fixed the mangling; it was ineffective in version 0.
134 * Hash extended to 56 bits.
136 static int table_version;
137 #define current_table_version 1
139 #define get(e) \
140 ((unsigned)(((char *)(e))[0]&0377) + \
141 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
142 ((unsigned)(((char *)(e))[2]&0377) << 16))
144 #define put(e, n) \
145 (((char *)(e))[0] = (n) & 0x0000ff, \
146 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
147 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
149 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
151 #define s2f(s) ((float)(s) / MAX3)
153 #define getn(p) \
154 ((unsigned long)(((char *)(p))[0]&0377) + \
155 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
156 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
157 ((unsigned long)(((char *)(p))[3]&0377) << 24))
159 #define putn(p, n) \
160 (((char *)(p))[0] = (n) & 0x000000ffUL, \
161 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
162 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
163 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
165 struct lexstat {
166 char *save;
167 int price;
168 int url;
169 int lastc;
170 int hadamp;
171 enum loc {
172 FROM_LINE = 0,
173 HEADER = 1,
174 BODY = 2
175 } loc;
176 enum html {
177 HTML_NONE = 0,
178 HTML_TEXT = 1,
179 HTML_TAG = 2,
180 HTML_SKIP = 3
181 } html;
182 char tag[8];
183 char *tagp;
184 char field[LINESIZE];
187 #define constituent(c, b, i, price, hadamp) \
188 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
189 (c) == '$' || (c) == '!' || (c) == '_' || \
190 (c) == '#' || (c) == '%' || (c) == '&' || \
191 ((c) == ';' && hadamp) || \
192 ((c) == '-' && !(price)) || \
193 (((c) == '.' || (c) == ',' || (c) == '/') && \
194 (i) > 0 && digitchar((b)[(i)-1]&0377)))
196 #define url_xchar(c) \
197 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
198 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
199 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
200 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
201 (c) != '#' && (c) != '"')
203 enum db {
204 SUPER = 0,
205 NODES = 1
208 enum entry {
209 GOOD = 0,
210 BAD = 1
213 static const char README1[] = "\
214 This is a junk mail database maintained by mailx(1). It does not contain any\n\
215 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
216 used for lookup. It is thus possible to tell if some given word was likely\n\
217 contained in your mail from examining this data, at best.\n";
218 static const char README2[] = "\n\
219 The database files are stored in compress(1) format by default. This saves\n\
220 some space, but leads to higher processor usage when the database is read\n\
221 or updated. You can use uncompress(1) on these files if you prefer to store\n\
222 them in flat form.\n";
224 static int verbose;
225 static int _debug;
226 static FILE *sfp, *nfp;
227 static char *sname, *nname;
229 static enum okay getdb(int rw);
230 static void putdb(void);
231 static void relsedb(void);
232 static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
233 static char *lookup(unsigned long h1, unsigned long h2, int create);
234 static unsigned long grow(unsigned long size);
235 static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
236 struct lexstat *sp, int *stop);
237 static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
238 static void add(const char *word, enum entry entry, struct lexstat *sp,
239 int incr);
240 static enum okay scan(struct message *m, enum entry entry,
241 void (*func)(const char *, enum entry, struct lexstat *, int),
242 int arg);
243 static void recompute(void);
244 static float getprob(char *n);
245 static int insert(int *msgvec, enum entry entry, int incr);
246 static void clsf(struct message *m);
247 static void rate(const char *word, enum entry entry, struct lexstat *sp,
248 int unused);
249 static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
250 static void mkmangle(void);
252 static enum okay
253 getdb(int rw)
255 void *zp = NULL;
256 long n;
257 int compressed;
259 chained_tokens = value("chained-junk-tokens") != NULL;
260 if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
261 return STOP;
262 if (sfp && !compressed) {
263 super = mmap(NULL, SIZEOF_super,
264 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
265 MAP_SHARED, fileno(sfp), 0);
266 if (super != MAP_FAILED) {
267 super_mmapped = SIZEOF_super;
268 goto skip;
271 super_mmapped = 0;
272 super = smalloc(SIZEOF_super);
273 if (sfp) {
274 if (compressed)
275 zp = zalloc(sfp);
276 if ((compressed ? zread(zp, super, SIZEOF_super)
277 != SIZEOF_super :
278 fread(super, 1, SIZEOF_super, sfp)
279 != SIZEOF_super) ||
280 ferror(sfp)) {
281 fprintf(stderr, "Error reading junk mail database.\n");
282 memset(super, 0, SIZEOF_super);
283 mkmangle();
284 if (compressed)
285 zfree(zp);
286 Fclose(sfp);
287 sfp = NULL;
288 } else if (compressed)
289 zfree(zp);
290 } else {
291 memset(super, 0, SIZEOF_super);
292 mkmangle();
294 skip: if ((n = getn(&super[OF_super_size])) == 0) {
295 n = 1;
296 putn(&super[OF_super_size], 1);
298 if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
299 if (nfp == (FILE *)-1) {
300 relsedb();
301 return STOP;
304 rw_map = rw;
305 if (sfp && nfp && !compressed) {
306 nodes = mmap(NULL, n * SIZEOF_node,
307 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
308 MAP_SHARED, fileno(nfp), 0);
309 if (nodes != MAP_FAILED) {
310 nodes_mmapped = n * SIZEOF_node;
311 return OKAY;
314 nodes_mmapped = 0;
315 nodes = smalloc(n * SIZEOF_node);
316 if (sfp && nfp) {
317 if (compressed)
318 zp = zalloc(nfp);
319 if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
320 != n * SIZEOF_node :
321 fread(nodes, 1, n * SIZEOF_node, nfp)
322 != n * SIZEOF_node) ||
323 ferror(nfp)) {
324 fprintf(stderr, "Error reading junk mail database.\n");
325 memset(nodes, 0, n * SIZEOF_node);
326 memset(super, 0, SIZEOF_super);
327 mkmangle();
328 putn(&super[OF_super_size], n);
330 if (compressed)
331 zfree(zp);
332 Fclose(nfp);
333 nfp = NULL;
334 } else
335 memset(nodes, 0, n * SIZEOF_node);
336 if (sfp) {
337 Fclose(sfp);
338 sfp = NULL;
340 return OKAY;
343 static void
344 putdb(void)
346 void *zp;
347 int scomp, ncomp;
349 if (!super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
350 == NULL || sfp == (FILE *)-1)
351 return;
352 if (!nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
353 == NULL || nfp == (FILE *)-1)
354 return;
355 if (super_mmapped == 0 || nodes_mmapped == 0)
356 holdint();
358 * Use utime() with mmap() since Linux does not update st_mtime
359 * reliably otherwise.
361 if (super_mmapped)
362 utime(sname, NULL);
363 else if (scomp) {
364 zp = zalloc(sfp);
365 zwrite(zp, super, SIZEOF_super);
366 zfree(zp);
367 trunc(sfp);
368 } else
369 fwrite(super, 1, SIZEOF_super, sfp);
370 if (nodes_mmapped)
371 utime(nname, NULL);
372 else if (ncomp) {
373 zp = zalloc(nfp);
374 zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
375 zfree(zp);
376 trunc(nfp);
377 } else
378 fwrite(nodes, 1,
379 getn(&super[OF_super_size]) * SIZEOF_node, nfp);
380 if (super_mmapped == 0 || nodes_mmapped == 0)
381 relseint();
384 static void
385 relsedb(void)
387 if (super_mmapped) {
388 munmap(super, super_mmapped);
389 super_mmapped = 0;
390 } else
391 free(super);
392 if (nodes_mmapped) {
393 munmap(nodes, nodes_mmapped);
394 nodes_mmapped = 0;
395 } else
396 free(nodes);
397 if (sfp && sfp != (FILE *)-1) {
398 Fclose(sfp);
399 sfp = NULL;
401 if (nfp && nfp != (FILE *)-1) {
402 Fclose(nfp);
403 nfp = NULL;
407 static FILE *
408 dbfp(enum db db, int rw, int *compressed, char **fn)
410 FILE *fp, *rp;
411 char *dir;
412 struct flock flp;
413 char *sfx[][2] = {
414 { "super", "nodes" },
415 { "super1", "nodes1" }
417 char **sf;
418 char *zfx[][2] = {
419 { "super.Z", "nodes.Z" },
420 { "super1.Z", "nodes1.Z" }
422 char **zf;
423 int n;
425 if ((dir = value("junkdb")) == NULL) {
426 fprintf(stderr, "No junk mail database specified. "
427 "Set the junkdb variable.\n");
428 return (FILE *)-1;
430 dir = expand(dir);
431 if (makedir(dir) == STOP) {
432 fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
433 return (FILE *)-1;
435 if (rw!=O_WRONLY)
436 table_version = current_table_version;
437 loop: sf = sfx[table_version];
438 zf = zfx[table_version];
439 *fn = salloc((n = strlen(dir)) + 40);
440 strcpy(*fn, dir);
441 (*fn)[n] = '/';
442 *compressed = 0;
443 strcpy(&(*fn)[n+1], sf[db]);
444 if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
445 goto okay;
446 *compressed = 1;
447 strcpy(&(*fn)[n+1], zf[db]);
448 if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
449 rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
450 fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
451 return NULL;
453 if (rw==O_WRONLY) {
454 strcpy(&(*fn)[n+1], "README");
455 if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
456 fputs(README1, rp);
457 fputs(README2, rp);
458 Fclose(rp);
460 } else if (fp == NULL) {
461 if (table_version > 0) {
462 table_version--;
463 goto loop;
464 } else
465 table_version = current_table_version;
467 okay: if (fp) {
468 flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
469 flp.l_start = 0;
470 flp.l_len = 0;
471 flp.l_whence = SEEK_SET;
472 fcntl(fileno(fp), F_SETLKW, &flp);
474 return fp;
477 static char *
478 lookup(unsigned long h1, unsigned long h2, int create)
480 char *n, *lastn = NULL;
481 unsigned long c, lastc = MAX4, used, size;
483 used = getn(&super[OF_super_used]);
484 size = getn(&super[OF_super_size]);
485 c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
486 n = &nodes[c*SIZEOF_node];
487 while (c < used) {
488 if (getn(&n[OF_node_hash]) == h1 &&
489 (table_version < 1 ? 1 :
490 get(&n[OF_node_hash2]) == h2))
491 return n;
492 lastc = c;
493 lastn = n;
494 c = ~getn(&n[OF_node_next]);
495 n = &nodes[c*SIZEOF_node];
497 if (create) {
498 if (used >= size) {
499 if ((size = grow(size)) == 0)
500 return NULL;
501 lastn = &nodes[lastc*SIZEOF_node];
503 putn(&super[OF_super_used], used+1);
504 n = &nodes[used*SIZEOF_node];
505 putn(&n[OF_node_hash], h1);
506 put(&n[OF_node_hash2], h2);
507 if (lastc < used)
508 putn(&lastn[OF_node_next], ~used);
509 else
510 putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
511 ~used);
512 return n;
513 } else
514 return NULL;
517 static unsigned long
518 grow(unsigned long size)
520 unsigned long incr, newsize;
521 void *onodes;
523 incr = size > MAX2 ? MAX2 : size;
524 newsize = size + incr;
525 if (newsize > MAX4-MAX2) {
526 oflo: fprintf(stderr, "Junk mail database overflow.\n");
527 return 0;
529 if (nodes_mmapped) {
530 if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
531 == (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
532 goto oflo;
533 onodes = nodes;
534 if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
535 MREMAP_MAYMOVE)) == MAP_FAILED) {
536 if ((nodes = mmap(NULL, newsize*SIZEOF_node,
537 rw_map!=O_RDONLY ?
538 PROT_READ|PROT_WRITE :
539 PROT_READ,
540 MAP_SHARED, fileno(nfp), 0))
541 == MAP_FAILED) {
542 nodes = onodes;
543 goto oflo;
545 munmap(onodes, nodes_mmapped);
547 nodes_mmapped = newsize*SIZEOF_node;
548 } else {
549 nodes = srealloc(nodes, newsize*SIZEOF_node);
550 memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
552 size = newsize;
553 putn(&super[OF_super_size], size);
554 return size;
557 #define SAVE(c) { \
558 if (i+j >= (long)*bufsize-4) \
559 *buf = srealloc(*buf, *bufsize += 32); \
560 (*buf)[j+i] = (c); \
561 i += (*buf)[j+i] != '\0'; \
564 static char *
565 nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
566 struct lexstat *sp, int *stop)
568 int c, i, j, k;
569 char *cp, *cq;
571 loop: *stop = 0;
572 sp->hadamp = 0;
573 if (sp->save) {
574 i = j = 0;
575 for (cp = sp->save; *cp; cp++) {
576 SAVE(*cp&0377)
578 SAVE('\0')
579 free(sp->save);
580 sp->save = NULL;
581 goto out;
583 if (sp->loc == FROM_LINE)
584 while (*count > 0 && (c = getc(fp)) != EOF) {
585 sp->lastc = c;
586 if (c == '\n') {
587 sp->loc = HEADER;
588 break;
591 i = 0;
592 j = 0;
593 if (sp->loc == HEADER && sp->field[0]) {
594 field: cp = sp->field;
595 do {
596 c = *cp&0377;
597 SAVE(c)
598 cp++;
599 } while (*cp);
600 j = i;
601 i = 0;
603 if (sp->price) {
604 sp->price = 0;
605 SAVE('$')
607 while (*count > 0 && (c = getc(fp)) != EOF) {
608 (*count)--;
609 if (c == '\0' && table_version >= 1) {
610 sp->loc = HEADER;
611 sp->lastc = '\n';
612 *stop = 1;
613 continue;
615 if (c == '\b' && table_version >= 1) {
616 sp->html = HTML_TEXT;
617 continue;
619 if (c == '<' && sp->html == HTML_TEXT) {
620 sp->html = HTML_TAG;
621 sp->tagp = sp->tag;
622 continue;
624 if (sp->html == HTML_TAG) {
625 if (spacechar(c)) {
626 *sp->tagp = '\0';
627 if (!asccasecmp(sp->tag, "a") ||
628 !asccasecmp(sp->tag, "img") ||
629 !asccasecmp(sp->tag, "font") ||
630 !asccasecmp(sp->tag, "span") ||
631 !asccasecmp(sp->tag, "meta") ||
632 !asccasecmp(sp->tag, "table") ||
633 !asccasecmp(sp->tag, "tr") ||
634 !asccasecmp(sp->tag, "td") ||
635 !asccasecmp(sp->tag, "p"))
636 sp->html = HTML_TEXT;
637 else
638 sp->html = HTML_SKIP;
639 } else if (c == '>') {
640 sp->html = HTML_TEXT;
641 continue;
642 } else {
643 if (sp->tagp - sp->tag < sizeof sp->tag - 1)
644 *sp->tagp++ = c;
645 continue;
648 if (sp->html == HTML_SKIP) {
649 if (c == '>')
650 sp->html = HTML_TEXT;
651 continue;
653 if (c == '$' && i == 0)
654 sp->price = 1;
655 if (sp->loc == HEADER && sp->lastc == '\n') {
656 if (!spacechar(c)) {
657 k = 0;
658 while (k < sizeof sp->field - 3) {
659 sp->field[k++] = c;
660 if (*count <= 0 ||
661 (c = getc(fp)) == EOF)
662 break;
663 if (spacechar(c) || c == ':') {
664 ungetc(c, fp);
665 break;
667 sp->lastc = c;
668 (*count)--;
670 sp->field[k++] = '*';
671 sp->field[k] = '\0';
672 j = 0;
673 *stop = 1;
674 goto field;
675 } else if (c == '\n') {
676 j = 0;
677 sp->loc = BODY;
678 sp->html = HTML_NONE;
679 *stop = 1;
682 if (sp->url) {
683 if (!url_xchar(c)) {
684 sp->url = 0;
685 cp = sp->save = smalloc(i+6);
686 for (cq = "HOST*"; *cq; cq++)
687 *cp++ = *cq;
688 for (cq = &(*buf)[j]; *cq != ':'; cq++);
689 cq += 3; /* skip "://" */
690 while (cq < &(*buf)[i+j] &&
691 (alnumchar(*cq&0377) ||
692 *cq == '.' || *cq == '-'))
693 *cp++ = *cq++;
694 *cp = '\0';
695 *stop = 1;
696 break;
698 SAVE(c)
699 } else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
700 sp->loc == HEADER && c == '.' &&
701 asccasecmp(sp->field, "subject*")) {
702 if (c == '&')
703 sp->hadamp = 1;
704 SAVE(c)
705 } else if (i > 0 && c == ':' && *count > 2) {
706 if ((c = getc(fp)) != '/') {
707 ungetc(c, fp);
708 break;
710 (*count)--;
711 if ((c = getc(fp)) != '/') {
712 ungetc(c, fp);
713 break;
715 (*count)--;
716 sp->url = 1;
717 SAVE('\0')
718 cp = savestr(*buf);
719 j = i = 0;
720 for (cq = "URL*"; *cq; cq++) {
721 SAVE(*cq&0377)
723 j = i;
724 i = 0;
725 do {
726 if (alnumchar(*cp&0377)) {
727 SAVE(*cp&0377)
728 } else
729 i = 0;
730 } while (*++cp);
731 for (cq = "://"; *cq; cq++) {
732 SAVE(*cq&0377)
734 } else if (i > 1 && ((*buf)[i+j-1] == ',' ||
735 (*buf)[i+j-1] == '.') && !digitchar(c)) {
736 i--;
737 ungetc(c, fp);
738 (*count)++;
739 break;
740 } else if (i > 0) {
741 sp->lastc = c;
742 break;
744 sp->lastc = c;
746 out: if (i > 0) {
747 SAVE('\0')
748 c = 0;
749 for (k = 0; k < i; k++)
750 if (digitchar((*buf)[k+j]&0377))
751 c++;
752 else if (!alphachar((*buf)[k+j]&0377) &&
753 (*buf)[k+j] != '$') {
754 c = 0;
755 break;
757 if (c == i)
758 goto loop;
760 * Including the results of other filtering software (the
761 * 'X-Spam' fields) might seem tempting, but will also rate
762 * their false negatives good with this filter. Therefore
763 * these fields are ignored.
765 * Handling 'Received' fields is difficult since they include
766 * lots of both useless and interesting words for our purposes.
768 if (sp->loc == HEADER &&
769 (asccasecmp(sp->field, "message-id*") == 0 ||
770 asccasecmp(sp->field, "references*") == 0 ||
771 asccasecmp(sp->field, "in-reply-to*") == 0 ||
772 asccasecmp(sp->field, "status*") == 0 ||
773 asccasecmp(sp->field, "x-status*") == 0 ||
774 asccasecmp(sp->field, "date*") == 0 ||
775 asccasecmp(sp->field, "delivery-date*") == 0 ||
776 ascncasecmp(sp->field, "x-spam", 6) == 0 ||
777 ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
778 ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
779 asccasecmp(sp->field, "received*") == 0 &&
780 ((2*c > i) || i < 4 ||
781 asccasestr(*buf, "localhost") != NULL)))
782 goto loop;
783 return *buf;
785 return NULL;
788 #define JOINCHECK if (i >= *bufsize) \
789 *buf = srealloc(*buf, *bufsize += 32)
790 static void
791 join(char **buf, size_t *bufsize, const char *s1, const char *s2)
793 int i = 0;
795 while (*s1) {
796 JOINCHECK;
797 (*buf)[i++] = *s1++;
799 JOINCHECK;
800 (*buf)[i++] = ' ';
801 do {
802 JOINCHECK;
803 (*buf)[i++] = *s2;
804 } while (*s2++);
807 /*ARGSUSED3*/
808 static void
809 add(const char *word, enum entry entry, struct lexstat *sp, int incr)
811 unsigned c;
812 unsigned long h1, h2;
813 char *n;
815 dbhash(word, &h1, &h2);
816 if ((n = lookup(h1, h2, 1)) != NULL) {
817 switch (entry) {
818 case GOOD:
819 c = get(&n[OF_node_good]);
820 if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
821 c += incr;
822 put(&n[OF_node_good], c);
824 break;
825 case BAD:
826 c = get(&n[OF_node_bad]);
827 if (incr>0 && c<MAX3-incr || incr<0 && c>=-incr) {
828 c += incr;
829 put(&n[OF_node_bad], c);
831 break;
836 static enum okay
837 scan(struct message *m, enum entry entry,
838 void (*func)(const char *, enum entry, struct lexstat *, int),
839 int arg)
841 FILE *fp;
842 char *buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
843 size_t bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
844 struct lexstat *sp;
845 int stop;
847 if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
848 perror("tempfile");
849 return STOP;
851 rm(cp);
852 Ftfree(&cp);
853 if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
854 Fclose(fp);
855 return STOP;
857 fflush(fp);
858 rewind(fp);
859 sp = scalloc(1, sizeof *sp);
860 count = fsize(fp);
861 bp = &buf0;
862 zp = &bufsize0;
863 while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
864 (*func)(*bp, entry, sp, arg);
865 if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
866 join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
867 (*func)(buf2, entry, sp, arg);
869 bp = bp == &buf1 ? &buf0 : &buf1;
870 zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
872 free(buf0);
873 free(buf1);
874 free(buf2);
875 free(sp);
876 Fclose(fp);
877 return OKAY;
880 static void
881 recompute(void)
883 unsigned long used, i;
884 unsigned s;
885 char *n;
886 float p;
888 used = getn(&super[OF_super_used]);
889 for (i = 0; i < used; i++) {
890 n = &nodes[i*SIZEOF_node];
891 p = getprob(n);
892 s = f2s(p);
893 put(&n[OF_node_prob_O], s);
897 static float
898 getprob(char *n)
900 unsigned long ngood, nbad;
901 unsigned g, b;
902 float p, BOT, TOP;
904 ngood = getn(&super[OF_super_ngood]);
905 nbad = getn(&super[OF_super_nbad]);
906 if (ngood + nbad >= 18000) {
907 BOT = .0001;
908 TOP = .9999;
909 } else if (ngood + nbad >= 9000) {
910 BOT = .001;
911 TOP = .999;
912 } else {
913 BOT = .01;
914 TOP = .99;
916 g = get(&n[OF_node_good]) * 2;
917 b = get(&n[OF_node_bad]);
918 if (g + b >= 5) {
919 p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
920 (smin(1.0, ngood ? (float)g/ngood : 0.0) +
921 smin(1.0, nbad ? (float)b/nbad : 0.0));
922 p = smin(TOP, p);
923 p = smax(BOT, p);
924 if (p == TOP && b <= 10 && g == 0)
925 p -= BOT;
926 else if (p == BOT && g <= 10 && b == 0)
927 p += BOT;
928 } else if (g == 0 && b == 0)
929 p = DFL;
930 else
931 p = 0;
932 return p;
935 static int
936 insert(int *msgvec, enum entry entry, int incr)
938 int *ip;
939 unsigned long u = 0;
941 verbose = value("verbose") != NULL;
942 if (getdb(O_RDWR) != OKAY)
943 return 1;
944 switch (entry) {
945 case GOOD:
946 u = getn(&super[OF_super_ngood]);
947 break;
948 case BAD:
949 u = getn(&super[OF_super_nbad]);
950 break;
952 for (ip = msgvec; *ip; ip++) {
953 setdot(&message[*ip-1]);
954 if (incr > 0 && u == MAX4-incr+1) {
955 fprintf(stderr, "Junk mail database overflow.\n");
956 break;
957 } else if (incr < 0 && -incr > u) {
958 fprintf(stderr, "Junk mail database underflow.\n");
959 break;
961 u += incr;
962 if (entry == GOOD && incr > 0 || entry == BAD && incr < 0)
963 message[*ip-1].m_flag &= ~MJUNK;
964 else
965 message[*ip-1].m_flag |= MJUNK;
966 scan(&message[*ip-1], entry, add, incr);
968 switch (entry) {
969 case GOOD:
970 putn(&super[OF_super_ngood], u);
971 break;
972 case BAD:
973 putn(&super[OF_super_nbad], u);
974 break;
976 if (table_version < 1)
977 recompute();
978 putdb();
979 relsedb();
980 return 0;
984 cgood(void *v)
986 return insert(v, GOOD, 1);
990 cjunk(void *v)
992 return insert(v, BAD, 1);
996 cungood(void *v)
998 return insert(v, GOOD, -1);
1002 cunjunk(void *v)
1004 return insert(v, BAD, -1);
1007 int
1008 cclassify(void *v)
1010 int *msgvec = v, *ip;
1012 verbose = value("verbose") != NULL;
1013 _debug = debug || value("debug") != NULL;
1014 if (getdb(O_RDONLY) != OKAY)
1015 return 1;
1016 for (ip = msgvec; *ip; ip++) {
1017 setdot(&message[*ip-1]);
1018 clsf(&message[*ip-1]);
1020 relsedb();
1021 return 0;
1024 #define BEST 15
1025 static struct {
1026 float dist;
1027 float prob;
1028 char *word;
1029 unsigned long hash1;
1030 unsigned long hash2;
1031 enum loc loc;
1032 } best[BEST];
1034 static void
1035 clsf(struct message *m)
1037 int i;
1038 float a = 1, b = 1, r;
1040 if (verbose)
1041 fprintf(stderr, "Examining message %d\n",
1042 (int)(m - &message[0] + 1));
1043 for (i = 0; i < BEST; i++) {
1044 best[i].dist = 0;
1045 best[i].prob = -1;
1047 if (scan(m, -1, rate, 0) != OKAY)
1048 return;
1049 if (best[0].prob == -1) {
1050 if (verbose)
1051 fprintf(stderr, "No information found.\n");
1052 m->m_flag &= ~MJUNK;
1053 return;
1055 for (i = 0; i < BEST; i++) {
1056 if (best[i].prob == -1)
1057 break;
1058 if (verbose)
1059 fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1060 "prob=%.4g dist=%.4g\n",
1061 i+1, prstr(best[i].word),
1062 best[i].hash1, best[i].hash2,
1063 best[i].prob, best[i].dist);
1064 a *= best[i].prob;
1065 b *= 1 - best[i].prob;
1067 r = a+b > 0 ? a / (a+b) : 0;
1068 if (verbose)
1069 fprintf(stderr, "Junk probability of message %d: %g\n",
1070 (int)(m - &message[0] + 1), r);
1071 if (r > THR)
1072 m->m_flag |= MJUNK;
1073 else
1074 m->m_flag &= ~MJUNK;
1077 /*ARGSUSED4*/
1078 static void
1079 rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1081 char *n;
1082 unsigned long h1, h2;
1083 float p, d;
1084 int i, j;
1086 dbhash(word, &h1, &h2);
1087 if ((n = lookup(h1, h2, 0)) != NULL) {
1088 p = getprob(n);
1089 } else
1090 p = DFL;
1091 if (_debug)
1092 fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1093 n ? get(&n[OF_node_good]) : 0,
1094 n ? get(&n[OF_node_bad]) : 0,
1095 p, prstr(word));
1096 if (p == 0)
1097 return;
1098 d = p >= MID ? p - MID : MID - p;
1099 if (d >= best[BEST-1].dist)
1100 for (i = 0; i < BEST; i++) {
1101 if (h1 == best[i].hash1 && h2 == best[i].hash2)
1102 break;
1104 * For equal distance, this selection prefers
1105 * words with a low probability, since a false
1106 * negative is better than a false positive,
1107 * and since experience has shown that false
1108 * positives are more likely otherwise. Then,
1109 * words from the end of the header and from
1110 * the start of the body are preferred. This
1111 * gives the most interesting verbose output.
1113 if (d > best[i].dist ||
1114 d == best[i].dist &&
1115 p < best[i].prob ||
1116 best[i].loc == HEADER &&
1117 d == best[i].dist) {
1118 for (j = BEST-2; j >= i; j--)
1119 best[j+1] = best[j];
1120 best[i].dist = d;
1121 best[i].prob = p;
1122 best[i].word = savestr(word);
1123 best[i].hash1 = h1;
1124 best[i].hash2 = h2;
1125 best[i].loc = sp->loc;
1126 break;
1131 static void
1132 dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1134 unsigned char digest[16];
1135 MD5_CTX ctx;
1137 MD5Init(&ctx);
1138 MD5Update(&ctx, (unsigned char *)word, strlen(word));
1139 if (table_version >= 1)
1140 MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1141 MD5Final(digest, &ctx);
1142 *h1 = getn(digest);
1143 if (table_version < 1) {
1144 *h1 ^= getn(&super[OF_super_mangle]);
1145 *h2 = 0;
1146 } else
1147 *h2 = get(&digest[4]);
1151 * The selection of the value for mangling is not critical. It is practically
1152 * impossible for any person to determine the exact time when the database
1153 * was created first (without looking at the database, which would reveal the
1154 * value anyway), so we just use this. The MD5 hash here ensures that each
1155 * single second gives a completely different mangling value (which is not
1156 * necessary anymore if table_version>=1, but does not hurt).
1158 static void
1159 mkmangle(void)
1161 union {
1162 time_t t;
1163 char c[16];
1164 } u;
1165 unsigned long s;
1166 unsigned char digest[16];
1167 MD5_CTX ctx;
1169 memset(&u, 0, sizeof u);
1170 time(&u.t);
1171 MD5Init(&ctx);
1172 MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1173 MD5Final(digest, &ctx);
1174 s = getn(digest);
1175 putn(&super[OF_super_mangle], s);
1179 cprobability(void *v)
1181 char **args = v;
1182 unsigned long used, ngood, nbad;
1183 unsigned long h1, h2;
1184 unsigned g, b;
1185 float p, d;
1186 char *n;
1188 if (*args == NULL) {
1189 fprintf(stderr, "No words given.\n");
1190 return 1;
1192 if (getdb(O_RDONLY) != OKAY)
1193 return 1;
1194 used = getn(&super[OF_super_used]);
1195 ngood = getn(&super[OF_super_ngood]);
1196 nbad = getn(&super[OF_super_nbad]);
1197 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1198 used, ngood, nbad);
1199 do {
1200 dbhash(*args, &h1, &h2);
1201 printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1202 if ((n = lookup(h1, h2, 0)) != NULL) {
1203 g = get(&n[OF_node_good]);
1204 b = get(&n[OF_node_bad]);
1205 printf("good=%u bad=%u ", g, b);
1206 p = getprob(n);
1207 if (p != 0) {
1208 d = p >= MID ? p - MID : MID - p;
1209 printf("prob=%.4g dist=%.4g", p, d);
1210 } else
1211 printf("too infrequent");
1212 } else
1213 printf("not in database");
1214 putchar('\n');
1215 } while (*++args);
1216 relsedb();
1217 return 0;
1220 #else /* !USE_JUNK */
1222 static int
1223 nojunk(void)
1225 fputs(catgets(catd, CATSET, 270, "No JUNK support compiled in.\n"),
1226 stderr);
1227 return (1);
1231 cgood(void *v)
1233 (void)v;
1234 return nojunk();
1238 cjunk(void *v)
1240 (void)v;
1241 return nojunk();
1245 cungood(void *v)
1247 (void)v;
1248 return nojunk();
1252 cunjunk(void *v)
1254 (void)v;
1255 return nojunk();
1259 cclassify(void *v)
1261 (void)v;
1262 return nojunk();
1266 cprobability(void *v)
1268 (void)v;
1269 return nojunk();
1272 #endif /* USE_JUNK */