names.c: fix compiler warnings
[s-mailx.git] / junk.c
blob03d8ddced08b37644dfd43092fdfda918012e801
1 /*
2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 * Copyright (c) 2012 Steffen "Daode" Nurpmeso.
6 */
7 /*
8 * Copyright (c) 2004
9 * Gunnar Ritter. All rights reserved.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by Gunnar Ritter
22 * and his contributors.
23 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
40 #ifndef lint
41 #ifdef DOSCCS
42 static char sccsid[] = "@(#)junk.c 1.75 (gritter) 9/14/08";
43 #endif
44 #endif /* not lint */
46 #include "config.h"
47 #include "rcv.h"
49 #ifdef USE_JUNK
50 #include <sys/stat.h>
51 #include <fcntl.h>
52 #include <limits.h>
53 #include <time.h>
54 #include <unistd.h>
55 #include <utime.h>
57 #ifdef HAVE_MMAP
58 #include <sys/mman.h>
59 #else /* !HAVE_MMAP */
60 #define mmap(a, b, c, d, e, f) MAP_FAILED
61 #define munmap(a, b)
62 #endif /* !HAVE_MMAP */
63 #ifndef HAVE_MREMAP
64 #define mremap(a, b, c, d) MAP_FAILED
65 #endif /* !HAVE_MREMAP */
67 #ifndef MAP_FAILED
68 #define MAP_FAILED ((void *)-1)
69 #endif /* !MAP_FAILED */
71 #include "extern.h"
72 #include "md5.h"
75 * Mail -- a mail program
77 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
78 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
79 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
81 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
82 * Classification using Chained Tokens", February 2004,
83 * <http://www.nuclearelephant.com/papers/chained.html>.
86 #define DFL .40
87 #define THR .9
88 #define MID .5
90 #define MAX2 0x0000ffff
91 #define MAX3 0x00ffffffUL
92 #define MAX4 0xffffffffUL
95 * The dictionary consists of two files forming a hash table. The hash
96 * consists of the first 56 bits of the result of applying MD5 to the
97 * input word. This scheme ensures that collisions are unlikely enough
98 * to make junk detection work; according to the birthday paradox, a
99 * 50 % probability for one single collision is reached at 2^28 entries.
101 * To make the chain structure independent from input, the MD5 input is
102 * xor'ed with a random number. This makes it impossible that someone uses
103 * a carefully crafted message for a denial-of-service attack against the
104 * database.
106 #define SIZEOF_node 17
107 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
108 #define OF_node_next 4 /* bit-negated table index of next node */
109 #define OF_node_good 8 /* number of times this appeared in good msgs */
110 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
111 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
112 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
113 static char *nodes;
115 #define SIZEOF_super 262164
116 #define OF_super_size 0 /* allocated nodes in the chain file */
117 #define OF_super_used 4 /* used nodes in the chain file */
118 #define OF_super_ngood 8 /* number of good messages scanned so far */
119 #define OF_super_nbad 12 /* number of bad messages scanned so far */
120 #define OF_super_mangle 16 /* used to mangle the MD5 input */
121 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
122 #define SIZEOF_entry 4
123 static char *super;
125 static size_t super_mmapped;
126 static size_t nodes_mmapped;
127 static int rw_map;
128 static int chained_tokens;
131 * Version history
132 * ---------------
133 * 0 Initial version
134 * 1 Fixed the mangling; it was ineffective in version 0.
135 * Hash extended to 56 bits.
137 static int table_version;
138 #define current_table_version 1
140 #define get(e) \
141 ((unsigned)(((char *)(e))[0]&0377) + \
142 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
143 ((unsigned)(((char *)(e))[2]&0377) << 16))
145 #define put(e, n) \
146 (((char *)(e))[0] = (n) & 0x0000ff, \
147 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
148 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
150 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
152 #define s2f(s) ((float)(s) / MAX3)
154 #define getn(p) \
155 ((unsigned long)(((char *)(p))[0]&0377) + \
156 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
157 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
158 ((unsigned long)(((char *)(p))[3]&0377) << 24))
160 #define putn(p, n) \
161 (((char *)(p))[0] = (n) & 0x000000ffUL, \
162 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
163 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
164 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
166 struct lexstat {
167 char *save;
168 int price;
169 int url;
170 int lastc;
171 int hadamp;
172 enum loc {
173 FROM_LINE = 0,
174 HEADER = 1,
175 BODY = 2
176 } loc;
177 enum html {
178 HTML_NONE = 0,
179 HTML_TEXT = 1,
180 HTML_TAG = 2,
181 HTML_SKIP = 3
182 } html;
183 char tag[8];
184 char *tagp;
185 char field[LINESIZE];
188 #define constituent(c, b, i, price, hadamp) \
189 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
190 (c) == '$' || (c) == '!' || (c) == '_' || \
191 (c) == '#' || (c) == '%' || (c) == '&' || \
192 ((c) == ';' && hadamp) || \
193 ((c) == '-' && !(price)) || \
194 (((c) == '.' || (c) == ',' || (c) == '/') && \
195 (i) > 0 && digitchar((b)[(i)-1]&0377)))
197 #define url_xchar(c) \
198 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
199 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
200 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
201 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
202 (c) != '#' && (c) != '"')
204 enum db {
205 SUPER = 0,
206 NODES = 1
209 enum entry {
210 GOOD = 0,
211 BAD = 1
214 static const char README1[] = "\
215 This is a junk mail database maintained by mailx(1). It does not contain any\n\
216 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
217 used for lookup. It is thus possible to tell if some given word was likely\n\
218 contained in your mail from examining this data, at best.\n";
219 static const char README2[] = "\n\
220 The database files are stored in compress(1) format by default. This saves\n\
221 some space, but leads to higher processor usage when the database is read\n\
222 or updated. You can use uncompress(1) on these files if you prefer to store\n\
223 them in flat form.\n";
225 static int verbose;
226 static int _debug;
227 static FILE *sfp, *nfp;
228 static char *sname, *nname;
230 static enum okay getdb(int rw);
231 static void putdb(void);
232 static void relsedb(void);
233 static FILE *dbfp(enum db db, int rw, int *compressed, char **fn);
234 static char *lookup(unsigned long h1, unsigned long h2, int create);
235 static unsigned long grow(unsigned long size);
236 static char *nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
237 struct lexstat *sp, int *stop);
238 static void join(char **buf, size_t *bufsize, const char *s1, const char *s2);
239 static void add(const char *word, enum entry entry, struct lexstat *sp,
240 int incr);
241 static enum okay scan(struct message *m, enum entry entry,
242 void (*func)(const char *, enum entry, struct lexstat *, int),
243 int arg);
244 static void recompute(void);
245 static float getprob(char *n);
246 static int insert(int *msgvec, enum entry entry, int incr);
247 static void clsf(struct message *m);
248 static void rate(const char *word, enum entry entry, struct lexstat *sp,
249 int unused);
250 static void dbhash(const char *word, unsigned long *h1, unsigned long *h2);
251 static void mkmangle(void);
253 static enum okay
254 getdb(int rw)
256 void *zp = NULL;
257 long n;
258 int compressed;
260 chained_tokens = value("chained-junk-tokens") != NULL;
261 if ((sfp = dbfp(SUPER, rw, &compressed, &sname)) == (FILE *)-1)
262 return STOP;
263 if (sfp && !compressed) {
264 super = mmap(NULL, SIZEOF_super,
265 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
266 MAP_SHARED, fileno(sfp), 0);
267 if (super != MAP_FAILED) {
268 super_mmapped = SIZEOF_super;
269 goto skip;
272 super_mmapped = 0;
273 super = smalloc(SIZEOF_super);
274 if (sfp) {
275 if (compressed)
276 zp = zalloc(sfp);
277 if ((compressed ? zread(zp, super, SIZEOF_super)
278 != SIZEOF_super :
279 fread(super, 1, SIZEOF_super, sfp)
280 != SIZEOF_super) ||
281 ferror(sfp)) {
282 fprintf(stderr, "Error reading junk mail database.\n");
283 memset(super, 0, SIZEOF_super);
284 mkmangle();
285 if (compressed)
286 zfree(zp);
287 Fclose(sfp);
288 sfp = NULL;
289 } else if (compressed)
290 zfree(zp);
291 } else {
292 memset(super, 0, SIZEOF_super);
293 mkmangle();
295 skip: if ((n = getn(&super[OF_super_size])) == 0) {
296 n = 1;
297 putn(&super[OF_super_size], 1);
299 if (sfp && (nfp = dbfp(NODES, rw, &compressed, &nname)) != NULL) {
300 if (nfp == (FILE *)-1) {
301 relsedb();
302 return STOP;
305 rw_map = rw;
306 if (sfp && nfp && !compressed) {
307 nodes = mmap(NULL, n * SIZEOF_node,
308 rw!=O_RDONLY ? PROT_READ|PROT_WRITE : PROT_READ,
309 MAP_SHARED, fileno(nfp), 0);
310 if (nodes != MAP_FAILED) {
311 nodes_mmapped = n * SIZEOF_node;
312 return OKAY;
315 nodes_mmapped = 0;
316 nodes = smalloc(n * SIZEOF_node);
317 if (sfp && nfp) {
318 if (compressed)
319 zp = zalloc(nfp);
320 if ((compressed ? zread(zp, nodes, n * SIZEOF_node)
321 != n * SIZEOF_node :
322 fread(nodes, 1, n * SIZEOF_node, nfp)
323 != (unsigned long)n * SIZEOF_node) ||
324 ferror(nfp)) {
325 fprintf(stderr, "Error reading junk mail database.\n");
326 memset(nodes, 0, n * SIZEOF_node);
327 memset(super, 0, SIZEOF_super);
328 mkmangle();
329 putn(&super[OF_super_size], n);
331 if (compressed)
332 zfree(zp);
333 Fclose(nfp);
334 nfp = NULL;
335 } else
336 memset(nodes, 0, n * SIZEOF_node);
337 if (sfp) {
338 Fclose(sfp);
339 sfp = NULL;
341 return OKAY;
344 static void
345 putdb(void)
347 void *zp;
348 int scomp, ncomp;
350 if ((! super_mmapped && (sfp = dbfp(SUPER, O_WRONLY, &scomp, &sname))
351 == NULL) || sfp == (FILE *)-1)
352 return;
353 if ((! nodes_mmapped && (nfp = dbfp(NODES, O_WRONLY, &ncomp, &nname))
354 == NULL) || nfp == (FILE *)-1)
355 return;
356 if (super_mmapped == 0 || nodes_mmapped == 0)
357 holdint();
359 * Use utime() with mmap() since Linux does not update st_mtime
360 * reliably otherwise.
362 if (super_mmapped)
363 utime(sname, NULL);
364 else if (scomp) {
365 zp = zalloc(sfp);
366 zwrite(zp, super, SIZEOF_super);
367 zfree(zp);
368 trunc(sfp);
369 } else
370 fwrite(super, 1, SIZEOF_super, sfp);
371 if (nodes_mmapped)
372 utime(nname, NULL);
373 else if (ncomp) {
374 zp = zalloc(nfp);
375 zwrite(zp, nodes, getn(&super[OF_super_size]) * SIZEOF_node);
376 zfree(zp);
377 trunc(nfp);
378 } else
379 fwrite(nodes, 1,
380 getn(&super[OF_super_size]) * SIZEOF_node, nfp);
381 if (super_mmapped == 0 || nodes_mmapped == 0)
382 relseint();
385 static void
386 relsedb(void)
388 if (super_mmapped) {
389 munmap(super, super_mmapped);
390 super_mmapped = 0;
391 } else
392 free(super);
393 if (nodes_mmapped) {
394 munmap(nodes, nodes_mmapped);
395 nodes_mmapped = 0;
396 } else
397 free(nodes);
398 if (sfp && sfp != (FILE *)-1) {
399 Fclose(sfp);
400 sfp = NULL;
402 if (nfp && nfp != (FILE *)-1) {
403 Fclose(nfp);
404 nfp = NULL;
408 static FILE *
409 dbfp(enum db db, int rw, int *compressed, char **fn)
411 FILE *fp, *rp;
412 char *dir;
413 struct flock flp;
414 char *sfx[][2] = {
415 { "super", "nodes" },
416 { "super1", "nodes1" }
418 char **sf;
419 char *zfx[][2] = {
420 { "super.Z", "nodes.Z" },
421 { "super1.Z", "nodes1.Z" }
423 char **zf;
424 int n;
426 if ((dir = value("junkdb")) == NULL) {
427 fprintf(stderr, "No junk mail database specified. "
428 "Set the junkdb variable.\n");
429 return (FILE *)-1;
431 dir = expand(dir);
432 if (makedir(dir) == STOP) {
433 fprintf(stderr, "Cannot create directory \"%s\"\n.", dir);
434 return (FILE *)-1;
436 if (rw!=O_WRONLY)
437 table_version = current_table_version;
438 loop: sf = sfx[table_version];
439 zf = zfx[table_version];
440 *fn = salloc((n = strlen(dir)) + 40);
441 strcpy(*fn, dir);
442 (*fn)[n] = '/';
443 *compressed = 0;
444 strcpy(&(*fn)[n+1], sf[db]);
445 if ((fp = Fopen(*fn, rw!=O_RDONLY ? "r+" : "r")) != NULL)
446 goto okay;
447 *compressed = 1;
448 strcpy(&(*fn)[n+1], zf[db]);
449 if ((fp = Fopen(*fn, rw ? "r+" : "r")) == NULL &&
450 rw==O_WRONLY ? (fp = Fopen(*fn, "w+")) == NULL : 0) {
451 fprintf(stderr, "Cannot open junk mail database \"%s\".\n",*fn);
452 return NULL;
454 if (rw==O_WRONLY) {
455 strcpy(&(*fn)[n+1], "README");
456 if (access(*fn, F_OK) < 0 && (rp = Fopen(*fn, "w")) != NULL) {
457 fputs(README1, rp);
458 fputs(README2, rp);
459 Fclose(rp);
461 } else if (fp == NULL) {
462 if (table_version > 0) {
463 table_version--;
464 goto loop;
465 } else
466 table_version = current_table_version;
468 okay: if (fp) {
469 flp.l_type = rw!=O_RDONLY ? F_WRLCK : F_RDLCK;
470 flp.l_start = 0;
471 flp.l_len = 0;
472 flp.l_whence = SEEK_SET;
473 fcntl(fileno(fp), F_SETLKW, &flp);
475 return fp;
478 static char *
479 lookup(unsigned long h1, unsigned long h2, int create)
481 char *n, *lastn = NULL;
482 unsigned long c, lastc = MAX4, used, size;
484 used = getn(&super[OF_super_used]);
485 size = getn(&super[OF_super_size]);
486 c = ~getn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry]);
487 n = &nodes[c*SIZEOF_node];
488 while (c < used) {
489 if (getn(&n[OF_node_hash]) == h1 &&
490 (table_version < 1 ? 1 :
491 get(&n[OF_node_hash2]) == h2))
492 return n;
493 lastc = c;
494 lastn = n;
495 c = ~getn(&n[OF_node_next]);
496 n = &nodes[c*SIZEOF_node];
498 if (create) {
499 if (used >= size) {
500 if ((size = grow(size)) == 0)
501 return NULL;
502 lastn = &nodes[lastc*SIZEOF_node];
504 putn(&super[OF_super_used], used+1);
505 n = &nodes[used*SIZEOF_node];
506 putn(&n[OF_node_hash], h1);
507 put(&n[OF_node_hash2], h2);
508 if (lastc < used)
509 putn(&lastn[OF_node_next], ~used);
510 else
511 putn(&super[OF_super_bucket + (h1&MAX2)*SIZEOF_entry],
512 ~used);
513 return n;
514 } else
515 return NULL;
518 static unsigned long
519 grow(unsigned long size)
521 unsigned long incr, newsize;
522 void *onodes;
524 incr = size > MAX2 ? MAX2 : size;
525 newsize = size + incr;
526 if (newsize > MAX4-MAX2) {
527 oflo: fprintf(stderr, "Junk mail database overflow.\n");
528 return 0;
530 if (nodes_mmapped) {
531 if (lseek(fileno(nfp), newsize*SIZEOF_node-1, SEEK_SET)
532 == (off_t)-1 || write(fileno(nfp),"\0",1) != 1)
533 goto oflo;
534 onodes = nodes;
535 if ((nodes = mremap(nodes, nodes_mmapped, newsize*SIZEOF_node,
536 MREMAP_MAYMOVE)) == MAP_FAILED) {
537 if ((nodes = mmap(NULL, newsize*SIZEOF_node,
538 rw_map!=O_RDONLY ?
539 PROT_READ|PROT_WRITE :
540 PROT_READ,
541 MAP_SHARED, fileno(nfp), 0))
542 == MAP_FAILED) {
543 nodes = onodes;
544 goto oflo;
546 munmap(onodes, nodes_mmapped);
548 nodes_mmapped = newsize*SIZEOF_node;
549 } else {
550 nodes = srealloc(nodes, newsize*SIZEOF_node);
551 memset(&nodes[size*SIZEOF_node], 0, incr*SIZEOF_node);
553 size = newsize;
554 putn(&super[OF_super_size], size);
555 return size;
558 #define SAVE(c) { \
559 if (i+j >= (long)*bufsize-4) \
560 *buf = srealloc(*buf, *bufsize += 32); \
561 (*buf)[j+i] = (c); \
562 i += (*buf)[j+i] != '\0'; \
565 static char *
566 nextword(char **buf, size_t *bufsize, size_t *count, FILE *fp,
567 struct lexstat *sp, int *stop)
569 int c, i, j, k;
570 char *cp, *cq;
572 loop: *stop = 0;
573 sp->hadamp = 0;
574 if (sp->save) {
575 i = j = 0;
576 for (cp = sp->save; *cp; cp++) {
577 SAVE(*cp&0377)
579 SAVE('\0')
580 free(sp->save);
581 sp->save = NULL;
582 goto out;
584 if (sp->loc == FROM_LINE)
585 while (*count > 0 && (c = getc(fp)) != EOF) {
586 sp->lastc = c;
587 if (c == '\n') {
588 sp->loc = HEADER;
589 break;
592 i = 0;
593 j = 0;
594 if (sp->loc == HEADER && sp->field[0]) {
595 field: cp = sp->field;
596 do {
597 c = *cp&0377;
598 SAVE(c)
599 cp++;
600 } while (*cp);
601 j = i;
602 i = 0;
604 if (sp->price) {
605 sp->price = 0;
606 SAVE('$')
608 while (*count > 0 && (c = getc(fp)) != EOF) {
609 (*count)--;
610 if (c == '\0' && table_version >= 1) {
611 sp->loc = HEADER;
612 sp->lastc = '\n';
613 *stop = 1;
614 continue;
616 if (c == '\b' && table_version >= 1) {
617 sp->html = HTML_TEXT;
618 continue;
620 if (c == '<' && sp->html == HTML_TEXT) {
621 sp->html = HTML_TAG;
622 sp->tagp = sp->tag;
623 continue;
625 if (sp->html == HTML_TAG) {
626 if (spacechar(c)) {
627 *sp->tagp = '\0';
628 if (!asccasecmp(sp->tag, "a") ||
629 !asccasecmp(sp->tag, "img") ||
630 !asccasecmp(sp->tag, "font") ||
631 !asccasecmp(sp->tag, "span") ||
632 !asccasecmp(sp->tag, "meta") ||
633 !asccasecmp(sp->tag, "table") ||
634 !asccasecmp(sp->tag, "tr") ||
635 !asccasecmp(sp->tag, "td") ||
636 !asccasecmp(sp->tag, "p"))
637 sp->html = HTML_TEXT;
638 else
639 sp->html = HTML_SKIP;
640 } else if (c == '>') {
641 sp->html = HTML_TEXT;
642 continue;
643 } else {
644 if ((size_t)(sp->tagp - sp->tag) <
645 sizeof sp->tag - 1)
646 *sp->tagp++ = c;
647 continue;
650 if (sp->html == HTML_SKIP) {
651 if (c == '>')
652 sp->html = HTML_TEXT;
653 continue;
655 if (c == '$' && i == 0)
656 sp->price = 1;
657 if (sp->loc == HEADER && sp->lastc == '\n') {
658 if (!spacechar(c)) {
659 k = 0;
660 while (k < (int)sizeof sp->field - 3) {
661 sp->field[k++] = c;
662 if (*count <= 0 ||
663 (c = getc(fp)) == EOF)
664 break;
665 if (spacechar(c) || c == ':') {
666 ungetc(c, fp);
667 break;
669 sp->lastc = c;
670 (*count)--;
672 sp->field[k++] = '*';
673 sp->field[k] = '\0';
674 j = 0;
675 *stop = 1;
676 goto field;
677 } else if (c == '\n') {
678 j = 0;
679 sp->loc = BODY;
680 sp->html = HTML_NONE;
681 *stop = 1;
684 if (sp->url) {
685 if (!url_xchar(c)) {
686 sp->url = 0;
687 cp = sp->save = smalloc(i+6);
688 for (cq = "HOST*"; *cq; cq++)
689 *cp++ = *cq;
690 for (cq = &(*buf)[j]; *cq != ':'; cq++);
691 cq += 3; /* skip "://" */
692 while (cq < &(*buf)[i+j] &&
693 (alnumchar(*cq&0377) ||
694 *cq == '.' || *cq == '-'))
695 *cp++ = *cq++;
696 *cp = '\0';
697 *stop = 1;
698 break;
700 SAVE(c)
701 } else if (constituent(c, *buf, i+j, sp->price, sp->hadamp) ||
702 (sp->loc == HEADER && c == '.' &&
703 asccasecmp(sp->field, "subject*"))) {
704 if (c == '&')
705 sp->hadamp = 1;
706 SAVE(c)
707 } else if (i > 0 && c == ':' && *count > 2) {
708 if ((c = getc(fp)) != '/') {
709 ungetc(c, fp);
710 break;
712 (*count)--;
713 if ((c = getc(fp)) != '/') {
714 ungetc(c, fp);
715 break;
717 (*count)--;
718 sp->url = 1;
719 SAVE('\0')
720 cp = savestr(*buf);
721 j = i = 0;
722 for (cq = "URL*"; *cq; cq++) {
723 SAVE(*cq&0377)
725 j = i;
726 i = 0;
727 do {
728 if (alnumchar(*cp&0377)) {
729 SAVE(*cp&0377)
730 } else
731 i = 0;
732 } while (*++cp);
733 for (cq = "://"; *cq; cq++) {
734 SAVE(*cq&0377)
736 } else if (i > 1 && ((*buf)[i+j-1] == ',' ||
737 (*buf)[i+j-1] == '.') && !digitchar(c)) {
738 i--;
739 ungetc(c, fp);
740 (*count)++;
741 break;
742 } else if (i > 0) {
743 sp->lastc = c;
744 break;
746 sp->lastc = c;
748 out: if (i > 0) {
749 SAVE('\0')
750 c = 0;
751 for (k = 0; k < i; k++)
752 if (digitchar((*buf)[k+j]&0377))
753 c++;
754 else if (!alphachar((*buf)[k+j]&0377) &&
755 (*buf)[k+j] != '$') {
756 c = 0;
757 break;
759 if (c == i)
760 goto loop;
762 * Including the results of other filtering software (the
763 * 'X-Spam' fields) might seem tempting, but will also rate
764 * their false negatives good with this filter. Therefore
765 * these fields are ignored.
767 * Handling 'Received' fields is difficult since they include
768 * lots of both useless and interesting words for our purposes.
770 if (sp->loc == HEADER &&
771 (asccasecmp(sp->field, "message-id*") == 0 ||
772 asccasecmp(sp->field, "references*") == 0 ||
773 asccasecmp(sp->field, "in-reply-to*") == 0 ||
774 asccasecmp(sp->field, "status*") == 0 ||
775 asccasecmp(sp->field, "x-status*") == 0 ||
776 asccasecmp(sp->field, "date*") == 0 ||
777 asccasecmp(sp->field, "delivery-date*") == 0 ||
778 ascncasecmp(sp->field, "x-spam", 6) == 0 ||
779 ascncasecmp(sp->field, "x-pstn", 6) == 0 ||
780 ascncasecmp(sp->field, "x-scanned", 9) == 0 ||
781 (asccasecmp(sp->field, "received*") == 0 &&
782 (((2*c > i) || i < 4 ||
783 asccasestr(*buf, "localhost")!=NULL)))))
784 goto loop;
785 return *buf;
787 return NULL;
790 #define JOINCHECK if ((size_t)i >= *bufsize) \
791 *buf = srealloc(*buf, *bufsize += 32)
792 static void
793 join(char **buf, size_t *bufsize, const char *s1, const char *s2)
795 int i = 0;
797 while (*s1) {
798 JOINCHECK;
799 (*buf)[i++] = *s1++;
801 JOINCHECK;
802 (*buf)[i++] = ' ';
803 do {
804 JOINCHECK;
805 (*buf)[i++] = *s2;
806 } while (*s2++);
809 /*ARGSUSED3*/
810 static void
811 add(const char *word, enum entry entry, struct lexstat *sp, int incr)
813 unsigned c;
814 unsigned long h1, h2;
815 char *n;
816 (void)sp;
818 dbhash(word, &h1, &h2);
819 if ((n = lookup(h1, h2, 1)) != NULL) {
820 switch (entry) {
821 case GOOD:
822 c = get(&n[OF_node_good]);
823 if ((incr > 0 && c < (unsigned)MAX3 - incr) ||
824 (incr < 0 && c >= (unsigned)-incr)) {
825 c += incr;
826 put(&n[OF_node_good], c);
828 break;
829 case BAD:
830 c = get(&n[OF_node_bad]);
831 if ((incr > 0 && c < (unsigned)MAX3 - incr) ||
832 (incr < 0 && c >= (unsigned)-incr)) {
833 c += incr;
834 put(&n[OF_node_bad], c);
836 break;
841 static enum okay
842 scan(struct message *m, enum entry entry,
843 void (*func)(const char *, enum entry, struct lexstat *, int),
844 int arg)
846 FILE *fp;
847 char *buf0 = NULL, *buf1 = NULL, *buf2 = NULL, **bp, *cp;
848 size_t bufsize0 = 0, bufsize1 = 0, bufsize2 = 0, *zp, count;
849 struct lexstat *sp;
850 int stop;
852 if ((fp = Ftemp(&cp, "Ra", "w+", 0600, 1)) == NULL) {
853 perror("tempfile");
854 return STOP;
856 rm(cp);
857 Ftfree(&cp);
858 if (send(m, fp, NULL, NULL, SEND_TOFLTR, NULL) < 0) {
859 Fclose(fp);
860 return STOP;
862 fflush(fp);
863 rewind(fp);
864 sp = scalloc(1, sizeof *sp);
865 count = fsize(fp);
866 bp = &buf0;
867 zp = &bufsize0;
868 while (nextword(bp, zp, &count, fp, sp, &stop) != NULL) {
869 (*func)(*bp, entry, sp, arg);
870 if (chained_tokens && buf0 && *buf0 && buf1 && *buf1 && !stop) {
871 join(&buf2, &bufsize2, bp == &buf1 ? buf0 : buf1, *bp);
872 (*func)(buf2, entry, sp, arg);
874 bp = bp == &buf1 ? &buf0 : &buf1;
875 zp = zp == &bufsize1 ? &bufsize0 : &bufsize1;
877 free(buf0);
878 free(buf1);
879 free(buf2);
880 free(sp);
881 Fclose(fp);
882 return OKAY;
885 static void
886 recompute(void)
888 unsigned long used, i;
889 unsigned s;
890 char *n;
891 float p;
893 used = getn(&super[OF_super_used]);
894 for (i = 0; i < used; i++) {
895 n = &nodes[i*SIZEOF_node];
896 p = getprob(n);
897 s = f2s(p);
898 put(&n[OF_node_prob_O], s);
902 static float
903 getprob(char *n)
905 unsigned long ngood, nbad;
906 unsigned g, b;
907 float p, BOT, TOP;
909 ngood = getn(&super[OF_super_ngood]);
910 nbad = getn(&super[OF_super_nbad]);
911 if (ngood + nbad >= 18000) {
912 BOT = .0001;
913 TOP = .9999;
914 } else if (ngood + nbad >= 9000) {
915 BOT = .001;
916 TOP = .999;
917 } else {
918 BOT = .01;
919 TOP = .99;
921 g = get(&n[OF_node_good]) * 2;
922 b = get(&n[OF_node_bad]);
923 if (g + b >= 5) {
924 p = smin(1.0, nbad ? (float)b/nbad : 0.0) /
925 (smin(1.0, ngood ? (float)g/ngood : 0.0) +
926 smin(1.0, nbad ? (float)b/nbad : 0.0));
927 p = smin(TOP, p);
928 p = smax(BOT, p);
929 if (p == TOP && b <= 10 && g == 0)
930 p -= BOT;
931 else if (p == BOT && g <= 10 && b == 0)
932 p += BOT;
933 } else if (g == 0 && b == 0)
934 p = DFL;
935 else
936 p = 0;
937 return p;
940 static int
941 insert(int *msgvec, enum entry entry, int incr)
943 int *ip;
944 unsigned long u = 0;
946 verbose = value("verbose") != NULL;
947 if (getdb(O_RDWR) != OKAY)
948 return 1;
949 switch (entry) {
950 case GOOD:
951 u = getn(&super[OF_super_ngood]);
952 break;
953 case BAD:
954 u = getn(&super[OF_super_nbad]);
955 break;
957 for (ip = msgvec; *ip; ip++) {
958 setdot(&message[*ip-1]);
959 if (incr > 0 && u == MAX4-incr+1) {
960 fprintf(stderr, "Junk mail database overflow.\n");
961 break;
962 } else if (incr < 0 && (unsigned long)-incr > u) {
963 fprintf(stderr, "Junk mail database underflow.\n");
964 break;
966 u += incr;
967 if ((entry == GOOD && incr > 0) || (entry == BAD && incr < 0))
968 message[*ip-1].m_flag &= ~MJUNK;
969 else
970 message[*ip-1].m_flag |= MJUNK;
971 scan(&message[*ip-1], entry, add, incr);
973 switch (entry) {
974 case GOOD:
975 putn(&super[OF_super_ngood], u);
976 break;
977 case BAD:
978 putn(&super[OF_super_nbad], u);
979 break;
981 if (table_version < 1)
982 recompute();
983 putdb();
984 relsedb();
985 return 0;
989 cgood(void *v)
991 return insert(v, GOOD, 1);
995 cjunk(void *v)
997 return insert(v, BAD, 1);
1001 cungood(void *v)
1003 return insert(v, GOOD, -1);
1007 cunjunk(void *v)
1009 return insert(v, BAD, -1);
1012 int
1013 cclassify(void *v)
1015 int *msgvec = v, *ip;
1017 verbose = value("verbose") != NULL;
1018 _debug = debug || value("debug") != NULL;
1019 if (getdb(O_RDONLY) != OKAY)
1020 return 1;
1021 for (ip = msgvec; *ip; ip++) {
1022 setdot(&message[*ip-1]);
1023 clsf(&message[*ip-1]);
1025 relsedb();
1026 return 0;
1029 #define BEST 15
1030 static struct {
1031 float dist;
1032 float prob;
1033 char *word;
1034 unsigned long hash1;
1035 unsigned long hash2;
1036 enum loc loc;
1037 } best[BEST];
1039 static void
1040 clsf(struct message *m)
1042 int i;
1043 float a = 1, b = 1, r;
1045 if (verbose)
1046 fprintf(stderr, "Examining message %d\n",
1047 (int)(m - &message[0] + 1));
1048 for (i = 0; i < BEST; i++) {
1049 best[i].dist = 0;
1050 best[i].prob = -1;
1052 if (scan(m, -1, rate, 0) != OKAY)
1053 return;
1054 if (best[0].prob == -1) {
1055 if (verbose)
1056 fprintf(stderr, "No information found.\n");
1057 m->m_flag &= ~MJUNK;
1058 return;
1060 for (i = 0; i < BEST; i++) {
1061 if (best[i].prob == -1)
1062 break;
1063 if (verbose)
1064 fprintf(stderr, "Probe %2d: \"%s\", hash=%lu:%lu "
1065 "prob=%.4g dist=%.4g\n",
1066 i+1, prstr(best[i].word),
1067 best[i].hash1, best[i].hash2,
1068 best[i].prob, best[i].dist);
1069 a *= best[i].prob;
1070 b *= 1 - best[i].prob;
1072 r = a+b > 0 ? a / (a+b) : 0;
1073 if (verbose)
1074 fprintf(stderr, "Junk probability of message %d: %g\n",
1075 (int)(m - &message[0] + 1), r);
1076 if (r > THR)
1077 m->m_flag |= MJUNK;
1078 else
1079 m->m_flag &= ~MJUNK;
1082 /*ARGSUSED4*/
1083 static void
1084 rate(const char *word, enum entry entry, struct lexstat *sp, int unused)
1086 char *n;
1087 unsigned long h1, h2;
1088 float p, d;
1089 int i, j;
1090 (void)entry;
1091 (void)unused;
1093 dbhash(word, &h1, &h2);
1094 if ((n = lookup(h1, h2, 0)) != NULL) {
1095 p = getprob(n);
1096 } else
1097 p = DFL;
1098 if (_debug)
1099 fprintf(stderr, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1, h2,
1100 n ? get(&n[OF_node_good]) : 0,
1101 n ? get(&n[OF_node_bad]) : 0,
1102 p, prstr(word));
1103 if (p == 0)
1104 return;
1105 d = p >= MID ? p - MID : MID - p;
1106 if (d >= best[BEST-1].dist)
1107 for (i = 0; i < BEST; i++) {
1108 if (h1 == best[i].hash1 && h2 == best[i].hash2)
1109 break;
1111 * For equal distance, this selection prefers
1112 * words with a low probability, since a false
1113 * negative is better than a false positive,
1114 * and since experience has shown that false
1115 * positives are more likely otherwise. Then,
1116 * words from the end of the header and from
1117 * the start of the body are preferred. This
1118 * gives the most interesting verbose output.
1120 if (d > best[i].dist ||
1121 d == (best[i].dist &&
1122 p < best[i].prob) ||
1123 (best[i].loc == HEADER &&
1124 d == best[i].dist)) {
1125 for (j = BEST-2; j >= i; j--)
1126 best[j+1] = best[j];
1127 best[i].dist = d;
1128 best[i].prob = p;
1129 best[i].word = savestr(word);
1130 best[i].hash1 = h1;
1131 best[i].hash2 = h2;
1132 best[i].loc = sp->loc;
1133 break;
1138 static void
1139 dbhash(const char *word, unsigned long *h1, unsigned long *h2)
1141 unsigned char digest[16];
1142 MD5_CTX ctx;
1144 MD5Init(&ctx);
1145 MD5Update(&ctx, (unsigned char *)word, strlen(word));
1146 if (table_version >= 1)
1147 MD5Update(&ctx, (unsigned char *)&super[OF_super_mangle], 4);
1148 MD5Final(digest, &ctx);
1149 *h1 = getn(digest);
1150 if (table_version < 1) {
1151 *h1 ^= getn(&super[OF_super_mangle]);
1152 *h2 = 0;
1153 } else
1154 *h2 = get(&digest[4]);
1158 * The selection of the value for mangling is not critical. It is practically
1159 * impossible for any person to determine the exact time when the database
1160 * was created first (without looking at the database, which would reveal the
1161 * value anyway), so we just use this. The MD5 hash here ensures that each
1162 * single second gives a completely different mangling value (which is not
1163 * necessary anymore if table_version>=1, but does not hurt).
1165 static void
1166 mkmangle(void)
1168 union {
1169 time_t t;
1170 char c[16];
1171 } u;
1172 unsigned long s;
1173 unsigned char digest[16];
1174 MD5_CTX ctx;
1176 memset(&u, 0, sizeof u);
1177 time(&u.t);
1178 MD5Init(&ctx);
1179 MD5Update(&ctx, (unsigned char *)u.c, sizeof u.c);
1180 MD5Final(digest, &ctx);
1181 s = getn(digest);
1182 putn(&super[OF_super_mangle], s);
1186 cprobability(void *v)
1188 char **args = v;
1189 unsigned long used, ngood, nbad;
1190 unsigned long h1, h2;
1191 unsigned g, b;
1192 float p, d;
1193 char *n;
1195 if (*args == NULL) {
1196 fprintf(stderr, "No words given.\n");
1197 return 1;
1199 if (getdb(O_RDONLY) != OKAY)
1200 return 1;
1201 used = getn(&super[OF_super_used]);
1202 ngood = getn(&super[OF_super_ngood]);
1203 nbad = getn(&super[OF_super_nbad]);
1204 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1205 used, ngood, nbad);
1206 do {
1207 dbhash(*args, &h1, &h2);
1208 printf("\"%s\", hash=%lu:%lu ", *args, h1, h2);
1209 if ((n = lookup(h1, h2, 0)) != NULL) {
1210 g = get(&n[OF_node_good]);
1211 b = get(&n[OF_node_bad]);
1212 printf("good=%u bad=%u ", g, b);
1213 p = getprob(n);
1214 if (p != 0) {
1215 d = p >= MID ? p - MID : MID - p;
1216 printf("prob=%.4g dist=%.4g", p, d);
1217 } else
1218 printf("too infrequent");
1219 } else
1220 printf("not in database");
1221 putchar('\n');
1222 } while (*++args);
1223 relsedb();
1224 return 0;
1227 #else /* !USE_JUNK */
1229 static int
1230 nojunk(void)
1232 fputs(catgets(catd, CATSET, 270, "No JUNK support compiled in.\n"),
1233 stderr);
1234 return (1);
1238 cgood(void *v)
1240 (void)v;
1241 return nojunk();
1245 cjunk(void *v)
1247 (void)v;
1248 return nojunk();
1252 cungood(void *v)
1254 (void)v;
1255 return nojunk();
1259 cunjunk(void *v)
1261 (void)v;
1262 return nojunk();
1266 cclassify(void *v)
1268 (void)v;
1269 return nojunk();
1273 cprobability(void *v)
1275 (void)v;
1276 return nojunk();
1279 #endif /* USE_JUNK */