2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
8 * Gunnar Ritter. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by Gunnar Ritter
21 * and his contributors.
22 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 static char sccsid
[] = "@(#)junk.c 1.75 (gritter) 9/14/08";
57 #else /* !HAVE_MMAP */
58 #define mmap(a, b, c, d, e, f) MAP_FAILED
60 #endif /* !HAVE_MMAP */
62 #define mremap(a, b, c, d) MAP_FAILED
63 #endif /* !HAVE_MREMAP */
66 #define MAP_FAILED ((void *)-1)
67 #endif /* !MAP_FAILED */
73 * Mail -- a mail program
75 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
76 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
77 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
79 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
80 * Classification using Chained Tokens", February 2004,
81 * <http://www.nuclearelephant.com/papers/chained.html>.
88 #define MAX2 0x0000ffff
89 #define MAX3 0x00ffffffUL
90 #define MAX4 0xffffffffUL
93 * The dictionary consists of two files forming a hash table. The hash
94 * consists of the first 56 bits of the result of applying MD5 to the
95 * input word. This scheme ensures that collisions are unlikely enough
96 * to make junk detection work; according to the birthday paradox, a
97 * 50 % probability for one single collision is reached at 2^28 entries.
99 * To make the chain structure independent from input, the MD5 input is
100 * xor'ed with a random number. This makes it impossible that someone uses
101 * a carefully crafted message for a denial-of-service attack against the
104 #define SIZEOF_node 17
105 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
106 #define OF_node_next 4 /* bit-negated table index of next node */
107 #define OF_node_good 8 /* number of times this appeared in good msgs */
108 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
109 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
110 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
113 #define SIZEOF_super 262164
114 #define OF_super_size 0 /* allocated nodes in the chain file */
115 #define OF_super_used 4 /* used nodes in the chain file */
116 #define OF_super_ngood 8 /* number of good messages scanned so far */
117 #define OF_super_nbad 12 /* number of bad messages scanned so far */
118 #define OF_super_mangle 16 /* used to mangle the MD5 input */
119 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
120 #define SIZEOF_entry 4
123 static size_t super_mmapped
;
124 static size_t nodes_mmapped
;
126 static int chained_tokens
;
132 * 1 Fixed the mangling; it was ineffective in version 0.
133 * Hash extended to 56 bits.
135 static int table_version
;
136 #define current_table_version 1
139 ((unsigned)(((char *)(e))[0]&0377) + \
140 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
141 ((unsigned)(((char *)(e))[2]&0377) << 16))
144 (((char *)(e))[0] = (n) & 0x0000ff, \
145 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
146 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
148 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
150 #define s2f(s) ((float)(s) / MAX3)
153 ((unsigned long)(((char *)(p))[0]&0377) + \
154 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
155 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
156 ((unsigned long)(((char *)(p))[3]&0377) << 24))
159 (((char *)(p))[0] = (n) & 0x000000ffUL, \
160 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
161 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
162 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
183 char field
[LINESIZE
];
186 #define constituent(c, b, i, price, hadamp) \
187 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
188 (c) == '$' || (c) == '!' || (c) == '_' || \
189 (c) == '#' || (c) == '%' || (c) == '&' || \
190 ((c) == ';' && hadamp) || \
191 ((c) == '-' && !(price)) || \
192 (((c) == '.' || (c) == ',' || (c) == '/') && \
193 (i) > 0 && digitchar((b)[(i)-1]&0377)))
195 #define url_xchar(c) \
196 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
197 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
198 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
199 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
200 (c) != '#' && (c) != '"')
212 static const char README1
[] = "\
213 This is a junk mail database maintained by mailx(1). It does not contain any\n\
214 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
215 used for lookup. It is thus possible to tell if some given word was likely\n\
216 contained in your mail from examining this data, at best.\n";
217 static const char README2
[] = "\n\
218 The database files are stored in compress(1) format by default. This saves\n\
219 some space, but leads to higher processor usage when the database is read\n\
220 or updated. You can use uncompress(1) on these files if you prefer to store\n\
221 them in flat form.\n";
225 static FILE *sfp
, *nfp
;
226 static char *sname
, *nname
;
228 static enum okay
getdb(int rw
);
229 static void putdb(void);
230 static void relsedb(void);
231 static FILE *dbfp(enum db db
, int rw
, int *compressed
, char **fn
);
232 static char *lookup(unsigned long h1
, unsigned long h2
, int create
);
233 static unsigned long grow(unsigned long size
);
234 static char *nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
235 struct lexstat
*sp
, int *stop
);
236 static void join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
);
237 static void add(const char *word
, enum entry entry
, struct lexstat
*sp
,
239 static enum okay
scan(struct message
*m
, enum entry entry
,
240 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
242 static void recompute(void);
243 static float getprob(char *n
);
244 static int insert(int *msgvec
, enum entry entry
, int incr
);
245 static void clsf(struct message
*m
);
246 static void rate(const char *word
, enum entry entry
, struct lexstat
*sp
,
248 static void dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
);
249 static void mkmangle(void);
258 chained_tokens
= value("chained-junk-tokens") != NULL
;
259 if ((sfp
= dbfp(SUPER
, rw
, &compressed
, &sname
)) == (FILE *)-1)
261 if (sfp
&& !compressed
) {
262 super
= mmap(NULL
, SIZEOF_super
,
263 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
264 MAP_SHARED
, fileno(sfp
), 0);
265 if (super
!= MAP_FAILED
) {
266 super_mmapped
= SIZEOF_super
;
271 super
= smalloc(SIZEOF_super
);
275 if ((compressed
? zread(zp
, super
, SIZEOF_super
)
277 fread(super
, 1, SIZEOF_super
, sfp
)
280 fprintf(stderr
, "Error reading junk mail database.\n");
281 memset(super
, 0, SIZEOF_super
);
287 } else if (compressed
)
290 memset(super
, 0, SIZEOF_super
);
293 skip
: if ((n
= getn(&super
[OF_super_size
])) == 0) {
295 putn(&super
[OF_super_size
], 1);
297 if (sfp
&& (nfp
= dbfp(NODES
, rw
, &compressed
, &nname
)) != NULL
) {
298 if (nfp
== (FILE *)-1) {
304 if (sfp
&& nfp
&& !compressed
) {
305 nodes
= mmap(NULL
, n
* SIZEOF_node
,
306 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
307 MAP_SHARED
, fileno(nfp
), 0);
308 if (nodes
!= MAP_FAILED
) {
309 nodes_mmapped
= n
* SIZEOF_node
;
314 nodes
= smalloc(n
* SIZEOF_node
);
318 if ((compressed
? zread(zp
, nodes
, n
* SIZEOF_node
)
320 fread(nodes
, 1, n
* SIZEOF_node
, nfp
)
321 != n
* SIZEOF_node
) ||
323 fprintf(stderr
, "Error reading junk mail database.\n");
324 memset(nodes
, 0, n
* SIZEOF_node
);
325 memset(super
, 0, SIZEOF_super
);
327 putn(&super
[OF_super_size
], n
);
334 memset(nodes
, 0, n
* SIZEOF_node
);
348 if (!super_mmapped
&& (sfp
= dbfp(SUPER
, O_WRONLY
, &scomp
, &sname
))
349 == NULL
|| sfp
== (FILE *)-1)
351 if (!nodes_mmapped
&& (nfp
= dbfp(NODES
, O_WRONLY
, &ncomp
, &nname
))
352 == NULL
|| nfp
== (FILE *)-1)
354 if (super_mmapped
== 0 || nodes_mmapped
== 0)
357 * Use utime() with mmap() since Linux does not update st_mtime
358 * reliably otherwise.
364 zwrite(zp
, super
, SIZEOF_super
);
368 fwrite(super
, 1, SIZEOF_super
, sfp
);
373 zwrite(zp
, nodes
, getn(&super
[OF_super_size
]) * SIZEOF_node
);
378 getn(&super
[OF_super_size
]) * SIZEOF_node
, nfp
);
379 if (super_mmapped
== 0 || nodes_mmapped
== 0)
387 munmap(super
, super_mmapped
);
392 munmap(nodes
, nodes_mmapped
);
396 if (sfp
&& sfp
!= (FILE *)-1) {
400 if (nfp
&& nfp
!= (FILE *)-1) {
407 dbfp(enum db db
, int rw
, int *compressed
, char **fn
)
413 { "super", "nodes" },
414 { "super1", "nodes1" }
418 { "super.Z", "nodes.Z" },
419 { "super1.Z", "nodes1.Z" }
424 if ((dir
= value("junkdb")) == NULL
) {
425 fprintf(stderr
, "No junk mail database specified. "
426 "Set the junkdb variable.\n");
430 if (makedir(dir
) == STOP
) {
431 fprintf(stderr
, "Cannot create directory \"%s\"\n.", dir
);
435 table_version
= current_table_version
;
436 loop
: sf
= sfx
[table_version
];
437 zf
= zfx
[table_version
];
438 *fn
= salloc((n
= strlen(dir
)) + 40);
442 strcpy(&(*fn
)[n
+1], sf
[db
]);
443 if ((fp
= Fopen(*fn
, rw
!=O_RDONLY
? "r+" : "r")) != NULL
)
446 strcpy(&(*fn
)[n
+1], zf
[db
]);
447 if ((fp
= Fopen(*fn
, rw
? "r+" : "r")) == NULL
&&
448 rw
==O_WRONLY
? (fp
= Fopen(*fn
, "w+")) == NULL
: 0) {
449 fprintf(stderr
, "Cannot open junk mail database \"%s\".\n",*fn
);
453 strcpy(&(*fn
)[n
+1], "README");
454 if (access(*fn
, F_OK
) < 0 && (rp
= Fopen(*fn
, "w")) != NULL
) {
459 } else if (fp
== NULL
) {
460 if (table_version
> 0) {
464 table_version
= current_table_version
;
467 flp
.l_type
= rw
!=O_RDONLY
? F_WRLCK
: F_RDLCK
;
470 flp
.l_whence
= SEEK_SET
;
471 fcntl(fileno(fp
), F_SETLKW
, &flp
);
477 lookup(unsigned long h1
, unsigned long h2
, int create
)
479 char *n
, *lastn
= NULL
;
480 unsigned long c
, lastc
= MAX4
, used
, size
;
482 used
= getn(&super
[OF_super_used
]);
483 size
= getn(&super
[OF_super_size
]);
484 c
= ~getn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
]);
485 n
= &nodes
[c
*SIZEOF_node
];
487 if (getn(&n
[OF_node_hash
]) == h1
&&
488 (table_version
< 1 ? 1 :
489 get(&n
[OF_node_hash2
]) == h2
))
493 c
= ~getn(&n
[OF_node_next
]);
494 n
= &nodes
[c
*SIZEOF_node
];
498 if ((size
= grow(size
)) == 0)
500 lastn
= &nodes
[lastc
*SIZEOF_node
];
502 putn(&super
[OF_super_used
], used
+1);
503 n
= &nodes
[used
*SIZEOF_node
];
504 putn(&n
[OF_node_hash
], h1
);
505 put(&n
[OF_node_hash2
], h2
);
507 putn(&lastn
[OF_node_next
], ~used
);
509 putn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
],
517 grow(unsigned long size
)
519 unsigned long incr
, newsize
;
522 incr
= size
> MAX2
? MAX2
: size
;
523 newsize
= size
+ incr
;
524 if (newsize
> MAX4
-MAX2
) {
525 oflo
: fprintf(stderr
, "Junk mail database overflow.\n");
529 if (lseek(fileno(nfp
), newsize
*SIZEOF_node
-1, SEEK_SET
)
530 == (off_t
)-1 || write(fileno(nfp
),"\0",1) != 1)
533 if ((nodes
= mremap(nodes
, nodes_mmapped
, newsize
*SIZEOF_node
,
534 MREMAP_MAYMOVE
)) == MAP_FAILED
) {
535 if ((nodes
= mmap(NULL
, newsize
*SIZEOF_node
,
537 PROT_READ
|PROT_WRITE
:
539 MAP_SHARED
, fileno(nfp
), 0))
544 munmap(onodes
, nodes_mmapped
);
546 nodes_mmapped
= newsize
*SIZEOF_node
;
548 nodes
= srealloc(nodes
, newsize
*SIZEOF_node
);
549 memset(&nodes
[size
*SIZEOF_node
], 0, incr
*SIZEOF_node
);
552 putn(&super
[OF_super_size
], size
);
557 if (i+j >= (long)*bufsize-4) \
558 *buf = srealloc(*buf, *bufsize += 32); \
560 i += (*buf)[j+i] != '\0'; \
564 nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
565 struct lexstat
*sp
, int *stop
)
574 for (cp
= sp
->save
; *cp
; cp
++) {
582 if (sp
->loc
== FROM_LINE
)
583 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
592 if (sp
->loc
== HEADER
&& sp
->field
[0]) {
593 field
: cp
= sp
->field
;
606 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
608 if (c
== '\0' && table_version
>= 1) {
614 if (c
== '\b' && table_version
>= 1) {
615 sp
->html
= HTML_TEXT
;
618 if (c
== '<' && sp
->html
== HTML_TEXT
) {
623 if (sp
->html
== HTML_TAG
) {
626 if (!asccasecmp(sp
->tag
, "a") ||
627 !asccasecmp(sp
->tag
, "img") ||
628 !asccasecmp(sp
->tag
, "font") ||
629 !asccasecmp(sp
->tag
, "span") ||
630 !asccasecmp(sp
->tag
, "meta") ||
631 !asccasecmp(sp
->tag
, "table") ||
632 !asccasecmp(sp
->tag
, "tr") ||
633 !asccasecmp(sp
->tag
, "td") ||
634 !asccasecmp(sp
->tag
, "p"))
635 sp
->html
= HTML_TEXT
;
637 sp
->html
= HTML_SKIP
;
638 } else if (c
== '>') {
639 sp
->html
= HTML_TEXT
;
642 if (sp
->tagp
- sp
->tag
< sizeof sp
->tag
- 1)
647 if (sp
->html
== HTML_SKIP
) {
649 sp
->html
= HTML_TEXT
;
652 if (c
== '$' && i
== 0)
654 if (sp
->loc
== HEADER
&& sp
->lastc
== '\n') {
657 while (k
< sizeof sp
->field
- 3) {
660 (c
= getc(fp
)) == EOF
)
662 if (spacechar(c
) || c
== ':') {
669 sp
->field
[k
++] = '*';
674 } else if (c
== '\n') {
677 sp
->html
= HTML_NONE
;
684 cp
= sp
->save
= smalloc(i
+6);
685 for (cq
= "HOST*"; *cq
; cq
++)
687 for (cq
= &(*buf
)[j
]; *cq
!= ':'; cq
++);
688 cq
+= 3; /* skip "://" */
689 while (cq
< &(*buf
)[i
+j
] &&
690 (alnumchar(*cq
&0377) ||
691 *cq
== '.' || *cq
== '-'))
698 } else if (constituent(c
, *buf
, i
+j
, sp
->price
, sp
->hadamp
) ||
699 sp
->loc
== HEADER
&& c
== '.' &&
700 asccasecmp(sp
->field
, "subject*")) {
704 } else if (i
> 0 && c
== ':' && *count
> 2) {
705 if ((c
= getc(fp
)) != '/') {
710 if ((c
= getc(fp
)) != '/') {
719 for (cq
= "URL*"; *cq
; cq
++) {
725 if (alnumchar(*cp
&0377)) {
730 for (cq
= "://"; *cq
; cq
++) {
733 } else if (i
> 1 && ((*buf
)[i
+j
-1] == ',' ||
734 (*buf
)[i
+j
-1] == '.') && !digitchar(c
)) {
748 for (k
= 0; k
< i
; k
++)
749 if (digitchar((*buf
)[k
+j
]&0377))
751 else if (!alphachar((*buf
)[k
+j
]&0377) &&
752 (*buf
)[k
+j
] != '$') {
759 * Including the results of other filtering software (the
760 * 'X-Spam' fields) might seem tempting, but will also rate
761 * their false negatives good with this filter. Therefore
762 * these fields are ignored.
764 * Handling 'Received' fields is difficult since they include
765 * lots of both useless and interesting words for our purposes.
767 if (sp
->loc
== HEADER
&&
768 (asccasecmp(sp
->field
, "message-id*") == 0 ||
769 asccasecmp(sp
->field
, "references*") == 0 ||
770 asccasecmp(sp
->field
, "in-reply-to*") == 0 ||
771 asccasecmp(sp
->field
, "status*") == 0 ||
772 asccasecmp(sp
->field
, "x-status*") == 0 ||
773 asccasecmp(sp
->field
, "date*") == 0 ||
774 asccasecmp(sp
->field
, "delivery-date*") == 0 ||
775 ascncasecmp(sp
->field
, "x-spam", 6) == 0 ||
776 ascncasecmp(sp
->field
, "x-pstn", 6) == 0 ||
777 ascncasecmp(sp
->field
, "x-scanned", 9) == 0 ||
778 asccasecmp(sp
->field
, "received*") == 0 &&
779 ((2*c
> i
) || i
< 4 ||
780 asccasestr(*buf
, "localhost") != NULL
)))
787 #define JOINCHECK if (i >= *bufsize) \
788 *buf = srealloc(*buf, *bufsize += 32)
790 join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
)
808 add(const char *word
, enum entry entry
, struct lexstat
*sp
, int incr
)
811 unsigned long h1
, h2
;
814 dbhash(word
, &h1
, &h2
);
815 if ((n
= lookup(h1
, h2
, 1)) != NULL
) {
818 c
= get(&n
[OF_node_good
]);
819 if (incr
>0 && c
<MAX3
-incr
|| incr
<0 && c
>=-incr
) {
821 put(&n
[OF_node_good
], c
);
825 c
= get(&n
[OF_node_bad
]);
826 if (incr
>0 && c
<MAX3
-incr
|| incr
<0 && c
>=-incr
) {
828 put(&n
[OF_node_bad
], c
);
836 scan(struct message
*m
, enum entry entry
,
837 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
841 char *buf0
= NULL
, *buf1
= NULL
, *buf2
= NULL
, **bp
, *cp
;
842 size_t bufsize0
= 0, bufsize1
= 0, bufsize2
= 0, *zp
, count
;
846 if ((fp
= Ftemp(&cp
, "Ra", "w+", 0600, 1)) == NULL
) {
852 if (send(m
, fp
, NULL
, NULL
, SEND_TOFLTR
, NULL
) < 0) {
858 sp
= scalloc(1, sizeof *sp
);
862 while (nextword(bp
, zp
, &count
, fp
, sp
, &stop
) != NULL
) {
863 (*func
)(*bp
, entry
, sp
, arg
);
864 if (chained_tokens
&& buf0
&& *buf0
&& buf1
&& *buf1
&& !stop
) {
865 join(&buf2
, &bufsize2
, bp
== &buf1
? buf0
: buf1
, *bp
);
866 (*func
)(buf2
, entry
, sp
, arg
);
868 bp
= bp
== &buf1
? &buf0
: &buf1
;
869 zp
= zp
== &bufsize1
? &bufsize0
: &bufsize1
;
882 unsigned long used
, i
;
887 used
= getn(&super
[OF_super_used
]);
888 for (i
= 0; i
< used
; i
++) {
889 n
= &nodes
[i
*SIZEOF_node
];
892 put(&n
[OF_node_prob_O
], s
);
899 unsigned long ngood
, nbad
;
903 ngood
= getn(&super
[OF_super_ngood
]);
904 nbad
= getn(&super
[OF_super_nbad
]);
905 if (ngood
+ nbad
>= 18000) {
908 } else if (ngood
+ nbad
>= 9000) {
915 g
= get(&n
[OF_node_good
]) * 2;
916 b
= get(&n
[OF_node_bad
]);
918 p
= smin(1.0, nbad
? (float)b
/nbad
: 0.0) /
919 (smin(1.0, ngood
? (float)g
/ngood
: 0.0) +
920 smin(1.0, nbad
? (float)b
/nbad
: 0.0));
923 if (p
== TOP
&& b
<= 10 && g
== 0)
925 else if (p
== BOT
&& g
<= 10 && b
== 0)
927 } else if (g
== 0 && b
== 0)
935 insert(int *msgvec
, enum entry entry
, int incr
)
940 verbose
= value("verbose") != NULL
;
941 if (getdb(O_RDWR
) != OKAY
)
945 u
= getn(&super
[OF_super_ngood
]);
948 u
= getn(&super
[OF_super_nbad
]);
951 for (ip
= msgvec
; *ip
; ip
++) {
952 setdot(&message
[*ip
-1]);
953 if (incr
> 0 && u
== MAX4
-incr
+1) {
954 fprintf(stderr
, "Junk mail database overflow.\n");
956 } else if (incr
< 0 && -incr
> u
) {
957 fprintf(stderr
, "Junk mail database underflow.\n");
961 if (entry
== GOOD
&& incr
> 0 || entry
== BAD
&& incr
< 0)
962 message
[*ip
-1].m_flag
&= ~MJUNK
;
964 message
[*ip
-1].m_flag
|= MJUNK
;
965 scan(&message
[*ip
-1], entry
, add
, incr
);
969 putn(&super
[OF_super_ngood
], u
);
972 putn(&super
[OF_super_nbad
], u
);
975 if (table_version
< 1)
985 return insert(v
, GOOD
, 1);
991 return insert(v
, BAD
, 1);
997 return insert(v
, GOOD
, -1);
1003 return insert(v
, BAD
, -1);
1009 int *msgvec
= v
, *ip
;
1011 verbose
= value("verbose") != NULL
;
1012 _debug
= debug
|| value("debug") != NULL
;
1013 if (getdb(O_RDONLY
) != OKAY
)
1015 for (ip
= msgvec
; *ip
; ip
++) {
1016 setdot(&message
[*ip
-1]);
1017 clsf(&message
[*ip
-1]);
1028 unsigned long hash1
;
1029 unsigned long hash2
;
1034 clsf(struct message
*m
)
1037 float a
= 1, b
= 1, r
;
1040 fprintf(stderr
, "Examining message %d\n",
1041 (int)(m
- &message
[0] + 1));
1042 for (i
= 0; i
< BEST
; i
++) {
1046 if (scan(m
, -1, rate
, 0) != OKAY
)
1048 if (best
[0].prob
== -1) {
1050 fprintf(stderr
, "No information found.\n");
1051 m
->m_flag
&= ~MJUNK
;
1054 for (i
= 0; i
< BEST
; i
++) {
1055 if (best
[i
].prob
== -1)
1058 fprintf(stderr
, "Probe %2d: \"%s\", hash=%lu:%lu "
1059 "prob=%.4g dist=%.4g\n",
1060 i
+1, prstr(best
[i
].word
),
1061 best
[i
].hash1
, best
[i
].hash2
,
1062 best
[i
].prob
, best
[i
].dist
);
1064 b
*= 1 - best
[i
].prob
;
1066 r
= a
+b
> 0 ? a
/ (a
+b
) : 0;
1068 fprintf(stderr
, "Junk probability of message %d: %g\n",
1069 (int)(m
- &message
[0] + 1), r
);
1073 m
->m_flag
&= ~MJUNK
;
1078 rate(const char *word
, enum entry entry
, struct lexstat
*sp
, int unused
)
1081 unsigned long h1
, h2
;
1085 dbhash(word
, &h1
, &h2
);
1086 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1091 fprintf(stderr
, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1
, h2
,
1092 n
? get(&n
[OF_node_good
]) : 0,
1093 n
? get(&n
[OF_node_bad
]) : 0,
1097 d
= p
>= MID
? p
- MID
: MID
- p
;
1098 if (d
>= best
[BEST
-1].dist
)
1099 for (i
= 0; i
< BEST
; i
++) {
1100 if (h1
== best
[i
].hash1
&& h2
== best
[i
].hash2
)
1103 * For equal distance, this selection prefers
1104 * words with a low probability, since a false
1105 * negative is better than a false positive,
1106 * and since experience has shown that false
1107 * positives are more likely otherwise. Then,
1108 * words from the end of the header and from
1109 * the start of the body are preferred. This
1110 * gives the most interesting verbose output.
1112 if (d
> best
[i
].dist
||
1113 d
== best
[i
].dist
&&
1115 best
[i
].loc
== HEADER
&&
1116 d
== best
[i
].dist
) {
1117 for (j
= BEST
-2; j
>= i
; j
--)
1118 best
[j
+1] = best
[j
];
1121 best
[i
].word
= savestr(word
);
1124 best
[i
].loc
= sp
->loc
;
1131 dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
)
1133 unsigned char digest
[16];
1137 MD5Update(&ctx
, (unsigned char *)word
, strlen(word
));
1138 if (table_version
>= 1)
1139 MD5Update(&ctx
, (unsigned char *)&super
[OF_super_mangle
], 4);
1140 MD5Final(digest
, &ctx
);
1142 if (table_version
< 1) {
1143 *h1
^= getn(&super
[OF_super_mangle
]);
1146 *h2
= get(&digest
[4]);
1150 * The selection of the value for mangling is not critical. It is practically
1151 * impossible for any person to determine the exact time when the database
1152 * was created first (without looking at the database, which would reveal the
1153 * value anyway), so we just use this. The MD5 hash here ensures that each
1154 * single second gives a completely different mangling value (which is not
1155 * necessary anymore if table_version>=1, but does not hurt).
1165 unsigned char digest
[16];
1168 memset(&u
, 0, sizeof u
);
1171 MD5Update(&ctx
, (unsigned char *)u
.c
, sizeof u
.c
);
1172 MD5Final(digest
, &ctx
);
1174 putn(&super
[OF_super_mangle
], s
);
1178 cprobability(void *v
)
1181 unsigned long used
, ngood
, nbad
;
1182 unsigned long h1
, h2
;
1187 if (*args
== NULL
) {
1188 fprintf(stderr
, "No words given.\n");
1191 if (getdb(O_RDONLY
) != OKAY
)
1193 used
= getn(&super
[OF_super_used
]);
1194 ngood
= getn(&super
[OF_super_ngood
]);
1195 nbad
= getn(&super
[OF_super_nbad
]);
1196 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1199 dbhash(*args
, &h1
, &h2
);
1200 printf("\"%s\", hash=%lu:%lu ", *args
, h1
, h2
);
1201 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1202 g
= get(&n
[OF_node_good
]);
1203 b
= get(&n
[OF_node_bad
]);
1204 printf("good=%u bad=%u ", g
, b
);
1207 d
= p
>= MID
? p
- MID
: MID
- p
;
1208 printf("prob=%.4g dist=%.4g", p
, d
);
1210 printf("too infrequent");
1212 printf("not in database");