2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 * Copyright (c) 2012 Steffen "Daode" Nurpmeso.
9 * Gunnar Ritter. All rights reserved.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by Gunnar Ritter
22 * and his contributors.
23 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42 static char sccsid
[] = "@(#)junk.c 1.75 (gritter) 9/14/08";
59 #else /* !HAVE_MMAP */
60 #define mmap(a, b, c, d, e, f) MAP_FAILED
62 #endif /* !HAVE_MMAP */
64 #define mremap(a, b, c, d) MAP_FAILED
65 #endif /* !HAVE_MREMAP */
68 #define MAP_FAILED ((void *)-1)
69 #endif /* !MAP_FAILED */
75 * Mail -- a mail program
77 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
78 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
79 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
81 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
82 * Classification using Chained Tokens", February 2004,
83 * <http://www.nuclearelephant.com/papers/chained.html>.
90 #define MAX2 0x0000ffff
91 #define MAX3 0x00ffffffUL
92 #define MAX4 0xffffffffUL
95 * The dictionary consists of two files forming a hash table. The hash
96 * consists of the first 56 bits of the result of applying MD5 to the
97 * input word. This scheme ensures that collisions are unlikely enough
98 * to make junk detection work; according to the birthday paradox, a
99 * 50 % probability for one single collision is reached at 2^28 entries.
101 * To make the chain structure independent from input, the MD5 input is
102 * xor'ed with a random number. This makes it impossible that someone uses
103 * a carefully crafted message for a denial-of-service attack against the
106 #define SIZEOF_node 17
107 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
108 #define OF_node_next 4 /* bit-negated table index of next node */
109 #define OF_node_good 8 /* number of times this appeared in good msgs */
110 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
111 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
112 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
115 #define SIZEOF_super 262164
116 #define OF_super_size 0 /* allocated nodes in the chain file */
117 #define OF_super_used 4 /* used nodes in the chain file */
118 #define OF_super_ngood 8 /* number of good messages scanned so far */
119 #define OF_super_nbad 12 /* number of bad messages scanned so far */
120 #define OF_super_mangle 16 /* used to mangle the MD5 input */
121 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
122 #define SIZEOF_entry 4
125 static size_t super_mmapped
;
126 static size_t nodes_mmapped
;
128 static int chained_tokens
;
134 * 1 Fixed the mangling; it was ineffective in version 0.
135 * Hash extended to 56 bits.
137 static int table_version
;
138 #define current_table_version 1
141 ((unsigned)(((char *)(e))[0]&0377) + \
142 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
143 ((unsigned)(((char *)(e))[2]&0377) << 16))
146 (((char *)(e))[0] = (n) & 0x0000ff, \
147 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
148 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
150 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
152 #define s2f(s) ((float)(s) / MAX3)
155 ((unsigned long)(((char *)(p))[0]&0377) + \
156 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
157 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
158 ((unsigned long)(((char *)(p))[3]&0377) << 24))
161 (((char *)(p))[0] = (n) & 0x000000ffUL, \
162 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
163 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
164 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
185 char field
[LINESIZE
];
188 #define constituent(c, b, i, price, hadamp) \
189 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
190 (c) == '$' || (c) == '!' || (c) == '_' || \
191 (c) == '#' || (c) == '%' || (c) == '&' || \
192 ((c) == ';' && hadamp) || \
193 ((c) == '-' && !(price)) || \
194 (((c) == '.' || (c) == ',' || (c) == '/') && \
195 (i) > 0 && digitchar((b)[(i)-1]&0377)))
197 #define url_xchar(c) \
198 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
199 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
200 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
201 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
202 (c) != '#' && (c) != '"')
214 static const char README1
[] = "\
215 This is a junk mail database maintained by mailx(1). It does not contain any\n\
216 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
217 used for lookup. It is thus possible to tell if some given word was likely\n\
218 contained in your mail from examining this data, at best.\n";
219 static const char README2
[] = "\n\
220 The database files are stored in compress(1) format by default. This saves\n\
221 some space, but leads to higher processor usage when the database is read\n\
222 or updated. You can use uncompress(1) on these files if you prefer to store\n\
223 them in flat form.\n";
227 static FILE *sfp
, *nfp
;
228 static char *sname
, *nname
;
230 static enum okay
getdb(int rw
);
231 static void putdb(void);
232 static void relsedb(void);
233 static FILE *dbfp(enum db db
, int rw
, int *compressed
, char **fn
);
234 static char *lookup(unsigned long h1
, unsigned long h2
, int create
);
235 static unsigned long grow(unsigned long size
);
236 static char *nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
237 struct lexstat
*sp
, int *stop
);
238 static void join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
);
239 static void add(const char *word
, enum entry entry
, struct lexstat
*sp
,
241 static enum okay
scan(struct message
*m
, enum entry entry
,
242 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
244 static void recompute(void);
245 static float getprob(char *n
);
246 static int insert(int *msgvec
, enum entry entry
, int incr
);
247 static void clsf(struct message
*m
);
248 static void rate(const char *word
, enum entry entry
, struct lexstat
*sp
,
250 static void dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
);
251 static void mkmangle(void);
260 chained_tokens
= value("chained-junk-tokens") != NULL
;
261 if ((sfp
= dbfp(SUPER
, rw
, &compressed
, &sname
)) == (FILE *)-1)
263 if (sfp
&& !compressed
) {
264 super
= mmap(NULL
, SIZEOF_super
,
265 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
266 MAP_SHARED
, fileno(sfp
), 0);
267 if (super
!= MAP_FAILED
) {
268 super_mmapped
= SIZEOF_super
;
273 super
= smalloc(SIZEOF_super
);
277 if ((compressed
? zread(zp
, super
, SIZEOF_super
)
279 fread(super
, 1, SIZEOF_super
, sfp
)
282 fprintf(stderr
, "Error reading junk mail database.\n");
283 memset(super
, 0, SIZEOF_super
);
289 } else if (compressed
)
292 memset(super
, 0, SIZEOF_super
);
295 skip
: if ((n
= getn(&super
[OF_super_size
])) == 0) {
297 putn(&super
[OF_super_size
], 1);
299 if (sfp
&& (nfp
= dbfp(NODES
, rw
, &compressed
, &nname
)) != NULL
) {
300 if (nfp
== (FILE *)-1) {
306 if (sfp
&& nfp
&& !compressed
) {
307 nodes
= mmap(NULL
, n
* SIZEOF_node
,
308 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
309 MAP_SHARED
, fileno(nfp
), 0);
310 if (nodes
!= MAP_FAILED
) {
311 nodes_mmapped
= n
* SIZEOF_node
;
316 nodes
= smalloc(n
* SIZEOF_node
);
320 if ((compressed
? zread(zp
, nodes
, n
* SIZEOF_node
)
322 fread(nodes
, 1, n
* SIZEOF_node
, nfp
)
323 != (unsigned long)n
* SIZEOF_node
) ||
325 fprintf(stderr
, "Error reading junk mail database.\n");
326 memset(nodes
, 0, n
* SIZEOF_node
);
327 memset(super
, 0, SIZEOF_super
);
329 putn(&super
[OF_super_size
], n
);
336 memset(nodes
, 0, n
* SIZEOF_node
);
350 if ((! super_mmapped
&& (sfp
= dbfp(SUPER
, O_WRONLY
, &scomp
, &sname
))
351 == NULL
) || sfp
== (FILE *)-1)
353 if ((! nodes_mmapped
&& (nfp
= dbfp(NODES
, O_WRONLY
, &ncomp
, &nname
))
354 == NULL
) || nfp
== (FILE *)-1)
356 if (super_mmapped
== 0 || nodes_mmapped
== 0)
359 * Use utime() with mmap() since Linux does not update st_mtime
360 * reliably otherwise.
366 zwrite(zp
, super
, SIZEOF_super
);
370 fwrite(super
, 1, SIZEOF_super
, sfp
);
375 zwrite(zp
, nodes
, getn(&super
[OF_super_size
]) * SIZEOF_node
);
380 getn(&super
[OF_super_size
]) * SIZEOF_node
, nfp
);
381 if (super_mmapped
== 0 || nodes_mmapped
== 0)
389 munmap(super
, super_mmapped
);
394 munmap(nodes
, nodes_mmapped
);
398 if (sfp
&& sfp
!= (FILE *)-1) {
402 if (nfp
&& nfp
!= (FILE *)-1) {
409 dbfp(enum db db
, int rw
, int *compressed
, char **fn
)
415 { "super", "nodes" },
416 { "super1", "nodes1" }
420 { "super.Z", "nodes.Z" },
421 { "super1.Z", "nodes1.Z" }
426 if ((dir
= value("junkdb")) == NULL
) {
427 fprintf(stderr
, "No junk mail database specified. "
428 "Set the junkdb variable.\n");
432 if (makedir(dir
) == STOP
) {
433 fprintf(stderr
, "Cannot create directory \"%s\"\n.", dir
);
437 table_version
= current_table_version
;
438 loop
: sf
= sfx
[table_version
];
439 zf
= zfx
[table_version
];
440 *fn
= salloc((n
= strlen(dir
)) + 40);
444 strcpy(&(*fn
)[n
+1], sf
[db
]);
445 if ((fp
= Fopen(*fn
, rw
!=O_RDONLY
? "r+" : "r")) != NULL
)
448 strcpy(&(*fn
)[n
+1], zf
[db
]);
449 if ((fp
= Fopen(*fn
, rw
? "r+" : "r")) == NULL
&&
450 rw
==O_WRONLY
? (fp
= Fopen(*fn
, "w+")) == NULL
: 0) {
451 fprintf(stderr
, "Cannot open junk mail database \"%s\".\n",*fn
);
455 strcpy(&(*fn
)[n
+1], "README");
456 if (access(*fn
, F_OK
) < 0 && (rp
= Fopen(*fn
, "w")) != NULL
) {
461 } else if (fp
== NULL
) {
462 if (table_version
> 0) {
466 table_version
= current_table_version
;
469 flp
.l_type
= rw
!=O_RDONLY
? F_WRLCK
: F_RDLCK
;
472 flp
.l_whence
= SEEK_SET
;
473 fcntl(fileno(fp
), F_SETLKW
, &flp
);
479 lookup(unsigned long h1
, unsigned long h2
, int create
)
481 char *n
, *lastn
= NULL
;
482 unsigned long c
, lastc
= MAX4
, used
, size
;
484 used
= getn(&super
[OF_super_used
]);
485 size
= getn(&super
[OF_super_size
]);
486 c
= ~getn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
]);
487 n
= &nodes
[c
*SIZEOF_node
];
489 if (getn(&n
[OF_node_hash
]) == h1
&&
490 (table_version
< 1 ? 1 :
491 get(&n
[OF_node_hash2
]) == h2
))
495 c
= ~getn(&n
[OF_node_next
]);
496 n
= &nodes
[c
*SIZEOF_node
];
500 if ((size
= grow(size
)) == 0)
502 lastn
= &nodes
[lastc
*SIZEOF_node
];
504 putn(&super
[OF_super_used
], used
+1);
505 n
= &nodes
[used
*SIZEOF_node
];
506 putn(&n
[OF_node_hash
], h1
);
507 put(&n
[OF_node_hash2
], h2
);
509 putn(&lastn
[OF_node_next
], ~used
);
511 putn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
],
519 grow(unsigned long size
)
521 unsigned long incr
, newsize
;
524 incr
= size
> MAX2
? MAX2
: size
;
525 newsize
= size
+ incr
;
526 if (newsize
> MAX4
-MAX2
) {
527 oflo
: fprintf(stderr
, "Junk mail database overflow.\n");
531 if (lseek(fileno(nfp
), newsize
*SIZEOF_node
-1, SEEK_SET
)
532 == (off_t
)-1 || write(fileno(nfp
),"\0",1) != 1)
535 if ((nodes
= mremap(nodes
, nodes_mmapped
, newsize
*SIZEOF_node
,
536 MREMAP_MAYMOVE
)) == MAP_FAILED
) {
537 if ((nodes
= mmap(NULL
, newsize
*SIZEOF_node
,
539 PROT_READ
|PROT_WRITE
:
541 MAP_SHARED
, fileno(nfp
), 0))
546 munmap(onodes
, nodes_mmapped
);
548 nodes_mmapped
= newsize
*SIZEOF_node
;
550 nodes
= srealloc(nodes
, newsize
*SIZEOF_node
);
551 memset(&nodes
[size
*SIZEOF_node
], 0, incr
*SIZEOF_node
);
554 putn(&super
[OF_super_size
], size
);
559 if (i+j >= (long)*bufsize-4) \
560 *buf = srealloc(*buf, *bufsize += 32); \
562 i += (*buf)[j+i] != '\0'; \
566 nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
567 struct lexstat
*sp
, int *stop
)
576 for (cp
= sp
->save
; *cp
; cp
++) {
584 if (sp
->loc
== FROM_LINE
)
585 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
594 if (sp
->loc
== HEADER
&& sp
->field
[0]) {
595 field
: cp
= sp
->field
;
608 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
610 if (c
== '\0' && table_version
>= 1) {
616 if (c
== '\b' && table_version
>= 1) {
617 sp
->html
= HTML_TEXT
;
620 if (c
== '<' && sp
->html
== HTML_TEXT
) {
625 if (sp
->html
== HTML_TAG
) {
628 if (!asccasecmp(sp
->tag
, "a") ||
629 !asccasecmp(sp
->tag
, "img") ||
630 !asccasecmp(sp
->tag
, "font") ||
631 !asccasecmp(sp
->tag
, "span") ||
632 !asccasecmp(sp
->tag
, "meta") ||
633 !asccasecmp(sp
->tag
, "table") ||
634 !asccasecmp(sp
->tag
, "tr") ||
635 !asccasecmp(sp
->tag
, "td") ||
636 !asccasecmp(sp
->tag
, "p"))
637 sp
->html
= HTML_TEXT
;
639 sp
->html
= HTML_SKIP
;
640 } else if (c
== '>') {
641 sp
->html
= HTML_TEXT
;
644 if ((size_t)(sp
->tagp
- sp
->tag
) <
650 if (sp
->html
== HTML_SKIP
) {
652 sp
->html
= HTML_TEXT
;
655 if (c
== '$' && i
== 0)
657 if (sp
->loc
== HEADER
&& sp
->lastc
== '\n') {
660 while (k
< (int)sizeof sp
->field
- 3) {
663 (c
= getc(fp
)) == EOF
)
665 if (spacechar(c
) || c
== ':') {
672 sp
->field
[k
++] = '*';
677 } else if (c
== '\n') {
680 sp
->html
= HTML_NONE
;
687 cp
= sp
->save
= smalloc(i
+6);
688 for (cq
= "HOST*"; *cq
; cq
++)
690 for (cq
= &(*buf
)[j
]; *cq
!= ':'; cq
++);
691 cq
+= 3; /* skip "://" */
692 while (cq
< &(*buf
)[i
+j
] &&
693 (alnumchar(*cq
&0377) ||
694 *cq
== '.' || *cq
== '-'))
701 } else if (constituent(c
, *buf
, i
+j
, sp
->price
, sp
->hadamp
) ||
702 (sp
->loc
== HEADER
&& c
== '.' &&
703 asccasecmp(sp
->field
, "subject*"))) {
707 } else if (i
> 0 && c
== ':' && *count
> 2) {
708 if ((c
= getc(fp
)) != '/') {
713 if ((c
= getc(fp
)) != '/') {
722 for (cq
= "URL*"; *cq
; cq
++) {
728 if (alnumchar(*cp
&0377)) {
733 for (cq
= "://"; *cq
; cq
++) {
736 } else if (i
> 1 && ((*buf
)[i
+j
-1] == ',' ||
737 (*buf
)[i
+j
-1] == '.') && !digitchar(c
)) {
751 for (k
= 0; k
< i
; k
++)
752 if (digitchar((*buf
)[k
+j
]&0377))
754 else if (!alphachar((*buf
)[k
+j
]&0377) &&
755 (*buf
)[k
+j
] != '$') {
762 * Including the results of other filtering software (the
763 * 'X-Spam' fields) might seem tempting, but will also rate
764 * their false negatives good with this filter. Therefore
765 * these fields are ignored.
767 * Handling 'Received' fields is difficult since they include
768 * lots of both useless and interesting words for our purposes.
770 if (sp
->loc
== HEADER
&&
771 (asccasecmp(sp
->field
, "message-id*") == 0 ||
772 asccasecmp(sp
->field
, "references*") == 0 ||
773 asccasecmp(sp
->field
, "in-reply-to*") == 0 ||
774 asccasecmp(sp
->field
, "status*") == 0 ||
775 asccasecmp(sp
->field
, "x-status*") == 0 ||
776 asccasecmp(sp
->field
, "date*") == 0 ||
777 asccasecmp(sp
->field
, "delivery-date*") == 0 ||
778 ascncasecmp(sp
->field
, "x-spam", 6) == 0 ||
779 ascncasecmp(sp
->field
, "x-pstn", 6) == 0 ||
780 ascncasecmp(sp
->field
, "x-scanned", 9) == 0 ||
781 (asccasecmp(sp
->field
, "received*") == 0 &&
782 (((2*c
> i
) || i
< 4 ||
783 asccasestr(*buf
, "localhost")!=NULL
)))))
790 #define JOINCHECK if ((size_t)i >= *bufsize) \
791 *buf = srealloc(*buf, *bufsize += 32)
793 join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
)
811 add(const char *word
, enum entry entry
, struct lexstat
*sp
, int incr
)
814 unsigned long h1
, h2
;
818 dbhash(word
, &h1
, &h2
);
819 if ((n
= lookup(h1
, h2
, 1)) != NULL
) {
822 c
= get(&n
[OF_node_good
]);
823 if ((incr
> 0 && c
< (unsigned)MAX3
- incr
) ||
824 (incr
< 0 && c
>= (unsigned)-incr
)) {
826 put(&n
[OF_node_good
], c
);
830 c
= get(&n
[OF_node_bad
]);
831 if ((incr
> 0 && c
< (unsigned)MAX3
- incr
) ||
832 (incr
< 0 && c
>= (unsigned)-incr
)) {
834 put(&n
[OF_node_bad
], c
);
842 scan(struct message
*m
, enum entry entry
,
843 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
847 char *buf0
= NULL
, *buf1
= NULL
, *buf2
= NULL
, **bp
, *cp
;
848 size_t bufsize0
= 0, bufsize1
= 0, bufsize2
= 0, *zp
, count
;
852 if ((fp
= Ftemp(&cp
, "Ra", "w+", 0600, 1)) == NULL
) {
858 if (send(m
, fp
, NULL
, NULL
, SEND_TOFLTR
, NULL
) < 0) {
864 sp
= scalloc(1, sizeof *sp
);
868 while (nextword(bp
, zp
, &count
, fp
, sp
, &stop
) != NULL
) {
869 (*func
)(*bp
, entry
, sp
, arg
);
870 if (chained_tokens
&& buf0
&& *buf0
&& buf1
&& *buf1
&& !stop
) {
871 join(&buf2
, &bufsize2
, bp
== &buf1
? buf0
: buf1
, *bp
);
872 (*func
)(buf2
, entry
, sp
, arg
);
874 bp
= bp
== &buf1
? &buf0
: &buf1
;
875 zp
= zp
== &bufsize1
? &bufsize0
: &bufsize1
;
888 unsigned long used
, i
;
893 used
= getn(&super
[OF_super_used
]);
894 for (i
= 0; i
< used
; i
++) {
895 n
= &nodes
[i
*SIZEOF_node
];
898 put(&n
[OF_node_prob_O
], s
);
905 unsigned long ngood
, nbad
;
909 ngood
= getn(&super
[OF_super_ngood
]);
910 nbad
= getn(&super
[OF_super_nbad
]);
911 if (ngood
+ nbad
>= 18000) {
914 } else if (ngood
+ nbad
>= 9000) {
921 g
= get(&n
[OF_node_good
]) * 2;
922 b
= get(&n
[OF_node_bad
]);
924 p
= smin(1.0, nbad
? (float)b
/nbad
: 0.0) /
925 (smin(1.0, ngood
? (float)g
/ngood
: 0.0) +
926 smin(1.0, nbad
? (float)b
/nbad
: 0.0));
929 if (p
== TOP
&& b
<= 10 && g
== 0)
931 else if (p
== BOT
&& g
<= 10 && b
== 0)
933 } else if (g
== 0 && b
== 0)
941 insert(int *msgvec
, enum entry entry
, int incr
)
946 verbose
= value("verbose") != NULL
;
947 if (getdb(O_RDWR
) != OKAY
)
951 u
= getn(&super
[OF_super_ngood
]);
954 u
= getn(&super
[OF_super_nbad
]);
957 for (ip
= msgvec
; *ip
; ip
++) {
958 setdot(&message
[*ip
-1]);
959 if (incr
> 0 && u
== MAX4
-incr
+1) {
960 fprintf(stderr
, "Junk mail database overflow.\n");
962 } else if (incr
< 0 && (unsigned long)-incr
> u
) {
963 fprintf(stderr
, "Junk mail database underflow.\n");
967 if ((entry
== GOOD
&& incr
> 0) || (entry
== BAD
&& incr
< 0))
968 message
[*ip
-1].m_flag
&= ~MJUNK
;
970 message
[*ip
-1].m_flag
|= MJUNK
;
971 scan(&message
[*ip
-1], entry
, add
, incr
);
975 putn(&super
[OF_super_ngood
], u
);
978 putn(&super
[OF_super_nbad
], u
);
981 if (table_version
< 1)
991 return insert(v
, GOOD
, 1);
997 return insert(v
, BAD
, 1);
1003 return insert(v
, GOOD
, -1);
1009 return insert(v
, BAD
, -1);
1015 int *msgvec
= v
, *ip
;
1017 verbose
= value("verbose") != NULL
;
1018 _debug
= debug
|| value("debug") != NULL
;
1019 if (getdb(O_RDONLY
) != OKAY
)
1021 for (ip
= msgvec
; *ip
; ip
++) {
1022 setdot(&message
[*ip
-1]);
1023 clsf(&message
[*ip
-1]);
1034 unsigned long hash1
;
1035 unsigned long hash2
;
1040 clsf(struct message
*m
)
1043 float a
= 1, b
= 1, r
;
1046 fprintf(stderr
, "Examining message %d\n",
1047 (int)(m
- &message
[0] + 1));
1048 for (i
= 0; i
< BEST
; i
++) {
1052 if (scan(m
, -1, rate
, 0) != OKAY
)
1054 if (best
[0].prob
== -1) {
1056 fprintf(stderr
, "No information found.\n");
1057 m
->m_flag
&= ~MJUNK
;
1060 for (i
= 0; i
< BEST
; i
++) {
1061 if (best
[i
].prob
== -1)
1064 fprintf(stderr
, "Probe %2d: \"%s\", hash=%lu:%lu "
1065 "prob=%.4g dist=%.4g\n",
1066 i
+1, prstr(best
[i
].word
),
1067 best
[i
].hash1
, best
[i
].hash2
,
1068 best
[i
].prob
, best
[i
].dist
);
1070 b
*= 1 - best
[i
].prob
;
1072 r
= a
+b
> 0 ? a
/ (a
+b
) : 0;
1074 fprintf(stderr
, "Junk probability of message %d: %g\n",
1075 (int)(m
- &message
[0] + 1), r
);
1079 m
->m_flag
&= ~MJUNK
;
1084 rate(const char *word
, enum entry entry
, struct lexstat
*sp
, int unused
)
1087 unsigned long h1
, h2
;
1093 dbhash(word
, &h1
, &h2
);
1094 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1099 fprintf(stderr
, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1
, h2
,
1100 n
? get(&n
[OF_node_good
]) : 0,
1101 n
? get(&n
[OF_node_bad
]) : 0,
1105 d
= p
>= MID
? p
- MID
: MID
- p
;
1106 if (d
>= best
[BEST
-1].dist
)
1107 for (i
= 0; i
< BEST
; i
++) {
1108 if (h1
== best
[i
].hash1
&& h2
== best
[i
].hash2
)
1111 * For equal distance, this selection prefers
1112 * words with a low probability, since a false
1113 * negative is better than a false positive,
1114 * and since experience has shown that false
1115 * positives are more likely otherwise. Then,
1116 * words from the end of the header and from
1117 * the start of the body are preferred. This
1118 * gives the most interesting verbose output.
1120 if (d
> best
[i
].dist
||
1121 d
== (best
[i
].dist
&&
1122 p
< best
[i
].prob
) ||
1123 (best
[i
].loc
== HEADER
&&
1124 d
== best
[i
].dist
)) {
1125 for (j
= BEST
-2; j
>= i
; j
--)
1126 best
[j
+1] = best
[j
];
1129 best
[i
].word
= savestr(word
);
1132 best
[i
].loc
= sp
->loc
;
1139 dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
)
1141 unsigned char digest
[16];
1145 MD5Update(&ctx
, (unsigned char *)word
, strlen(word
));
1146 if (table_version
>= 1)
1147 MD5Update(&ctx
, (unsigned char *)&super
[OF_super_mangle
], 4);
1148 MD5Final(digest
, &ctx
);
1150 if (table_version
< 1) {
1151 *h1
^= getn(&super
[OF_super_mangle
]);
1154 *h2
= get(&digest
[4]);
1158 * The selection of the value for mangling is not critical. It is practically
1159 * impossible for any person to determine the exact time when the database
1160 * was created first (without looking at the database, which would reveal the
1161 * value anyway), so we just use this. The MD5 hash here ensures that each
1162 * single second gives a completely different mangling value (which is not
1163 * necessary anymore if table_version>=1, but does not hurt).
1173 unsigned char digest
[16];
1176 memset(&u
, 0, sizeof u
);
1179 MD5Update(&ctx
, (unsigned char *)u
.c
, sizeof u
.c
);
1180 MD5Final(digest
, &ctx
);
1182 putn(&super
[OF_super_mangle
], s
);
1186 cprobability(void *v
)
1189 unsigned long used
, ngood
, nbad
;
1190 unsigned long h1
, h2
;
1195 if (*args
== NULL
) {
1196 fprintf(stderr
, "No words given.\n");
1199 if (getdb(O_RDONLY
) != OKAY
)
1201 used
= getn(&super
[OF_super_used
]);
1202 ngood
= getn(&super
[OF_super_ngood
]);
1203 nbad
= getn(&super
[OF_super_nbad
]);
1204 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1207 dbhash(*args
, &h1
, &h2
);
1208 printf("\"%s\", hash=%lu:%lu ", *args
, h1
, h2
);
1209 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1210 g
= get(&n
[OF_node_good
]);
1211 b
= get(&n
[OF_node_bad
]);
1212 printf("good=%u bad=%u ", g
, b
);
1215 d
= p
>= MID
? p
- MID
: MID
- p
;
1216 printf("prob=%.4g dist=%.4g", p
, d
);
1218 printf("too infrequent");
1220 printf("not in database");
1227 #else /* !USE_JUNK */
1232 fputs(catgets(catd
, CATSET
, 270, "No JUNK support compiled in.\n"),
1273 cprobability(void *v
)
1279 #endif /* USE_JUNK */