2 * S-nail - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
5 * Copyright (c) 2012 Steffen "Daode" Nurpmeso.
9 * Gunnar Ritter. All rights reserved.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by Gunnar Ritter
22 * and his contributors.
23 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 #else /* !HAVE_MMAP */
54 #define mmap(a, b, c, d, e, f) MAP_FAILED
56 #endif /* !HAVE_MMAP */
58 #define mremap(a, b, c, d) MAP_FAILED
59 #endif /* !HAVE_MREMAP */
62 #define MAP_FAILED ((void *)-1)
63 #endif /* !MAP_FAILED */
69 * Mail -- a mail program
71 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
72 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
73 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
75 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
76 * Classification using Chained Tokens", February 2004,
77 * <http://www.nuclearelephant.com/papers/chained.html>.
84 #define MAX2 0x0000ffff
85 #define MAX3 0x00ffffffUL
86 #define MAX4 0xffffffffUL
89 * The dictionary consists of two files forming a hash table. The hash
90 * consists of the first 56 bits of the result of applying MD5 to the
91 * input word. This scheme ensures that collisions are unlikely enough
92 * to make junk detection work; according to the birthday paradox, a
93 * 50 % probability for one single collision is reached at 2^28 entries.
95 * To make the chain structure independent from input, the MD5 input is
96 * xor'ed with a random number. This makes it impossible that someone uses
97 * a carefully crafted message for a denial-of-service attack against the
100 #define SIZEOF_node 17
101 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
102 #define OF_node_next 4 /* bit-negated table index of next node */
103 #define OF_node_good 8 /* number of times this appeared in good msgs */
104 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
105 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
106 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
109 #define SIZEOF_super 262164
110 #define OF_super_size 0 /* allocated nodes in the chain file */
111 #define OF_super_used 4 /* used nodes in the chain file */
112 #define OF_super_ngood 8 /* number of good messages scanned so far */
113 #define OF_super_nbad 12 /* number of bad messages scanned so far */
114 #define OF_super_mangle 16 /* used to mangle the MD5 input */
115 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
116 #define SIZEOF_entry 4
119 static size_t super_mmapped
;
120 static size_t nodes_mmapped
;
122 static int chained_tokens
;
128 * 1 Fixed the mangling; it was ineffective in version 0.
129 * Hash extended to 56 bits.
131 static int table_version
;
132 #define current_table_version 1
135 ((unsigned)(((char *)(e))[0]&0377) + \
136 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
137 ((unsigned)(((char *)(e))[2]&0377) << 16))
140 (((char *)(e))[0] = (n) & 0x0000ff, \
141 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
142 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
144 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
146 #define s2f(s) ((float)(s) / MAX3)
149 ((unsigned long)(((char *)(p))[0]&0377) + \
150 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
151 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
152 ((unsigned long)(((char *)(p))[3]&0377) << 24))
155 (((char *)(p))[0] = (n) & 0x000000ffUL, \
156 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
157 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
158 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
179 char field
[LINESIZE
];
182 #define constituent(c, b, i, price, hadamp) \
183 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
184 (c) == '$' || (c) == '!' || (c) == '_' || \
185 (c) == '#' || (c) == '%' || (c) == '&' || \
186 ((c) == ';' && hadamp) || \
187 ((c) == '-' && !(price)) || \
188 (((c) == '.' || (c) == ',' || (c) == '/') && \
189 (i) > 0 && digitchar((b)[(i)-1]&0377)))
191 #define url_xchar(c) \
192 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
193 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
194 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
195 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
196 (c) != '#' && (c) != '"')
208 static const char README1
[] = "\
209 This is a junk mail database maintained by mailx(1). It does not contain any\n\
210 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
211 used for lookup. It is thus possible to tell if some given word was likely\n\
212 contained in your mail from examining this data, at best.\n";
213 static const char README2
[] = "\n\
214 The database files are stored in compress(1) format by default. This saves\n\
215 some space, but leads to higher processor usage when the database is read\n\
216 or updated. You can use uncompress(1) on these files if you prefer to store\n\
217 them in flat form.\n";
221 static FILE *sfp
, *nfp
;
222 static char *sname
, *nname
;
224 static enum okay
getdb(int rw
);
225 static void putdb(void);
226 static void relsedb(void);
227 static FILE *dbfp(enum db db
, int rw
, int *compressed
, char **fn
);
228 static char *lookup(unsigned long h1
, unsigned long h2
, int create
);
229 static unsigned long grow(unsigned long size
);
230 static char *nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
231 struct lexstat
*sp
, int *stop
);
232 static void join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
);
233 static void add(const char *word
, enum entry entry
, struct lexstat
*sp
,
235 static enum okay
scan(struct message
*m
, enum entry entry
,
236 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
238 static void recompute(void);
239 static float getprob(char *n
);
240 static int insert(int *msgvec
, enum entry entry
, int incr
);
241 static void clsf(struct message
*m
);
242 static void rate(const char *word
, enum entry entry
, struct lexstat
*sp
,
244 static void dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
);
245 static void mkmangle(void);
254 chained_tokens
= value("chained-junk-tokens") != NULL
;
255 if ((sfp
= dbfp(SUPER
, rw
, &compressed
, &sname
)) == (FILE *)-1)
257 if (sfp
&& !compressed
) {
258 super
= mmap(NULL
, SIZEOF_super
,
259 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
260 MAP_SHARED
, fileno(sfp
), 0);
261 if (super
!= MAP_FAILED
) {
262 super_mmapped
= SIZEOF_super
;
267 super
= smalloc(SIZEOF_super
);
271 if ((compressed
? zread(zp
, super
, SIZEOF_super
)
273 fread(super
, 1, SIZEOF_super
, sfp
)
276 fprintf(stderr
, "Error reading junk mail database.\n");
277 memset(super
, 0, SIZEOF_super
);
283 } else if (compressed
)
286 memset(super
, 0, SIZEOF_super
);
289 skip
: if ((n
= getn(&super
[OF_super_size
])) == 0) {
291 putn(&super
[OF_super_size
], 1);
293 if (sfp
&& (nfp
= dbfp(NODES
, rw
, &compressed
, &nname
)) != NULL
) {
294 if (nfp
== (FILE *)-1) {
300 if (sfp
&& nfp
&& !compressed
) {
301 nodes
= mmap(NULL
, n
* SIZEOF_node
,
302 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
303 MAP_SHARED
, fileno(nfp
), 0);
304 if (nodes
!= MAP_FAILED
) {
305 nodes_mmapped
= n
* SIZEOF_node
;
310 nodes
= smalloc(n
* SIZEOF_node
);
314 if ((compressed
? zread(zp
, nodes
, n
* SIZEOF_node
)
316 fread(nodes
, 1, n
* SIZEOF_node
, nfp
)
317 != (unsigned long)n
* SIZEOF_node
) ||
319 fprintf(stderr
, "Error reading junk mail database.\n");
320 memset(nodes
, 0, n
* SIZEOF_node
);
321 memset(super
, 0, SIZEOF_super
);
323 putn(&super
[OF_super_size
], n
);
330 memset(nodes
, 0, n
* SIZEOF_node
);
344 if ((! super_mmapped
&& (sfp
= dbfp(SUPER
, O_WRONLY
, &scomp
, &sname
))
345 == NULL
) || sfp
== (FILE *)-1)
347 if ((! nodes_mmapped
&& (nfp
= dbfp(NODES
, O_WRONLY
, &ncomp
, &nname
))
348 == NULL
) || nfp
== (FILE *)-1)
350 if (super_mmapped
== 0 || nodes_mmapped
== 0)
353 * Use utime() with mmap() since Linux does not update st_mtime
354 * reliably otherwise.
360 zwrite(zp
, super
, SIZEOF_super
);
364 fwrite(super
, 1, SIZEOF_super
, sfp
);
369 zwrite(zp
, nodes
, getn(&super
[OF_super_size
]) * SIZEOF_node
);
374 getn(&super
[OF_super_size
]) * SIZEOF_node
, nfp
);
375 if (super_mmapped
== 0 || nodes_mmapped
== 0)
383 munmap(super
, super_mmapped
);
388 munmap(nodes
, nodes_mmapped
);
392 if (sfp
&& sfp
!= (FILE *)-1) {
396 if (nfp
&& nfp
!= (FILE *)-1) {
403 dbfp(enum db db
, int rw
, int *compressed
, char **fn
)
409 { "super", "nodes" },
410 { "super1", "nodes1" }
414 { "super.Z", "nodes.Z" },
415 { "super1.Z", "nodes1.Z" }
420 if ((dir
= value("junkdb")) == NULL
) {
421 fprintf(stderr
, "No junk mail database specified. "
422 "Set the junkdb variable.\n");
425 dir
= file_expand(dir
);
426 if (makedir(dir
) == STOP
) {
427 fprintf(stderr
, "Cannot create directory \"%s\"\n.", dir
);
431 table_version
= current_table_version
;
432 loop
: sf
= sfx
[table_version
];
433 zf
= zfx
[table_version
];
434 *fn
= salloc((n
= strlen(dir
)) + 40);
438 strcpy(&(*fn
)[n
+1], sf
[db
]);
439 if ((fp
= Fopen(*fn
, rw
!=O_RDONLY
? "r+" : "r")) != NULL
)
442 strcpy(&(*fn
)[n
+1], zf
[db
]);
443 if ((fp
= Fopen(*fn
, rw
? "r+" : "r")) == NULL
&&
444 rw
==O_WRONLY
? (fp
= Fopen(*fn
, "w+")) == NULL
: 0) {
445 fprintf(stderr
, "Cannot open junk mail database \"%s\".\n",*fn
);
449 strcpy(&(*fn
)[n
+1], "README");
450 if (access(*fn
, F_OK
) < 0 && (rp
= Fopen(*fn
, "w")) != NULL
) {
455 } else if (fp
== NULL
) {
456 if (table_version
> 0) {
460 table_version
= current_table_version
;
463 flp
.l_type
= rw
!=O_RDONLY
? F_WRLCK
: F_RDLCK
;
466 flp
.l_whence
= SEEK_SET
;
467 fcntl(fileno(fp
), F_SETLKW
, &flp
);
473 lookup(unsigned long h1
, unsigned long h2
, int create
)
475 char *n
, *lastn
= NULL
;
476 unsigned long c
, lastc
= MAX4
, used
, size
;
478 used
= getn(&super
[OF_super_used
]);
479 size
= getn(&super
[OF_super_size
]);
480 c
= ~getn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
]);
481 n
= &nodes
[c
*SIZEOF_node
];
483 if (getn(&n
[OF_node_hash
]) == h1
&&
484 (table_version
< 1 ? 1 :
485 get(&n
[OF_node_hash2
]) == h2
))
489 c
= ~getn(&n
[OF_node_next
]);
490 n
= &nodes
[c
*SIZEOF_node
];
494 if ((size
= grow(size
)) == 0)
496 lastn
= &nodes
[lastc
*SIZEOF_node
];
498 putn(&super
[OF_super_used
], used
+1);
499 n
= &nodes
[used
*SIZEOF_node
];
500 putn(&n
[OF_node_hash
], h1
);
501 put(&n
[OF_node_hash2
], h2
);
503 putn(&lastn
[OF_node_next
], ~used
);
505 putn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
],
513 grow(unsigned long size
)
515 unsigned long incr
, newsize
;
518 incr
= size
> MAX2
? MAX2
: size
;
519 newsize
= size
+ incr
;
520 if (newsize
> MAX4
-MAX2
) {
521 oflo
: fprintf(stderr
, "Junk mail database overflow.\n");
525 if (lseek(fileno(nfp
), newsize
*SIZEOF_node
-1, SEEK_SET
)
526 == (off_t
)-1 || write(fileno(nfp
),"\0",1) != 1)
529 if ((nodes
= mremap(nodes
, nodes_mmapped
, newsize
*SIZEOF_node
,
530 MREMAP_MAYMOVE
)) == MAP_FAILED
) {
531 if ((nodes
= mmap(NULL
, newsize
*SIZEOF_node
,
533 PROT_READ
|PROT_WRITE
:
535 MAP_SHARED
, fileno(nfp
), 0))
540 munmap(onodes
, nodes_mmapped
);
542 nodes_mmapped
= newsize
*SIZEOF_node
;
544 nodes
= srealloc(nodes
, newsize
*SIZEOF_node
);
545 memset(&nodes
[size
*SIZEOF_node
], 0, incr
*SIZEOF_node
);
548 putn(&super
[OF_super_size
], size
);
553 if (i+j >= (long)*bufsize-4) \
554 *buf = srealloc(*buf, *bufsize += 32); \
556 i += (*buf)[j+i] != '\0'; \
560 nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
561 struct lexstat
*sp
, int *stop
)
570 for (cp
= sp
->save
; *cp
; cp
++) {
578 if (sp
->loc
== FROM_LINE
)
579 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
588 if (sp
->loc
== HEADER
&& sp
->field
[0]) {
589 field
: cp
= sp
->field
;
602 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
604 if (c
== '\0' && table_version
>= 1) {
610 if (c
== '\b' && table_version
>= 1) {
611 sp
->html
= HTML_TEXT
;
614 if (c
== '<' && sp
->html
== HTML_TEXT
) {
619 if (sp
->html
== HTML_TAG
) {
622 if (!asccasecmp(sp
->tag
, "a") ||
623 !asccasecmp(sp
->tag
, "img") ||
624 !asccasecmp(sp
->tag
, "font") ||
625 !asccasecmp(sp
->tag
, "span") ||
626 !asccasecmp(sp
->tag
, "meta") ||
627 !asccasecmp(sp
->tag
, "table") ||
628 !asccasecmp(sp
->tag
, "tr") ||
629 !asccasecmp(sp
->tag
, "td") ||
630 !asccasecmp(sp
->tag
, "p"))
631 sp
->html
= HTML_TEXT
;
633 sp
->html
= HTML_SKIP
;
634 } else if (c
== '>') {
635 sp
->html
= HTML_TEXT
;
638 if ((size_t)(sp
->tagp
- sp
->tag
) <
644 if (sp
->html
== HTML_SKIP
) {
646 sp
->html
= HTML_TEXT
;
649 if (c
== '$' && i
== 0)
651 if (sp
->loc
== HEADER
&& sp
->lastc
== '\n') {
654 while (k
< (int)sizeof sp
->field
- 3) {
657 (c
= getc(fp
)) == EOF
)
659 if (spacechar(c
) || c
== ':') {
666 sp
->field
[k
++] = '*';
671 } else if (c
== '\n') {
674 sp
->html
= HTML_NONE
;
681 cp
= sp
->save
= smalloc(i
+6);
682 for (cq
= "HOST*"; *cq
; cq
++)
684 for (cq
= &(*buf
)[j
]; *cq
!= ':'; cq
++);
685 cq
+= 3; /* skip "://" */
686 while (cq
< &(*buf
)[i
+j
] &&
687 (alnumchar(*cq
&0377) ||
688 *cq
== '.' || *cq
== '-'))
695 } else if (constituent(c
, *buf
, i
+j
, sp
->price
, sp
->hadamp
) ||
696 (sp
->loc
== HEADER
&& c
== '.' &&
697 asccasecmp(sp
->field
, "subject*"))) {
701 } else if (i
> 0 && c
== ':' && *count
> 2) {
702 if ((c
= getc(fp
)) != '/') {
707 if ((c
= getc(fp
)) != '/') {
716 for (cq
= "URL*"; *cq
; cq
++) {
722 if (alnumchar(*cp
&0377)) {
727 for (cq
= "://"; *cq
; cq
++) {
730 } else if (i
> 1 && ((*buf
)[i
+j
-1] == ',' ||
731 (*buf
)[i
+j
-1] == '.') && !digitchar(c
)) {
745 for (k
= 0; k
< i
; k
++)
746 if (digitchar((*buf
)[k
+j
]&0377))
748 else if (!alphachar((*buf
)[k
+j
]&0377) &&
749 (*buf
)[k
+j
] != '$') {
756 * Including the results of other filtering software (the
757 * 'X-Spam' fields) might seem tempting, but will also rate
758 * their false negatives good with this filter. Therefore
759 * these fields are ignored.
761 * Handling 'Received' fields is difficult since they include
762 * lots of both useless and interesting words for our purposes.
764 if (sp
->loc
== HEADER
&&
765 (asccasecmp(sp
->field
, "message-id*") == 0 ||
766 asccasecmp(sp
->field
, "references*") == 0 ||
767 asccasecmp(sp
->field
, "in-reply-to*") == 0 ||
768 asccasecmp(sp
->field
, "status*") == 0 ||
769 asccasecmp(sp
->field
, "x-status*") == 0 ||
770 asccasecmp(sp
->field
, "date*") == 0 ||
771 asccasecmp(sp
->field
, "delivery-date*") == 0 ||
772 ascncasecmp(sp
->field
, "x-spam", 6) == 0 ||
773 ascncasecmp(sp
->field
, "x-pstn", 6) == 0 ||
774 ascncasecmp(sp
->field
, "x-scanned", 9) == 0 ||
775 (asccasecmp(sp
->field
, "received*") == 0 &&
776 (((2*c
> i
) || i
< 4 ||
777 asccasestr(*buf
, "localhost")!=NULL
)))))
784 #define JOINCHECK if ((size_t)i >= *bufsize) \
785 *buf = srealloc(*buf, *bufsize += 32)
787 join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
)
805 add(const char *word
, enum entry entry
, struct lexstat
*sp
, int incr
)
808 unsigned long h1
, h2
;
812 dbhash(word
, &h1
, &h2
);
813 if ((n
= lookup(h1
, h2
, 1)) != NULL
) {
816 c
= get(&n
[OF_node_good
]);
817 if ((incr
> 0 && c
< (unsigned)MAX3
- incr
) ||
818 (incr
< 0 && c
>= (unsigned)-incr
)) {
820 put(&n
[OF_node_good
], c
);
824 c
= get(&n
[OF_node_bad
]);
825 if ((incr
> 0 && c
< (unsigned)MAX3
- incr
) ||
826 (incr
< 0 && c
>= (unsigned)-incr
)) {
828 put(&n
[OF_node_bad
], c
);
836 scan(struct message
*m
, enum entry entry
,
837 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
841 char *buf0
= NULL
, *buf1
= NULL
, *buf2
= NULL
, **bp
, *cp
;
842 size_t bufsize0
= 0, bufsize1
= 0, bufsize2
= 0, *zp
, count
;
846 if ((fp
= Ftemp(&cp
, "Ra", "w+", 0600, 1)) == NULL
) {
852 if (send(m
, fp
, NULL
, NULL
, SEND_TOFLTR
, NULL
) < 0) {
858 sp
= scalloc(1, sizeof *sp
);
862 while (nextword(bp
, zp
, &count
, fp
, sp
, &stop
) != NULL
) {
863 (*func
)(*bp
, entry
, sp
, arg
);
864 if (chained_tokens
&& buf0
&& *buf0
&& buf1
&& *buf1
&& !stop
) {
865 join(&buf2
, &bufsize2
, bp
== &buf1
? buf0
: buf1
, *bp
);
866 (*func
)(buf2
, entry
, sp
, arg
);
868 bp
= bp
== &buf1
? &buf0
: &buf1
;
869 zp
= zp
== &bufsize1
? &bufsize0
: &bufsize1
;
882 unsigned long used
, i
;
887 used
= getn(&super
[OF_super_used
]);
888 for (i
= 0; i
< used
; i
++) {
889 n
= &nodes
[i
*SIZEOF_node
];
892 put(&n
[OF_node_prob_O
], s
);
899 unsigned long ngood
, nbad
;
903 ngood
= getn(&super
[OF_super_ngood
]);
904 nbad
= getn(&super
[OF_super_nbad
]);
905 if (ngood
+ nbad
>= 18000) {
908 } else if (ngood
+ nbad
>= 9000) {
915 g
= get(&n
[OF_node_good
]) * 2;
916 b
= get(&n
[OF_node_bad
]);
918 p
= smin(1.0, nbad
? (float)b
/nbad
: 0.0) /
919 (smin(1.0, ngood
? (float)g
/ngood
: 0.0) +
920 smin(1.0, nbad
? (float)b
/nbad
: 0.0));
923 if (p
== TOP
&& b
<= 10 && g
== 0)
925 else if (p
== BOT
&& g
<= 10 && b
== 0)
927 } else if (g
== 0 && b
== 0)
935 insert(int *msgvec
, enum entry entry
, int incr
)
940 verbose
= value("verbose") != NULL
;
941 if (getdb(O_RDWR
) != OKAY
)
945 u
= getn(&super
[OF_super_ngood
]);
948 u
= getn(&super
[OF_super_nbad
]);
951 for (ip
= msgvec
; *ip
; ip
++) {
952 setdot(&message
[*ip
-1]);
953 if (incr
> 0 && u
== MAX4
-incr
+1) {
954 fprintf(stderr
, "Junk mail database overflow.\n");
956 } else if (incr
< 0 && (unsigned long)-incr
> u
) {
957 fprintf(stderr
, "Junk mail database underflow.\n");
961 if ((entry
== GOOD
&& incr
> 0) || (entry
== BAD
&& incr
< 0))
962 message
[*ip
-1].m_flag
&= ~MJUNK
;
964 message
[*ip
-1].m_flag
|= MJUNK
;
965 scan(&message
[*ip
-1], entry
, add
, incr
);
969 putn(&super
[OF_super_ngood
], u
);
972 putn(&super
[OF_super_nbad
], u
);
975 if (table_version
< 1)
985 return insert(v
, GOOD
, 1);
991 return insert(v
, BAD
, 1);
997 return insert(v
, GOOD
, -1);
1003 return insert(v
, BAD
, -1);
1009 int *msgvec
= v
, *ip
;
1011 verbose
= value("verbose") != NULL
;
1012 _debug
= debug
|| value("debug") != NULL
;
1013 if (getdb(O_RDONLY
) != OKAY
)
1015 for (ip
= msgvec
; *ip
; ip
++) {
1016 setdot(&message
[*ip
-1]);
1017 clsf(&message
[*ip
-1]);
1028 unsigned long hash1
;
1029 unsigned long hash2
;
1034 clsf(struct message
*m
)
1037 float a
= 1, b
= 1, r
;
1040 fprintf(stderr
, "Examining message %d\n",
1041 (int)(m
- &message
[0] + 1));
1042 for (i
= 0; i
< BEST
; i
++) {
1046 if (scan(m
, -1, rate
, 0) != OKAY
)
1048 if (best
[0].prob
== -1) {
1050 fprintf(stderr
, "No information found.\n");
1051 m
->m_flag
&= ~MJUNK
;
1054 for (i
= 0; i
< BEST
; i
++) {
1055 if (best
[i
].prob
== -1)
1058 fprintf(stderr
, "Probe %2d: \"%s\", hash=%lu:%lu "
1059 "prob=%.4g dist=%.4g\n",
1060 i
+1, prstr(best
[i
].word
),
1061 best
[i
].hash1
, best
[i
].hash2
,
1062 best
[i
].prob
, best
[i
].dist
);
1064 b
*= 1 - best
[i
].prob
;
1066 r
= a
+b
> 0 ? a
/ (a
+b
) : 0;
1068 fprintf(stderr
, "Junk probability of message %d: %g\n",
1069 (int)(m
- &message
[0] + 1), r
);
1073 m
->m_flag
&= ~MJUNK
;
1078 rate(const char *word
, enum entry entry
, struct lexstat
*sp
, int unused
)
1081 unsigned long h1
, h2
;
1087 dbhash(word
, &h1
, &h2
);
1088 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1093 fprintf(stderr
, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1
, h2
,
1094 n
? get(&n
[OF_node_good
]) : 0,
1095 n
? get(&n
[OF_node_bad
]) : 0,
1099 d
= p
>= MID
? p
- MID
: MID
- p
;
1100 if (d
>= best
[BEST
-1].dist
)
1101 for (i
= 0; i
< BEST
; i
++) {
1102 if (h1
== best
[i
].hash1
&& h2
== best
[i
].hash2
)
1105 * For equal distance, this selection prefers
1106 * words with a low probability, since a false
1107 * negative is better than a false positive,
1108 * and since experience has shown that false
1109 * positives are more likely otherwise. Then,
1110 * words from the end of the header and from
1111 * the start of the body are preferred. This
1112 * gives the most interesting verbose output.
1114 if (d
> best
[i
].dist
||
1115 d
== (best
[i
].dist
&&
1116 p
< best
[i
].prob
) ||
1117 (best
[i
].loc
== HEADER
&&
1118 d
== best
[i
].dist
)) {
1119 for (j
= BEST
-2; j
>= i
; j
--)
1120 best
[j
+1] = best
[j
];
1123 best
[i
].word
= savestr(word
);
1126 best
[i
].loc
= sp
->loc
;
1133 dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
)
1135 unsigned char digest
[16];
1139 MD5Update(&ctx
, (unsigned char *)word
, strlen(word
));
1140 if (table_version
>= 1)
1141 MD5Update(&ctx
, (unsigned char *)&super
[OF_super_mangle
], 4);
1142 MD5Final(digest
, &ctx
);
1144 if (table_version
< 1) {
1145 *h1
^= getn(&super
[OF_super_mangle
]);
1148 *h2
= get(&digest
[4]);
1152 * The selection of the value for mangling is not critical. It is practically
1153 * impossible for any person to determine the exact time when the database
1154 * was created first (without looking at the database, which would reveal the
1155 * value anyway), so we just use this. The MD5 hash here ensures that each
1156 * single second gives a completely different mangling value (which is not
1157 * necessary anymore if table_version>=1, but does not hurt).
1167 unsigned char digest
[16];
1170 memset(&u
, 0, sizeof u
);
1173 MD5Update(&ctx
, (unsigned char *)u
.c
, sizeof u
.c
);
1174 MD5Final(digest
, &ctx
);
1176 putn(&super
[OF_super_mangle
], s
);
1180 cprobability(void *v
)
1183 unsigned long used
, ngood
, nbad
;
1184 unsigned long h1
, h2
;
1189 if (*args
== NULL
) {
1190 fprintf(stderr
, "No words given.\n");
1193 if (getdb(O_RDONLY
) != OKAY
)
1195 used
= getn(&super
[OF_super_used
]);
1196 ngood
= getn(&super
[OF_super_ngood
]);
1197 nbad
= getn(&super
[OF_super_nbad
]);
1198 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1201 dbhash(*args
, &h1
, &h2
);
1202 printf("\"%s\", hash=%lu:%lu ", *args
, h1
, h2
);
1203 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1204 g
= get(&n
[OF_node_good
]);
1205 b
= get(&n
[OF_node_bad
]);
1206 printf("good=%u bad=%u ", g
, b
);
1209 d
= p
>= MID
? p
- MID
: MID
- p
;
1210 printf("prob=%.4g dist=%.4g", p
, d
);
1212 printf("too infrequent");
1214 printf("not in database");
1221 #else /* !USE_JUNK */
1226 fputs(catgets(catd
, CATSET
, 270, "No JUNK support compiled in.\n"),
1267 cprobability(void *v
)
1273 #endif /* USE_JUNK */