2 * Heirloom mailx - a mail user agent derived from Berkeley Mail.
4 * Copyright (c) 2000-2004 Gunnar Ritter, Freiburg i. Br., Germany.
8 * Gunnar Ritter. All rights reserved.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by Gunnar Ritter
21 * and his contributors.
22 * 4. Neither the name of Gunnar Ritter nor the names of his contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY GUNNAR RITTER AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL GUNNAR RITTER OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 static char sccsid
[] = "@(#)junk.c 1.75 (gritter) 9/14/08";
58 #else /* !HAVE_MMAP */
59 #define mmap(a, b, c, d, e, f) MAP_FAILED
61 #endif /* !HAVE_MMAP */
63 #define mremap(a, b, c, d) MAP_FAILED
64 #endif /* !HAVE_MREMAP */
67 #define MAP_FAILED ((void *)-1)
68 #endif /* !MAP_FAILED */
74 * Mail -- a mail program
76 * Junk classification, mostly according to Paul Graham's "A Plan for Spam",
77 * August 2002, <http://www.paulgraham.com/spam.html>, and his "Better
78 * Bayesian Filtering", January 2003, <http://www.paulgraham.com/better.html>.
80 * Chained tokens according to Jonathan A. Zdziarski's "Advanced Language
81 * Classification using Chained Tokens", February 2004,
82 * <http://www.nuclearelephant.com/papers/chained.html>.
/*
 * Largest values that fit into two, three and four bytes.  Within this
 * file MAX2 masks a hash down to a bucket index and caps the growth
 * increment of the node table, MAX3 is the upper bound checked before
 * changing the three-byte good/bad counters, and MAX4 is the bound used
 * for the four-byte message counters and for node table sizes.
 */
89 #define MAX2 0x0000ffff
90 #define MAX3 0x00ffffffUL
91 #define MAX4 0xffffffffUL
94 * The dictionary consists of two files forming a hash table. The hash
95 * consists of the first 56 bits of the result of applying MD5 to the
96 * input word. This scheme ensures that collisions are unlikely enough
97 * to make junk detection work; according to the birthday paradox, a
98 * 50 % probability for one single collision is reached at 2^28 entries.
100 * To make the chain structure independent from input, a random number is
101 * appended to the MD5 input. This makes it impossible that someone uses
102 * a carefully crafted message for a denial-of-service attack against the
/*
 * Layout of one node record in the nodes table.  SIZEOF_node is the
 * length of a record in bytes; each OF_node_* constant is the byte
 * offset of a field inside the record.  Going by the offsets, the hash
 * and next fields occupy four bytes each and the good, bad and final
 * fields three bytes each (4+4+3+3+3 = 17).
 *
 * OF_node_prob_O and OF_node_hash2 name the same offset: the field is
 * read as the second hash part only when table_version is at least 1
 * and is used for the precomputed probability in older tables.
 */
105 #define SIZEOF_node 17
106 #define OF_node_hash 0 /* first 32 bits of MD5 of word|mangle */
107 #define OF_node_next 4 /* bit-negated table index of next node */
108 #define OF_node_good 8 /* number of times this appeared in good msgs */
109 #define OF_node_bad 11 /* number of times this appeared in bad msgs */
110 #define OF_node_prob_O 14 /* table_version<1: precomputed probability */
111 #define OF_node_hash2 14 /* upper 3 bytes of MD5 hash */
/*
 * Layout of the super block.  SIZEOF_super is its total length in
 * bytes and each OF_super_* constant is the byte offset of a field.
 * Five four-byte fields (offsets 0 to 16) are followed at offset 20 by
 * the bucket array of 65536 entries of SIZEOF_entry bytes each, which
 * gives 20 + 65536*4 = 262164 bytes.  A bucket is selected with the low
 * 16 bits of the first hash value (h1 & MAX2).
 */
114 #define SIZEOF_super 262164
115 #define OF_super_size 0 /* allocated nodes in the chain file */
116 #define OF_super_used 4 /* used nodes in the chain file */
117 #define OF_super_ngood 8 /* number of good messages scanned so far */
118 #define OF_super_nbad 12 /* number of bad messages scanned so far */
119 #define OF_super_mangle 16 /* used to mangle the MD5 input */
120 #define OF_super_bucket 20 /* 65536 bit-negated node indices */
121 #define SIZEOF_entry 4
124 static size_t super_mmapped
;
125 static size_t nodes_mmapped
;
127 static int chained_tokens
;
133 * 1 Fixed the mangling; it was ineffective in version 0.
134 * Hash extended to 56 bits.
136 static int table_version
;
137 #define current_table_version 1
140 ((unsigned)(((char *)(e))[0]&0377) + \
141 ((unsigned)(((char *)(e))[1]&0377) << 8) + \
142 ((unsigned)(((char *)(e))[2]&0377) << 16))
145 (((char *)(e))[0] = (n) & 0x0000ff, \
146 ((char *)(e))[1] = ((n) & 0x00ff00) >> 8, \
147 ((char *)(e))[2] = ((n) & 0xff0000) >> 16)
/*
 * f2s: scale d by MAX3, convert the product to unsigned and pass it
 * through smin() together with MAX3.  smin() is defined elsewhere;
 * presumably it yields the smaller argument, so that the result never
 * exceeds the three-byte maximum -- TODO confirm.
 */
149 #define f2s(d) (smin(((unsigned)((d) * MAX3)), MAX3))
/*
 * s2f: the reverse scaling; divides s by MAX3 and yields a float.
 */
151 #define s2f(s) ((float)(s) / MAX3)
154 ((unsigned long)(((char *)(p))[0]&0377) + \
155 ((unsigned long)(((char *)(p))[1]&0377) << 8) + \
156 ((unsigned long)(((char *)(p))[2]&0377) << 16) + \
157 ((unsigned long)(((char *)(p))[3]&0377) << 24))
160 (((char *)(p))[0] = (n) & 0x000000ffUL, \
161 ((char *)(p))[1] = ((n) & 0x0000ff00UL) >> 8, \
162 ((char *)(p))[2] = ((n) & 0x00ff0000UL) >> 16, \
163 ((char *)(p))[3] = ((n) & 0xff000000UL) >> 24)
184 char field
[LINESIZE
];
/*
 * Nonzero if character c is accepted as part of the word currently being
 * collected; nextword() is the caller visible in this file.
 *
 *   c       the character under test
 *   b, i    buffer and index such that b[i-1] is the character that
 *           precedes c (only examined for '.', ',' and '/')
 *   price   when nonzero, '-' is rejected
 *   hadamp  when nonzero, ';' is accepted (presumably set after an
 *           earlier '&' -- confirm against the lexer state handling)
 *
 * Accepted are: bytes with the high bit set, anything alnumchar()
 * accepts, the characters ' " $ ! _ # % &, a ';' if hadamp is set, a
 * '-' if price is not set, and '.', ',' or '/' if i > 0 and the
 * preceding character is a digit.
 *
 * Note that c is evaluated many times, so it must be free of side
 * effects.
 */
187 #define constituent(c, b, i, price, hadamp) \
188 ((c) & 0200 || alnumchar(c) || (c) == '\'' || (c) == '"' || \
189 (c) == '$' || (c) == '!' || (c) == '_' || \
190 (c) == '#' || (c) == '%' || (c) == '&' || \
191 ((c) == ';' && hadamp) || \
192 ((c) == '-' && !(price)) || \
193 (((c) == '.' || (c) == ',' || (c) == '/') && \
194 (i) > 0 && digitchar((b)[(i)-1]&0377)))
/*
 * Nonzero if c is a seven-bit character (high bit clear) that is not a
 * control character (its value does not fit into the low five bits),
 * not DEL (0177), not whitespace according to spacechar(), and none of
 * the characters { } | backslash ^ ~ [ ] ` < > # and the double quote.
 * Judging by the name this selects characters that may continue a URL;
 * the call site is not visible in this excerpt.
 *
 * c is evaluated many times, so it must be free of side effects.
 */
196 #define url_xchar(c) \
197 (((c)&0200) == 0 && ((c)&037) != (c) && (c) != 0177 && \
198 !spacechar(c) && (c) != '{' && (c) != '}' && (c) != '|' && \
199 (c) != '\\' && (c) != '^' && (c) != '~' && (c) != '[' && \
200 (c) != ']' && (c) != '`' && (c) != '<' && (c) != '>' && \
201 (c) != '#' && (c) != '"')
/*
 * Explanatory text in two parts: README1 describes what the database
 * holds, README2 describes its storage in compress(1) format.  dbfp()
 * creates a file named "README" in the database directory when none
 * exists yet; presumably these strings are what it writes there (the
 * writing statements are not visible in this excerpt).
 */
213 static const char README1
[] = "\
214 This is a junk mail database maintained by mailx(1). It does not contain any\n\
215 of the actual words found in your messages. Instead, parts of MD5 hashes are\n\
216 used for lookup. It is thus possible to tell if some given word was likely\n\
217 contained in your mail from examining this data, at best.\n";
218 static const char README2
[] = "\n\
219 The database files are stored in compress(1) format by default. This saves\n\
220 some space, but leads to higher processor usage when the database is read\n\
221 or updated. You can use uncompress(1) on these files if you prefer to store\n\
222 them in flat form.\n";
226 static FILE *sfp
, *nfp
;
227 static char *sname
, *nname
;
229 static enum okay
getdb(int rw
);
230 static void putdb(void);
231 static void relsedb(void);
232 static FILE *dbfp(enum db db
, int rw
, int *compressed
, char **fn
);
233 static char *lookup(unsigned long h1
, unsigned long h2
, int create
);
234 static unsigned long grow(unsigned long size
);
235 static char *nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
236 struct lexstat
*sp
, int *stop
);
237 static void join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
);
238 static void add(const char *word
, enum entry entry
, struct lexstat
*sp
,
240 static enum okay
scan(struct message
*m
, enum entry entry
,
241 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
243 static void recompute(void);
244 static float getprob(char *n
);
245 static int insert(int *msgvec
, enum entry entry
, int incr
);
246 static void clsf(struct message
*m
);
247 static void rate(const char *word
, enum entry entry
, struct lexstat
*sp
,
249 static void dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
);
250 static void mkmangle(void);
259 chained_tokens
= value("chained-junk-tokens") != NULL
;
260 if ((sfp
= dbfp(SUPER
, rw
, &compressed
, &sname
)) == (FILE *)-1)
262 if (sfp
&& !compressed
) {
263 super
= mmap(NULL
, SIZEOF_super
,
264 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
265 MAP_SHARED
, fileno(sfp
), 0);
266 if (super
!= MAP_FAILED
) {
267 super_mmapped
= SIZEOF_super
;
272 super
= smalloc(SIZEOF_super
);
276 if ((compressed
? zread(zp
, super
, SIZEOF_super
)
278 fread(super
, 1, SIZEOF_super
, sfp
)
281 fprintf(stderr
, "Error reading junk mail database.\n");
282 memset(super
, 0, SIZEOF_super
);
288 } else if (compressed
)
291 memset(super
, 0, SIZEOF_super
);
294 skip
: if ((n
= getn(&super
[OF_super_size
])) == 0) {
296 putn(&super
[OF_super_size
], 1);
298 if (sfp
&& (nfp
= dbfp(NODES
, rw
, &compressed
, &nname
)) != NULL
) {
299 if (nfp
== (FILE *)-1) {
305 if (sfp
&& nfp
&& !compressed
) {
306 nodes
= mmap(NULL
, n
* SIZEOF_node
,
307 rw
!=O_RDONLY
? PROT_READ
|PROT_WRITE
: PROT_READ
,
308 MAP_SHARED
, fileno(nfp
), 0);
309 if (nodes
!= MAP_FAILED
) {
310 nodes_mmapped
= n
* SIZEOF_node
;
315 nodes
= smalloc(n
* SIZEOF_node
);
319 if ((compressed
? zread(zp
, nodes
, n
* SIZEOF_node
)
321 fread(nodes
, 1, n
* SIZEOF_node
, nfp
)
322 != n
* SIZEOF_node
) ||
324 fprintf(stderr
, "Error reading junk mail database.\n");
325 memset(nodes
, 0, n
* SIZEOF_node
);
326 memset(super
, 0, SIZEOF_super
);
328 putn(&super
[OF_super_size
], n
);
335 memset(nodes
, 0, n
* SIZEOF_node
);
349 if (!super_mmapped
&& (sfp
= dbfp(SUPER
, O_WRONLY
, &scomp
, &sname
))
350 == NULL
|| sfp
== (FILE *)-1)
352 if (!nodes_mmapped
&& (nfp
= dbfp(NODES
, O_WRONLY
, &ncomp
, &nname
))
353 == NULL
|| nfp
== (FILE *)-1)
355 if (super_mmapped
== 0 || nodes_mmapped
== 0)
358 * Use utime() with mmap() since Linux does not update st_mtime
359 * reliably otherwise.
365 zwrite(zp
, super
, SIZEOF_super
);
369 fwrite(super
, 1, SIZEOF_super
, sfp
);
374 zwrite(zp
, nodes
, getn(&super
[OF_super_size
]) * SIZEOF_node
);
379 getn(&super
[OF_super_size
]) * SIZEOF_node
, nfp
);
380 if (super_mmapped
== 0 || nodes_mmapped
== 0)
388 munmap(super
, super_mmapped
);
393 munmap(nodes
, nodes_mmapped
);
397 if (sfp
&& sfp
!= (FILE *)-1) {
401 if (nfp
&& nfp
!= (FILE *)-1) {
408 dbfp(enum db db
, int rw
, int *compressed
, char **fn
)
414 { "super", "nodes" },
415 { "super1", "nodes1" }
419 { "super.Z", "nodes.Z" },
420 { "super1.Z", "nodes1.Z" }
425 if ((dir
= value("junkdb")) == NULL
) {
426 fprintf(stderr
, "No junk mail database specified. "
427 "Set the junkdb variable.\n");
431 if (makedir(dir
) == STOP
) {
432 fprintf(stderr
, "Cannot create directory \"%s\"\n.", dir
);
436 table_version
= current_table_version
;
437 loop
: sf
= sfx
[table_version
];
438 zf
= zfx
[table_version
];
439 *fn
= salloc((n
= strlen(dir
)) + 40);
443 strcpy(&(*fn
)[n
+1], sf
[db
]);
444 if ((fp
= Fopen(*fn
, rw
!=O_RDONLY
? "r+" : "r")) != NULL
)
447 strcpy(&(*fn
)[n
+1], zf
[db
]);
448 if ((fp
= Fopen(*fn
, rw
? "r+" : "r")) == NULL
&&
449 rw
==O_WRONLY
? (fp
= Fopen(*fn
, "w+")) == NULL
: 0) {
450 fprintf(stderr
, "Cannot open junk mail database \"%s\".\n",*fn
);
454 strcpy(&(*fn
)[n
+1], "README");
455 if (access(*fn
, F_OK
) < 0 && (rp
= Fopen(*fn
, "w")) != NULL
) {
460 } else if (fp
== NULL
) {
461 if (table_version
> 0) {
465 table_version
= current_table_version
;
468 flp
.l_type
= rw
!=O_RDONLY
? F_WRLCK
: F_RDLCK
;
471 flp
.l_whence
= SEEK_SET
;
472 fcntl(fileno(fp
), F_SETLKW
, &flp
);
478 lookup(unsigned long h1
, unsigned long h2
, int create
)
480 char *n
, *lastn
= NULL
;
481 unsigned long c
, lastc
= MAX4
, used
, size
;
483 used
= getn(&super
[OF_super_used
]);
484 size
= getn(&super
[OF_super_size
]);
485 c
= ~getn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
]);
486 n
= &nodes
[c
*SIZEOF_node
];
488 if (getn(&n
[OF_node_hash
]) == h1
&&
489 (table_version
< 1 ? 1 :
490 get(&n
[OF_node_hash2
]) == h2
))
494 c
= ~getn(&n
[OF_node_next
]);
495 n
= &nodes
[c
*SIZEOF_node
];
499 if ((size
= grow(size
)) == 0)
501 lastn
= &nodes
[lastc
*SIZEOF_node
];
503 putn(&super
[OF_super_used
], used
+1);
504 n
= &nodes
[used
*SIZEOF_node
];
505 putn(&n
[OF_node_hash
], h1
);
506 put(&n
[OF_node_hash2
], h2
);
508 putn(&lastn
[OF_node_next
], ~used
);
510 putn(&super
[OF_super_bucket
+ (h1
&MAX2
)*SIZEOF_entry
],
518 grow(unsigned long size
)
520 unsigned long incr
, newsize
;
523 incr
= size
> MAX2
? MAX2
: size
;
524 newsize
= size
+ incr
;
525 if (newsize
> MAX4
-MAX2
) {
526 oflo
: fprintf(stderr
, "Junk mail database overflow.\n");
530 if (lseek(fileno(nfp
), newsize
*SIZEOF_node
-1, SEEK_SET
)
531 == (off_t
)-1 || write(fileno(nfp
),"\0",1) != 1)
534 if ((nodes
= mremap(nodes
, nodes_mmapped
, newsize
*SIZEOF_node
,
535 MREMAP_MAYMOVE
)) == MAP_FAILED
) {
536 if ((nodes
= mmap(NULL
, newsize
*SIZEOF_node
,
538 PROT_READ
|PROT_WRITE
:
540 MAP_SHARED
, fileno(nfp
), 0))
545 munmap(onodes
, nodes_mmapped
);
547 nodes_mmapped
= newsize
*SIZEOF_node
;
549 nodes
= srealloc(nodes
, newsize
*SIZEOF_node
);
550 memset(&nodes
[size
*SIZEOF_node
], 0, incr
*SIZEOF_node
);
553 putn(&super
[OF_super_size
], size
);
558 if (i+j >= (long)*bufsize-4) \
559 *buf = srealloc(*buf, *bufsize += 32); \
561 i += (*buf)[j+i] != '\0'; \
565 nextword(char **buf
, size_t *bufsize
, size_t *count
, FILE *fp
,
566 struct lexstat
*sp
, int *stop
)
575 for (cp
= sp
->save
; *cp
; cp
++) {
583 if (sp
->loc
== FROM_LINE
)
584 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
593 if (sp
->loc
== HEADER
&& sp
->field
[0]) {
594 field
: cp
= sp
->field
;
607 while (*count
> 0 && (c
= getc(fp
)) != EOF
) {
609 if (c
== '\0' && table_version
>= 1) {
615 if (c
== '\b' && table_version
>= 1) {
616 sp
->html
= HTML_TEXT
;
619 if (c
== '<' && sp
->html
== HTML_TEXT
) {
624 if (sp
->html
== HTML_TAG
) {
627 if (!asccasecmp(sp
->tag
, "a") ||
628 !asccasecmp(sp
->tag
, "img") ||
629 !asccasecmp(sp
->tag
, "font") ||
630 !asccasecmp(sp
->tag
, "span") ||
631 !asccasecmp(sp
->tag
, "meta") ||
632 !asccasecmp(sp
->tag
, "table") ||
633 !asccasecmp(sp
->tag
, "tr") ||
634 !asccasecmp(sp
->tag
, "td") ||
635 !asccasecmp(sp
->tag
, "p"))
636 sp
->html
= HTML_TEXT
;
638 sp
->html
= HTML_SKIP
;
639 } else if (c
== '>') {
640 sp
->html
= HTML_TEXT
;
643 if (sp
->tagp
- sp
->tag
< sizeof sp
->tag
- 1)
648 if (sp
->html
== HTML_SKIP
) {
650 sp
->html
= HTML_TEXT
;
653 if (c
== '$' && i
== 0)
655 if (sp
->loc
== HEADER
&& sp
->lastc
== '\n') {
658 while (k
< sizeof sp
->field
- 3) {
661 (c
= getc(fp
)) == EOF
)
663 if (spacechar(c
) || c
== ':') {
670 sp
->field
[k
++] = '*';
675 } else if (c
== '\n') {
678 sp
->html
= HTML_NONE
;
685 cp
= sp
->save
= smalloc(i
+6);
686 for (cq
= "HOST*"; *cq
; cq
++)
688 for (cq
= &(*buf
)[j
]; *cq
!= ':'; cq
++);
689 cq
+= 3; /* skip "://" */
690 while (cq
< &(*buf
)[i
+j
] &&
691 (alnumchar(*cq
&0377) ||
692 *cq
== '.' || *cq
== '-'))
699 } else if (constituent(c
, *buf
, i
+j
, sp
->price
, sp
->hadamp
) ||
700 sp
->loc
== HEADER
&& c
== '.' &&
701 asccasecmp(sp
->field
, "subject*")) {
705 } else if (i
> 0 && c
== ':' && *count
> 2) {
706 if ((c
= getc(fp
)) != '/') {
711 if ((c
= getc(fp
)) != '/') {
720 for (cq
= "URL*"; *cq
; cq
++) {
726 if (alnumchar(*cp
&0377)) {
731 for (cq
= "://"; *cq
; cq
++) {
734 } else if (i
> 1 && ((*buf
)[i
+j
-1] == ',' ||
735 (*buf
)[i
+j
-1] == '.') && !digitchar(c
)) {
749 for (k
= 0; k
< i
; k
++)
750 if (digitchar((*buf
)[k
+j
]&0377))
752 else if (!alphachar((*buf
)[k
+j
]&0377) &&
753 (*buf
)[k
+j
] != '$') {
760 * Including the results of other filtering software (the
761 * 'X-Spam' fields) might seem tempting, but will also rate
762 * their false negatives good with this filter. Therefore
763 * these fields are ignored.
765 * Handling 'Received' fields is difficult since they include
766 * lots of both useless and interesting words for our purposes.
768 if (sp
->loc
== HEADER
&&
769 (asccasecmp(sp
->field
, "message-id*") == 0 ||
770 asccasecmp(sp
->field
, "references*") == 0 ||
771 asccasecmp(sp
->field
, "in-reply-to*") == 0 ||
772 asccasecmp(sp
->field
, "status*") == 0 ||
773 asccasecmp(sp
->field
, "x-status*") == 0 ||
774 asccasecmp(sp
->field
, "date*") == 0 ||
775 asccasecmp(sp
->field
, "delivery-date*") == 0 ||
776 ascncasecmp(sp
->field
, "x-spam", 6) == 0 ||
777 ascncasecmp(sp
->field
, "x-pstn", 6) == 0 ||
778 ascncasecmp(sp
->field
, "x-scanned", 9) == 0 ||
779 asccasecmp(sp
->field
, "received*") == 0 &&
780 ((2*c
> i
) || i
< 4 ||
781 asccasestr(*buf
, "localhost") != NULL
)))
788 #define JOINCHECK if (i >= *bufsize) \
789 *buf = srealloc(*buf, *bufsize += 32)
791 join(char **buf
, size_t *bufsize
, const char *s1
, const char *s2
)
809 add(const char *word
, enum entry entry
, struct lexstat
*sp
, int incr
)
812 unsigned long h1
, h2
;
815 dbhash(word
, &h1
, &h2
);
816 if ((n
= lookup(h1
, h2
, 1)) != NULL
) {
819 c
= get(&n
[OF_node_good
]);
820 if (incr
>0 && c
<MAX3
-incr
|| incr
<0 && c
>=-incr
) {
822 put(&n
[OF_node_good
], c
);
826 c
= get(&n
[OF_node_bad
]);
827 if (incr
>0 && c
<MAX3
-incr
|| incr
<0 && c
>=-incr
) {
829 put(&n
[OF_node_bad
], c
);
837 scan(struct message
*m
, enum entry entry
,
838 void (*func
)(const char *, enum entry
, struct lexstat
*, int),
842 char *buf0
= NULL
, *buf1
= NULL
, *buf2
= NULL
, **bp
, *cp
;
843 size_t bufsize0
= 0, bufsize1
= 0, bufsize2
= 0, *zp
, count
;
847 if ((fp
= Ftemp(&cp
, "Ra", "w+", 0600, 1)) == NULL
) {
853 if (send(m
, fp
, NULL
, NULL
, SEND_TOFLTR
, NULL
) < 0) {
859 sp
= scalloc(1, sizeof *sp
);
863 while (nextword(bp
, zp
, &count
, fp
, sp
, &stop
) != NULL
) {
864 (*func
)(*bp
, entry
, sp
, arg
);
865 if (chained_tokens
&& buf0
&& *buf0
&& buf1
&& *buf1
&& !stop
) {
866 join(&buf2
, &bufsize2
, bp
== &buf1
? buf0
: buf1
, *bp
);
867 (*func
)(buf2
, entry
, sp
, arg
);
869 bp
= bp
== &buf1
? &buf0
: &buf1
;
870 zp
= zp
== &bufsize1
? &bufsize0
: &bufsize1
;
883 unsigned long used
, i
;
888 used
= getn(&super
[OF_super_used
]);
889 for (i
= 0; i
< used
; i
++) {
890 n
= &nodes
[i
*SIZEOF_node
];
893 put(&n
[OF_node_prob_O
], s
);
900 unsigned long ngood
, nbad
;
904 ngood
= getn(&super
[OF_super_ngood
]);
905 nbad
= getn(&super
[OF_super_nbad
]);
906 if (ngood
+ nbad
>= 18000) {
909 } else if (ngood
+ nbad
>= 9000) {
916 g
= get(&n
[OF_node_good
]) * 2;
917 b
= get(&n
[OF_node_bad
]);
919 p
= smin(1.0, nbad
? (float)b
/nbad
: 0.0) /
920 (smin(1.0, ngood
? (float)g
/ngood
: 0.0) +
921 smin(1.0, nbad
? (float)b
/nbad
: 0.0));
924 if (p
== TOP
&& b
<= 10 && g
== 0)
926 else if (p
== BOT
&& g
<= 10 && b
== 0)
928 } else if (g
== 0 && b
== 0)
936 insert(int *msgvec
, enum entry entry
, int incr
)
941 verbose
= value("verbose") != NULL
;
942 if (getdb(O_RDWR
) != OKAY
)
946 u
= getn(&super
[OF_super_ngood
]);
949 u
= getn(&super
[OF_super_nbad
]);
952 for (ip
= msgvec
; *ip
; ip
++) {
953 setdot(&message
[*ip
-1]);
954 if (incr
> 0 && u
== MAX4
-incr
+1) {
955 fprintf(stderr
, "Junk mail database overflow.\n");
957 } else if (incr
< 0 && -incr
> u
) {
958 fprintf(stderr
, "Junk mail database underflow.\n");
962 if (entry
== GOOD
&& incr
> 0 || entry
== BAD
&& incr
< 0)
963 message
[*ip
-1].m_flag
&= ~MJUNK
;
965 message
[*ip
-1].m_flag
|= MJUNK
;
966 scan(&message
[*ip
-1], entry
, add
, incr
);
970 putn(&super
[OF_super_ngood
], u
);
973 putn(&super
[OF_super_nbad
], u
);
976 if (table_version
< 1)
986 return insert(v
, GOOD
, 1);
992 return insert(v
, BAD
, 1);
998 return insert(v
, GOOD
, -1);
1004 return insert(v
, BAD
, -1);
1010 int *msgvec
= v
, *ip
;
1012 verbose
= value("verbose") != NULL
;
1013 _debug
= debug
|| value("debug") != NULL
;
1014 if (getdb(O_RDONLY
) != OKAY
)
1016 for (ip
= msgvec
; *ip
; ip
++) {
1017 setdot(&message
[*ip
-1]);
1018 clsf(&message
[*ip
-1]);
1029 unsigned long hash1
;
1030 unsigned long hash2
;
1035 clsf(struct message
*m
)
1038 float a
= 1, b
= 1, r
;
1041 fprintf(stderr
, "Examining message %d\n",
1042 (int)(m
- &message
[0] + 1));
1043 for (i
= 0; i
< BEST
; i
++) {
1047 if (scan(m
, -1, rate
, 0) != OKAY
)
1049 if (best
[0].prob
== -1) {
1051 fprintf(stderr
, "No information found.\n");
1052 m
->m_flag
&= ~MJUNK
;
1055 for (i
= 0; i
< BEST
; i
++) {
1056 if (best
[i
].prob
== -1)
1059 fprintf(stderr
, "Probe %2d: \"%s\", hash=%lu:%lu "
1060 "prob=%.4g dist=%.4g\n",
1061 i
+1, prstr(best
[i
].word
),
1062 best
[i
].hash1
, best
[i
].hash2
,
1063 best
[i
].prob
, best
[i
].dist
);
1065 b
*= 1 - best
[i
].prob
;
1067 r
= a
+b
> 0 ? a
/ (a
+b
) : 0;
1069 fprintf(stderr
, "Junk probability of message %d: %g\n",
1070 (int)(m
- &message
[0] + 1), r
);
1074 m
->m_flag
&= ~MJUNK
;
1079 rate(const char *word
, enum entry entry
, struct lexstat
*sp
, int unused
)
1082 unsigned long h1
, h2
;
1086 dbhash(word
, &h1
, &h2
);
1087 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1092 fprintf(stderr
, "h=%lu:%lu g=%u b=%u p=%.4g %s\n", h1
, h2
,
1093 n
? get(&n
[OF_node_good
]) : 0,
1094 n
? get(&n
[OF_node_bad
]) : 0,
1098 d
= p
>= MID
? p
- MID
: MID
- p
;
1099 if (d
>= best
[BEST
-1].dist
)
1100 for (i
= 0; i
< BEST
; i
++) {
1101 if (h1
== best
[i
].hash1
&& h2
== best
[i
].hash2
)
1104 * For equal distance, this selection prefers
1105 * words with a low probability, since a false
1106 * negative is better than a false positive,
1107 * and since experience has shown that false
1108 * positives are more likely otherwise. Then,
1109 * words from the end of the header and from
1110 * the start of the body are preferred. This
1111 * gives the most interesting verbose output.
1113 if (d
> best
[i
].dist
||
1114 d
== best
[i
].dist
&&
1116 best
[i
].loc
== HEADER
&&
1117 d
== best
[i
].dist
) {
1118 for (j
= BEST
-2; j
>= i
; j
--)
1119 best
[j
+1] = best
[j
];
1122 best
[i
].word
= savestr(word
);
1125 best
[i
].loc
= sp
->loc
;
1132 dbhash(const char *word
, unsigned long *h1
, unsigned long *h2
)
1134 unsigned char digest
[16];
1138 MD5Update(&ctx
, (unsigned char *)word
, strlen(word
));
1139 if (table_version
>= 1)
1140 MD5Update(&ctx
, (unsigned char *)&super
[OF_super_mangle
], 4);
1141 MD5Final(digest
, &ctx
);
1143 if (table_version
< 1) {
1144 *h1
^= getn(&super
[OF_super_mangle
]);
1147 *h2
= get(&digest
[4]);
1151 * The selection of the value for mangling is not critical. It is practically
1152 * impossible for any person to determine the exact time when the database
1153 * was first created (without looking at the database, which would reveal the
1154 * value anyway), so we just use this. The MD5 hash here ensures that each
1155 * single second gives a completely different mangling value (which is not
1156 * necessary anymore if table_version>=1, but does not hurt).
1166 unsigned char digest
[16];
1169 memset(&u
, 0, sizeof u
);
1172 MD5Update(&ctx
, (unsigned char *)u
.c
, sizeof u
.c
);
1173 MD5Final(digest
, &ctx
);
1175 putn(&super
[OF_super_mangle
], s
);
1179 cprobability(void *v
)
1182 unsigned long used
, ngood
, nbad
;
1183 unsigned long h1
, h2
;
1188 if (*args
== NULL
) {
1189 fprintf(stderr
, "No words given.\n");
1192 if (getdb(O_RDONLY
) != OKAY
)
1194 used
= getn(&super
[OF_super_used
]);
1195 ngood
= getn(&super
[OF_super_ngood
]);
1196 nbad
= getn(&super
[OF_super_nbad
]);
1197 printf("Database statistics: tokens=%lu ngood=%lu nbad=%lu\n",
1200 dbhash(*args
, &h1
, &h2
);
1201 printf("\"%s\", hash=%lu:%lu ", *args
, h1
, h2
);
1202 if ((n
= lookup(h1
, h2
, 0)) != NULL
) {
1203 g
= get(&n
[OF_node_good
]);
1204 b
= get(&n
[OF_node_bad
]);
1205 printf("good=%u bad=%u ", g
, b
);
1208 d
= p
>= MID
? p
- MID
: MID
- p
;
1209 printf("prob=%.4g dist=%.4g", p
, d
);
1211 printf("too infrequent");
1213 printf("not in database");
1220 #else /* !USE_JUNK */
1225 fputs(catgets(catd
, CATSET
, 270, "No JUNK support compiled in.\n"),
1266 cprobability(void *v
)
1272 #endif /* USE_JUNK */