From 1a4d3694dbbf45f96519420e38ac49fe9588bf7a Mon Sep 17 00:00:00 2001
From: Bryan Steinbach
Date: Sun, 31 Oct 2010 22:52:06 -0700
Subject: [PATCH] Imported problem4

---
 week8/problem4/INSTRUCTIONS   |  73 +++++++++++++++++
 week8/problem4/README         |  80 +++++++++++++++++++
 week8/problem4/bloom.py       | 182 ++++++++++++++++++++++++++++++++++++++++++
 week8/problem4/importdictdata |  18 +++++
 week8/problem4/indict         |  36 +++++++++
 5 files changed, 389 insertions(+)
 create mode 100644 week8/problem4/INSTRUCTIONS
 create mode 100644 week8/problem4/README
 create mode 100644 week8/problem4/bloom.py
 create mode 100755 week8/problem4/importdictdata
 create mode 100755 week8/problem4/indict

diff --git a/week8/problem4/INSTRUCTIONS b/week8/problem4/INSTRUCTIONS
new file mode 100644
index 0000000..bcccab7
--- /dev/null
+++ b/week8/problem4/INSTRUCTIONS
@@ -0,0 +1,73 @@
+Your mission for this breakout session is to do the following:
+
+1. Read the beginning of the file README for some context as to the
+   purpose of the Python code you've just checked out. (You can
+   skip the "Implementation" section and everything after it for now.)
+
+2. Create and check out a new branch named "work" based on the current
+   "master" branch of this repository.
+
+3. Add a feature to the program "indict" that has it print out
+   the false-positive rate of its filter algorithm. The code to
+   compute this rate is already in place -- you just need to
+   hook up the printing part of things. (The target rate is 5%.)
+   Briefly document the option in the file "README".
+
+4. In the course of doing this, you should encounter an OBVIOUS
+   bug in the false-positive-rate code in the file "bloom.py". Fix it.
+   (You don't need any understanding of the code to spot the bug.)
+
+5. Update the version of the program listed at the top of the
+   README to 2.0.
+
+6. Commit your changes to the repository **as three separate
+   commits**:
+
+   - One commit fixing the false-positive-rate bug
+   - One commit adding the new option and documenting it
+     in the README
+   - One commit bumping the version number at the top of the README
+
+7. Merge your work back into the "master" branch.
+
+8. Tag your latest master commit as "v2.0". Congratulations, you've
+   made a software release!
+
+If you run into problems, don't hesitate to ask for help -- Git
+is a sharp tool and you can occasionally get your repository into
+a weird state if you start flailing. The standard commands are
+pretty good about giving you sound advice, however.
+
+If you finish with time to spare
+========================================
+
+Here are some things you could try investigating:
+
+- Fool around with blowing away files and magically rescuing them
+  via "git checkout".
+
+- Which commit introduced the false-positive-rate bug? (Try "git
+  blame".)
+
+- Explore the repository contents and history interactively and
+  prettily with your web browser by visiting
+
+  http://github.com/pkgwdemo/bloomdemo
+
+- On a conceptual level, what changes were made between versions
+  1.0 and 2.0? ("git log")
+
+- What is the full source-code "diff" between versions 1.0 and 2.0?
+  ("git diff")
+
+- Try using "git rebase -i" to rewrite the repository history
+  so that the false-positive-rate bug is never introduced. What happens
+  to the ID of the most recent commit after the rebase, and why?
+
+- Create a few temporary branches to engineer a merge conflict, and
+  resolve it.
+
+- The very astute user will notice that the timestamps in the
+  commit history of the repository do not proceed chronologically.
+  What does that suggest about how I constructed the repository
+  history?
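
Steps 2 through 8 of the exercise map onto a handful of standard Git
commands. One possible sequence -- the commit messages are only
suggestions, and each commit should come after the corresponding edit:

  $ git checkout -b work master
  (fix the bug in bloom.py)
  $ git commit -am "Fix the false-positive-rate bug"
  (add the -f option to indict and document it in README)
  $ git commit -am "Add -f option to report the false-positive rate"
  (bump the version number at the top of README)
  $ git commit -am "Bump version to 2.0"
  $ git checkout master
  $ git merge work
  $ git tag v2.0
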
diff --git a/week8/problem4/README b/week8/problem4/README
new file mode 100644
index 0000000..e8ffd15
--- /dev/null
+++ b/week8/problem4/README
@@ -0,0 +1,80 @@
+Bloom Filter Demo
+Version: 2.0
+========================================
+
+This Git repository contains a very small Python program, "indict",
+which tells you whether a word might be in the system
+dictionary. Sample usage is:
+
+  $ ./indict barn bern birn born burn
+  barn MIGHT BE in the dictionary
+  bern is DEFINITELY NOT in the dictionary
+  birn MIGHT BE in the dictionary
+  born MIGHT BE in the dictionary
+  burn MIGHT BE in the dictionary
+
+"indict" accepts an option, "-s", which tells it not to
+print out information about words that are definitely not in the
+dictionary:
+
+  $ ./indict -s barn bern birn
+  barn MIGHT BE in the dictionary
+  birn MIGHT BE in the dictionary
+
+"indict" also accepts an option, "-f", which tells it to print
+the false-positive rate of its filter:
+
+  $ ./indict -f
+  False positive rate: 0.05
+
+Implementation
+========================================
+
+This program is implemented using a data structure called a Bloom
+filter. YOU DON'T HAVE TO WORRY ABOUT HOW IT WORKS FOR THIS EXERCISE,
+but you may find it interesting. Bloom filters are essentially like
+Python sets: you can add items to them and later test whether they
+contain a given item. For a given number of items, a Bloom filter is
+much more efficient in computation time and memory than a Python
+set. The price of this efficiency is that a Bloom filter can yield
+false positives: it may say that a word is in the dictionary when it
+actually isn't. On the other hand, a Bloom filter will never yield
+false negatives: it won't say that a word isn't in the dictionary when
+it actually is.
+
+Bloom filters are useful when you have a large dictionary-type
+database that is slow to scan. If you're going to check the database
+for the presence of many keys that it may not contain, the filter can
+help you avoid a large number of slow checks to the database itself.
+
+NOTE WELL that this particular demo is not any faster than just
+grepping the system dictionary (/usr/share/dict/words on traditional
+Unix systems). This is because of the significant overhead of starting
+up the Python interpreter and loading the Bloom filter data, as well as
+the simple nature of the dictionary "database". Furthermore, the
+filter implementation that I slapped together in 20 minutes is
+completely unoptimized. The best data structure is the one that you
+don't use because you don't actually need it.
+
+Repository Contents
+========================================
+
+.gitignore -- tells "git status" not to report boring files
+README -- this document
+bloom.py -- a simple Bloom filter implementation. Don't use it
+  for anything real, because if you actually have a problem that
+  calls for a Bloom filter, you'll almost surely need an
+  implementation that was actually designed with care.
+dictbf.dat.gz -- the precomputed filter data. (It takes a long
+  time to compute the filter data from the dictionary.)
+importdictdata -- the program to precompute the filter data
+indict -- the main dictionary testing program
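
The guarantees described in the README -- possible false positives,
but never false negatives -- are easy to see directly from Python.
Here is a minimal sketch using the bloom.py module from this patch
(Python 2, like the rest of the code here); the word list is arbitrary:

  import bloom

  # Build a small filter sized for about three items at a 5% target rate.
  bf = bloom.optimalBloom (0.05, 3)

  for word in ('barn', 'born', 'burn'):
      bf.add (word)

  # Every word we added must be reported as a possible member:
  # the filter never yields false negatives.
  assert all (bf.maycontain (w) for w in ('barn', 'born', 'burn'))

  # A word we never added is usually rejected, but a True here
  # would be a legitimate false positive, not a bug.
  print bf.maycontain ('bern')
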
diff --git a/week8/problem4/bloom.py b/week8/problem4/bloom.py
new file mode 100644
index 0000000..d40bd63
--- /dev/null
+++ b/week8/problem4/bloom.py
@@ -0,0 +1,182 @@
+"""bloom - a cheesy Bloom filter implementation
+
+This module implements a lame version of a Bloom filter, a simple
+set-like data structure. You don't need to worry about the details
+of it if you don't want to, but it's a nice example of a cute
+data structure.
+
+A Bloom filter has the following properties:
+
+- You can "add" known items to the filter.
+- You can then query the filter as to whether it contains a given
+  item. There may be false positives (the filter says it contains the
+  item when it doesn't really) but there are never false negatives
+  (the filter never says that it doesn't contain the item when it
+  does).
+
+Bloom filters provide these characteristics cheaply in terms of both
+memory and computation time. This makes them useful as filters in
+front of large backing data stores; for instance, if you have a giant
+disk database with many records that you're going to query for items
+it will frequently not contain, a Bloom filter can allow you to avoid
+many of the queries to the full disk.
+
+The filter is defined by an array of "m" bits and a set of "k"
+functions mapping a filter item to a number "n", 0 <= n < m. The bits
+all start out as 0. When you add an item to the filter, you evaluate
+each of the k functions and use each function's result as an index
+into the bit array, setting the corresponding bits to 1. When you
+check for the presence of an item, you evaluate the functions and test
+each corresponding bit. If any are 0, the item has definitely not been
+added to the filter.
+
+For more information, see
+
+http://en.wikipedia.org/wiki/Bloom_filter
+
+Here are some equations describing the behavior of Bloom filters.
+Let
+
+m = the number of bits in the filter
+k = the number of functions
+n = the number of items in the filter
+p = the false-positive rate
+
+Given k, n, m, then
+
+p = (1 - exp (-k * n / m)) ** k
+
+Given n, m, then the optimal k is
+
+k = (ln 2) * m / n
+
+Given n, a desired p, and the desire to use the optimal k,
+
+m = -n * ln p / (ln 2)**2
+k = -ln p / (ln 2)
+
+"""
+
+# Activate "true" division: 1 / 2 = 0.5, not 0. See PEP 238.
+from __future__ import division
+import numpy as N
+import hashlib
+
+
+def makeHashFunc (m, salt):
+    """Return a *function* that hashes an input value. The function that
+    we return takes a string argument and returns an integer n with
+    0 <= n < m. Our arguments are m, and a "salt" value that we mix
+    into the hash. This allows us to create many different hash
+    functions from the single SHA1 function.
+
+    These functions will be called many times, so speed is important. We
+    take the simple route; a real implementation would worry a lot more
+    about optimizing how this is done."""
+
+    salt = str (salt)
+
+    def f (value):
+        # Get a 20-byte string representing the SHA1 cryptographic
+        # hash of the input plus the "salt" string.
+        h = hashlib.sha1 ()
+        h.update (salt)
+        h.update (value)
+        digest = h.digest ()
+
+        # Convert the digest from a string to an integer modulo m,
+        # treating each byte as one base-256 digit. Python handles
+        # bigints automagically so we don't have to worry about the
+        # high bits causing problems.
+        v = 0
+        scale = 1
+
+        for b in digest:
+            v = (v + ord (b) * scale) % m
+            scale *= 256
+
+        return v
+
+    return f
+
+
+class BloomFilter (object):
+    def __init__ (self, m, k):
+        assert m > 0
+        assert k > 0
+
+        if m % 32 != 0:
+            raise ValueError ("This lame function can only accept m's that"
+                              " are multiples of 32; I got %d" % m)
+        self.m = m
+        self.k = k
+
+        self.bits = N.zeros (m // 32, dtype=N.uint32)
+        self.n = 0
+        self.funcs = [makeHashFunc (m, i) for i in xrange (k)]
+
+
+    def fprate (self):
+        r = (1. - N.exp (-self.k * self.n / self.m))
+        r **= self.k
+        return r
+
+
+    def add (self, item):
+        for func in self.funcs:
+            n = func (item)
+            dword = n // 32
+            bit = n % 32
+            self.bits[dword] |= (1 << bit)
+
+        self.n += 1
+
+
+    def maycontain (self, item):
+        for func in self.funcs:
+            n = func (item)
+            dword = n // 32
+            bit = n % 32
+
+            if self.bits[dword] & (1 << bit) == 0:
+                return False
+
+        return True
+
+
+    def clear (self):
+        self.bits.fill (0)
+        # Also reset the item count so fprate() stays meaningful.
+        self.n = 0
+
+
+    # These functions allow us to save and load object state
+    # through Python's standard "pickle" system. Populating the filter
+    # is slow, so we'll speed things up by saving and restoring
+    # the filter state.
+
+    def __getstate__ (self):
+        return self.k, self.n, self.bits
+
+
+    def __setstate__ (self, state):
+        self.k, self.n, self.bits = state
+        self.m = self.bits.size * 32
+        self.funcs = [makeHashFunc (self.m, i) for i in xrange (self.k)]
+
+
+def optimalBloom (fprate, nexpected):
+    """Create and return a well-optimized Bloom filter given a desired
+    false-positive rate and an expected number of items to be added to
+    the filter. This involves figuring out the optimal number of bits
+    of data and the number of functions we need, using the equations
+    given in the module docstring."""
+
+    ln2 = N.log (2)
+
+    m = -nexpected * N.log (fprate) / ln2**2
+    m = int (N.ceil (m))
+    # Round up to the nearest multiple of 32.
+    m = (m + 31) & ~0x1F
+
+    k = ln2 * m / nexpected
+    k = max (int (k), 1)
+
+    return BloomFilter (m, k)
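
To make the sizing equations in the module docstring concrete, here is
a short worked example that plugs in the numbers this demo actually
uses (a 5% false-positive rate and the roughly 479,000-word dictionary
mentioned in importdictdata):

  from __future__ import division
  import numpy as N

  p = 0.05     # desired false-positive rate
  n = 479000   # expected number of items (dictionary words)
  ln2 = N.log (2)

  m = -n * N.log (p) / ln2**2   # bits needed: about 2.99 million (~365 KB)
  k = -N.log (p) / ln2          # optimal number of functions: about 4.32

  print int (N.ceil (m)), int (round (k))

So optimalBloom ends up with k = 4 after truncation, and the whole
filter fits in a few hundred kilobytes -- far less memory than holding
every dictionary word in a Python set.
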
diff --git a/week8/problem4/importdictdata b/week8/problem4/importdictdata
new file mode 100755
index 0000000..f42bd37
--- /dev/null
+++ b/week8/problem4/importdictdata
@@ -0,0 +1,18 @@
+#! /usr/bin/env python
+# -*- python -*-
+
+"""Read the dictionary into a Bloom filter and save the filter
+state for quick loading later."""
+
+import gzip, cPickle
+import bloom
+
+# This stuff is specific to Peter's machine: the dictionary contains about
+# 479000 words and lives in the standard Unixy location.
+
+bf = bloom.optimalBloom (0.05, 479000)
+
+for line in open ('/usr/share/dict/words'):
+    bf.add (line.strip ())
+
+cPickle.dump (bf, gzip.GzipFile ('dictbf.dat.gz', 'wb', 9))
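
The 5% figure passed to optimalBloom is a prediction, not a
measurement. A quick way to sanity-check the saved filter is to probe
it with random strings that are almost certainly not dictionary words;
the fraction it accepts estimates the real false-positive rate. A
rough sketch (run it from the repository directory so that bloom.py is
importable; the file name matches the one written above):

  import gzip, cPickle, random, string

  bf = cPickle.load (gzip.GzipFile ('dictbf.dat.gz', 'rb'))

  # Random 12-letter strings are essentially never real words, so
  # any acceptance here is almost surely a false positive.
  trials = 10000
  hits = 0

  for _ in xrange (trials):
      junk = ''.join (random.choice (string.lowercase) for _ in xrange (12))
      if bf.maycontain (junk):
          hits += 1

  print 'empirical false-positive rate:', hits / float (trials)
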
diff --git a/week8/problem4/indict b/week8/problem4/indict
new file mode 100755
index 0000000..efc7648
--- /dev/null
+++ b/week8/problem4/indict
@@ -0,0 +1,36 @@
+#! /usr/bin/env python
+# -*- python -*-
+
+"""Prints out whether words might be in the dictionary or not."""
+
+import sys, gzip, cPickle
+import bloom
+
+# Poor man's argument handling.
+skipMisses = '-s' in sys.argv
+if skipMisses:
+    sys.argv.remove ('-s')
+
+printFp = '-f' in sys.argv
+if printFp:
+    sys.argv.remove ('-f')
+
+wordstocheck = sys.argv[1:]
+
+# Load up our special data structure. cPickle loads up a preexisting
+# BloomFilter object from disk and returns it to us. The class is
+# implemented in bloom.py.
+bf = cPickle.load (gzip.GzipFile ('dictbf.dat.gz', 'rb', 9))
+
+# Compute the false-positive rate of the filter; it is reported
+# to the user via the "-f" option.
+fp = bf.fprate ()
+
+if printFp:
+    print 'False positive rate:', fp
+
+for word in wordstocheck:
+    if bf.maycontain (word):
+        print word, 'MIGHT BE in the dictionary'
+    elif not skipMisses:
+        print word, 'is DEFINITELY NOT in the dictionary'
--
2.11.4.GIT