From 1a4d3694dbbf45f96519420e38ac49fe9588bf7a Mon Sep 17 00:00:00 2001
From: Bryan Steinbach
Date: Sun, 31 Oct 2010 22:52:06 -0700
Subject: [PATCH] Imported problem4

---
 week8/problem4/INSTRUCTIONS   |  73 +++++++++++++++++
 week8/problem4/README         |  80 +++++++++++++++++++
 week8/problem4/bloom.py       | 182 ++++++++++++++++++++++++++++++++++++++++++
 week8/problem4/importdictdata |  18 +++++
 week8/problem4/indict         |  36 +++++++++
 5 files changed, 389 insertions(+)
 create mode 100644 week8/problem4/INSTRUCTIONS
 create mode 100644 week8/problem4/README
 create mode 100644 week8/problem4/bloom.py
 create mode 100755 week8/problem4/importdictdata
 create mode 100755 week8/problem4/indict

diff --git a/week8/problem4/INSTRUCTIONS b/week8/problem4/INSTRUCTIONS
new file mode 100644
index 0000000..bcccab7
--- /dev/null
+++ b/week8/problem4/INSTRUCTIONS
@@ -0,0 +1,73 @@
+Your mission for this breakout session is to do the following:
+
+1. Read the beginning of the file README for some context as to the
+   purpose of the Python code you've just checked out. (You can
+   skip the "Implementation" section and everything after it for now.)
+
+2. Create and check out a new branch named "work" based on the current
+   "master" branch of this repository.
+
+3. Add a feature to the program "indict" that has it print out
+   the false-positive rate of its filter algorithm. The code to
+   compute this rate is already in place -- you just need to
+   hook up the printing part of things. (The target rate is 5%.)
+   Briefly document the option in the file "README".
+
+4. In the course of doing this, you should encounter an OBVIOUS
+   bug in the false-positive-rate code in the file "bloom.py". Fix it.
+   (You don't need any understanding of the code to spot the bug.)
+
+5. Update the version of the program listed at the top of the
+   README to 2.0.
+
+6. Commit your changes to the repository **as three separate
+   commits**:
+
+   - One commit fixing the false-positive-rate bug
+   - One commit adding the new option and documenting it
+     in the README
+   - One commit bumping the version number at the top of the README
+
+7. Merge your work back into the "master" branch.
+
+8. Tag your latest master commit as "v2.0". Congratulations, you've
+   made a software release!
+
+If you run into problems, don't hesitate to ask for help -- Git
+is a sharp tool and you can occasionally get your repository into
+a weird state if you start flailing. The standard commands are
+pretty good about giving you sound advice, however.
+
+If you finish with time to spare
+========================================
+
+Here are some things you could try investigating:
+
+- Fool around with blowing away files and magically rescuing them
+  via "git checkout".
+
+- Which commit introduced the false-positive-rate bug? (Try "git
+  blame".)
+
+- Explore the repository contents and history interactively and
+  prettily with your web browser by visiting
+
+  http://github.com/pkgwdemo/bloomdemo
+
+- On a conceptual level, what changes were made between versions
+  1.0 and 2.0? ("git log")
+
+- What is the full source-code "diff" between versions 1.0 and 2.0?
+  ("git diff")
+
+- Try using "git rebase -i" to rewrite the repository history
+  so that the false-positive-rate bug is never introduced. What happens
+  to the ID of the most recent commit after the rebase, and why?
+
+- Create a few temporary branches to engineer a merge conflict, and
+  resolve it.
+
+- The very astute user will notice that the timestamps in the
+  commit history of the repository do not proceed chronologically.
+  What does that suggest about how I constructed the repository
+  history?
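
Steps 2 through 8 of the exercise map onto a handful of standard Git
commands. One possible sequence -- the commit messages are only
suggestions, and each commit should come after the corresponding edit:

  $ git checkout -b work master
  (fix the bug in bloom.py)
  $ git commit -am "Fix the false-positive-rate bug"
  (add the -f option to indict and document it in README)
  $ git commit -am "Add -f option to report the false-positive rate"
  (bump the version number at the top of README)
  $ git commit -am "Bump version to 2.0"
  $ git checkout master
  $ git merge work
  $ git tag v2.0
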
diff --git a/week8/problem4/README b/week8/problem4/README
new file mode 100644
index 0000000..e8ffd15
--- /dev/null
+++ b/week8/problem4/README
@@ -0,0 +1,80 @@
+Bloom Filter Demo
+Version: 2.0
+========================================
+
+This Git repository contains a very small Python program, "indict",
+which tells you whether a word might be in the system
+dictionary. Sample usage is:
+
+  $ ./indict barn bern birn born burn
+  barn MIGHT BE in the dictionary
+  bern is DEFINITELY NOT in the dictionary
+  birn MIGHT BE in the dictionary
+  born MIGHT BE in the dictionary
+  burn MIGHT BE in the dictionary
+
+"indict" accepts an option, "-s", which tells it not to
+print out information about words that are definitely not in the
+dictionary:
+
+  $ ./indict -s barn bern birn
+  barn MIGHT BE in the dictionary
+  birn MIGHT BE in the dictionary
+
+"indict" also accepts an option, "-f", which tells it to print
+the false-positive rate of its filter:
+
+  $ ./indict -f
+  False positive rate: 0.05
+
+Implementation
+========================================
+
+This program is implemented using a data structure called a Bloom
+filter. YOU DON'T HAVE TO WORRY ABOUT HOW IT WORKS FOR THIS EXERCISE,
+but you may find it interesting. Bloom filters are essentially like
+Python sets: you can add items to them and later test whether they
+contain a given item. For a given number of items, a Bloom filter is
+much more efficient in computation time and memory than a Python
+set. The price of this efficiency is that a Bloom filter can yield
+false positives: it may say that a word is in the dictionary when it
+actually isn't. On the other hand, a Bloom filter will never yield
+false negatives: it won't say that a word isn't in the dictionary when
+it actually is.
+
+Bloom filters are useful when you have a large dictionary-type
+database that is slow to scan. If you're going to check the database
+for the presence of many keys that it may not contain, the filter can
+help you avoid a large number of slow checks to the database itself.
+
+NOTE WELL that this particular demo is not any faster than just
+grepping the system dictionary (/usr/share/dict/words on traditional
+Unix systems). This is because of the significant overhead of starting
+up the Python interpreter and loading the Bloom filter data, as well as
+the simple nature of the dictionary "database". Furthermore, the
+filter implementation that I slapped together in 20 minutes is
+completely unoptimized. The best data structure is the one that you
+don't use because you don't actually need it.
+
+Repository Contents
+========================================
+
+.gitignore -- tells "git status" not to report boring files
+README -- this document
+bloom.py -- a simple Bloom filter implementation. Don't use it
+  for anything real, because if you actually have a problem that
+  calls for a Bloom filter, you'll almost surely need an
+  implementation that was actually designed with care.
+dictbf.dat.gz -- the precomputed filter data. (It takes a long
+  time to compute the filter data from the dictionary.)
+importdictdata -- the program to precompute the filter data
+indict -- the main dictionary testing program
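
The guarantees described in the README -- possible false positives,
but never false negatives -- are easy to see directly from Python.
Here is a minimal sketch using the bloom.py module from this patch
(Python 2, like the rest of the code here); the word list is arbitrary:

  import bloom

  # Build a small filter sized for about three items at a 5% target rate.
  bf = bloom.optimalBloom (0.05, 3)

  for word in ('barn', 'born', 'burn'):
      bf.add (word)

  # Every word we added must be reported as a possible member:
  # the filter never yields false negatives.
  assert all (bf.maycontain (w) for w in ('barn', 'born', 'burn'))

  # A word we never added is usually rejected, but a True here
  # would be a legitimate false positive, not a bug.
  print bf.maycontain ('bern')
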
diff --git a/week8/problem4/bloom.py b/week8/problem4/bloom.py
new file mode 100644
index 0000000..d40bd63
--- /dev/null
+++ b/week8/problem4/bloom.py
@@ -0,0 +1,182 @@
+"""bloom - a cheesy Bloom filter implementation
+
+This module implements a lame version of a Bloom filter, a simple
+set-like data structure. You don't need to worry about the details
+of it if you don't want to, but it's a nice example of a cute
+data structure.
+
+A Bloom filter has the following properties:
+
+- You can "add" known items to the filter.
+- You can then query the filter as to whether it contains a given
+  item. There may be false positives (the filter says it contains the
+  item when it doesn't really) but there are never false negatives
+  (the filter never says that it doesn't contain the item when it
+  does).
+
+Bloom filters provide these characteristics cheaply in terms of both
+memory and computation time. This makes them useful as filters in
+front of large backing data stores; for instance, if you have a giant
+disk database with many records that you're going to query for items
+it will frequently not contain, a Bloom filter can allow you to avoid
+many of the queries to the full disk.
+
+The filter is defined by an array of "m" bits and a set of "k"
+functions mapping a filter item to a number "n", 0 <= n < m. The bits
+all start out as 0. When you add an item to the filter, you evaluate
+each of the k functions and use each function's result as an index
+into the bit array, setting the corresponding bits to 1. When you
+check for the presence of an item, you evaluate the functions and test
+each corresponding bit. If any are 0, the item has definitely not been
+added to the filter.
+
+For more information, see
+
+http://en.wikipedia.org/wiki/Bloom_filter
+
+Here are some equations describing the behavior of Bloom filters.
+Let
+
+m = the number of bits in the filter
+k = the number of functions
+n = the number of items in the filter
+p = the false-positive rate
+
+Given k, n, m, then
+
+p = (1 - exp (-k * n / m)) ** k
+
+Given n, m, then the optimal k is
+
+k = (ln 2) * m / n
+
+Given n, a desired p, and the desire to use the optimal k,
+
+m = -n * ln p / (ln 2)**2
+k = -ln p / (ln 2)
+
+"""
+
+# Activate "true" division: 1 / 2 = 0.5, not 0. See PEP 238.
+from __future__ import division
+import numpy as N
+import hashlib
+
+
+def makeHashFunc (m, salt):
+    """Return a *function* that hashes an input value. The function that
+    we return takes a string argument and returns an integer n with
+    0 <= n < m. Our arguments are m, and a "salt" value that we mix
+    into the hash. This allows us to create many different hash
+    functions from the single SHA1 function.
+
+    These functions will be called many times, so speed is important. We
+    take the simple route; a real implementation would worry a lot more
+    about optimizing how this is done."""
+
+    salt = str (salt)
+
+    def f (value):
+        # Get a 20-byte string representing the SHA1 cryptographic
+        # hash of the input plus the "salt" string.
+        h = hashlib.sha1 ()
+        h.update (salt)
+        h.update (value)
+        digest = h.digest ()
+
+        # Convert the digest from a string to an integer modulo m,
+        # treating each byte as one base-256 digit. Python handles
+        # bigints automagically so we don't have to worry about the
+        # high bits causing problems.
+        v = 0
+        scale = 1
+
+        for b in digest:
+            v = (v + ord (b) * scale) % m
+            scale *= 256
+
+        return v
+
+    return f
+
+
+class BloomFilter (object):
+    def __init__ (self, m, k):
+        assert m > 0
+        assert k > 0
+
+        if m % 32 != 0:
+            raise ValueError ("This lame function can only accept m's that"
+                              " are multiples of 32; I got %d" % m)
+        self.m = m
+        self.k = k
+
+        self.bits = N.zeros (m // 32, dtype=N.uint32)
+        self.n = 0
+        self.funcs = [makeHashFunc (m, i) for i in xrange (k)]
+
+
+    def fprate (self):
+        r = (1. - N.exp (-self.k * self.n / self.m))
+        r **= self.k
+        return r
+
+
+    def add (self, item):
+        for func in self.funcs:
+            n = func (item)
+            dword = n // 32
+            bit = n % 32
+            self.bits[dword] |= (1 << bit)
+
+        self.n += 1
+
+
+    def maycontain (self, item):
+        for func in self.funcs:
+            n = func (item)
+            dword = n // 32
+            bit = n % 32
+
+            if self.bits[dword] & (1 << bit) == 0:
+                return False
+
+        return True
+
+
+    def clear (self):
+        self.bits.fill (0)
+        # Also reset the item count so fprate() stays meaningful.
+        self.n = 0
+
+
+    # These functions allow us to save and load object state
+    # through Python's standard "pickle" system. Populating the filter
+    # is slow, so we'll speed things up by saving and restoring
+    # the filter state.
+
+    def __getstate__ (self):
+        return self.k, self.n, self.bits
+
+
+    def __setstate__ (self, state):
+        self.k, self.n, self.bits = state
+        self.m = self.bits.size * 32
+        self.funcs = [makeHashFunc (self.m, i) for i in xrange (self.k)]
+
+
+def optimalBloom (fprate, nexpected):
+    """Create and return a well-optimized Bloom filter given a desired
+    false-positive rate and an expected number of items to be added to
+    the filter. This involves figuring out the optimal number of bits
+    of data and the number of functions we need, using the equations
+    given in the module docstring."""
+
+    ln2 = N.log (2)
+
+    m = -nexpected * N.log (fprate) / ln2**2
+    m = int (N.ceil (m))
+    # Round up to the nearest multiple of 32.
+    m = (m + 31) & ~0x1F
+
+    k = ln2 * m / nexpected
+    k = max (int (k), 1)
+
+    return BloomFilter (m, k)
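
To make the sizing equations in the module docstring concrete, here is
a short worked example that plugs in the numbers this demo actually
uses (a 5% false-positive rate and the roughly 479,000-word dictionary
mentioned in importdictdata):

  from __future__ import division
  import numpy as N

  p = 0.05     # desired false-positive rate
  n = 479000   # expected number of items (dictionary words)
  ln2 = N.log (2)

  m = -n * N.log (p) / ln2**2   # bits needed: about 2.99 million (~365 KB)
  k = -N.log (p) / ln2          # optimal number of functions: about 4.32

  print int (N.ceil (m)), int (round (k))

So optimalBloom ends up with k = 4 after truncation, and the whole
filter fits in a few hundred kilobytes -- far less memory than holding
every dictionary word in a Python set.
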
diff --git a/week8/problem4/importdictdata b/week8/problem4/importdictdata
new file mode 100755
index 0000000..f42bd37
--- /dev/null
+++ b/week8/problem4/importdictdata
@@ -0,0 +1,18 @@
+#! /usr/bin/env python
+# -*- python -*-
+
+"""Read the dictionary into a Bloom filter and save the filter
+state for quick loading later."""
+
+import gzip, cPickle
+import bloom
+
+# This stuff is specific to Peter's machine: the dictionary contains about
+# 479000 words and lives in the standard Unixy location.
+
+bf = bloom.optimalBloom (0.05, 479000)
+
+for line in open ('/usr/share/dict/words'):
+    bf.add (line.strip ())
+
+cPickle.dump (bf, gzip.GzipFile ('dictbf.dat.gz', 'wb', 9))
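
The 5% figure passed to optimalBloom is a prediction, not a
measurement. A quick way to sanity-check the saved filter is to probe
it with random strings that are almost certainly not dictionary words;
the fraction it accepts estimates the real false-positive rate. A
rough sketch (run it from the repository directory so that bloom.py is
importable; the file name matches the one written above):

  import gzip, cPickle, random, string

  bf = cPickle.load (gzip.GzipFile ('dictbf.dat.gz', 'rb'))

  # Random 12-letter strings are essentially never real words, so
  # any acceptance here is almost surely a false positive.
  trials = 10000
  hits = 0

  for _ in xrange (trials):
      junk = ''.join (random.choice (string.lowercase) for _ in xrange (12))
      if bf.maycontain (junk):
          hits += 1

  print 'empirical false-positive rate:', hits / float (trials)
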
diff --git a/week8/problem4/indict b/week8/problem4/indict
new file mode 100755
index 0000000..efc7648
--- /dev/null
+++ b/week8/problem4/indict
@@ -0,0 +1,36 @@
+#! /usr/bin/env python
+# -*- python -*-
+
+"""Prints out whether words might be in the dictionary or not."""
+
+import sys, gzip, cPickle
+import bloom
+
+# Poor man's argument handling.
+skipMisses = '-s' in sys.argv
+if skipMisses:
+    sys.argv.remove ('-s')
+
+printFp = '-f' in sys.argv
+if printFp:
+    sys.argv.remove ('-f')
+
+wordstocheck = sys.argv[1:]
+
+# Load up our special data structure. cPickle loads up a preexisting
+# BloomFilter object from disk and returns it to us. The class is
+# implemented in bloom.py.
+bf = cPickle.load (gzip.GzipFile ('dictbf.dat.gz', 'rb', 9))
+
+# Compute the false-positive rate of the filter; it is reported
+# to the user via the "-f" option.
+fp = bf.fprate ()
+
+if printFp:
+    print 'False positive rate:', fp
+
+for word in wordstocheck:
+    if bf.maycontain (word):
+        print word, 'MIGHT BE in the dictionary'
+    elif not skipMisses:
+        print word, 'is DEFINITELY NOT in the dictionary'
--
2.11.4.GIT