lib/more_like_this.rb

   1 module ActsAsFerret #:nodoc:
   2
   3     module MoreLikeThis
   4
   5       module InstanceMethods
   6
   7         # returns other instances of this class, which have similar contents
   8         # like this one. Basically works like this: find out n most interesting
   9         # (i.e. characteristic) terms from this document, and then build a
  10         # query from those which is run against the whole index. Which terms
  11         # are interesting is decided on variour criteria which can be
  12         # influenced by the given options.
  13         #
  14         # The algorithm used here is a quite straight port of the MoreLikeThis class
  15         # from Apache Lucene.
  16         #
  17         # options are:
  18         # :field_names : Array of field names to use for similarity search (mandatory)
  19         # :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
  20         # :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
  21         # :min_word_length => nil, # Ignore words shorter than this length (longer words tend to
  22         #                            be more characteristic for the document they occur in).
  23         # :max_word_length => nil, # Ignore words if greater than this len.
  24         # :max_query_terms => 25,  # maximum number of terms in the query built
  25         # :max_num_tokens => 5000, # maximum number of tokens to examine in a single field
  26         # :boost => false,         # when true, a boost according to the relative score of
  27         #                            a term is applied to this Term's TermQuery.
  28         # :similarity => 'DefaultAAFSimilarity'   # the similarity implementation to use (the default
  29         #                                           equals Ferret's internal similarity implementation)
  30         # :analyzer => 'Ferret::Analysis::StandardAnalyzer' # class name of the analyzer to use
  31         # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
  32         # ferret_options : Ferret options handed over to find_by_contents (i.e. for limits and sorting)
  33         # ar_options : options handed over to find_by_contents for AR scoping
  34         def more_like_this(options = {}, ferret_options = {}, ar_options = {})
  35           options = {
  36             :field_names => nil,  # Default field names
  37             :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
  38             :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
  39             :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
  40             :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
  41             :max_query_terms => 25,  # maximum number of terms in the query built
  42             :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
  43             :boost => false,
  44             :similarity => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity',  # class name of the similarity implementation to use
  45             :analyzer => 'Ferret::Analysis::StandardAnalyzer', # class name of the analyzer to use
  46             :append_to_query => nil,
  47             :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_by_contents can be used to retrieve results from other classes, too
  48           }.update(options)
  49           #index.search_each('id:*') do |doc, score|
  50           #  puts "#{doc} == #{index[doc][:description]}"
  51           #end
  52           clazz = options[:base_class]
  53           options[:base_class] = clazz.name
  54           query = clazz.aaf_index.build_more_like_this_query(self.id, self.class.name, options)
  55           options[:append_to_query].call(query) if options[:append_to_query]
  56           clazz.find_by_contents(query, ferret_options, ar_options)
  57         end
  58
  59       end
  60
  61       module IndexMethods
  62
  63         # TODO to allow morelikethis for unsaved records, we have to give the
  64         # unsaved record's data to this method. check how this will work out
  65         # via drb...
  66         def build_more_like_this_query(id, class_name, options)
  67           [:similarity, :analyzer].each { |sym| options[sym] = options[sym].constantize.new }
  68           ferret_index.synchronize do # avoid that concurrent writes close our reader
  69             ferret_index.send(:ensure_reader_open)
  70             reader = ferret_index.send(:reader)
  71             term_freq_map = retrieve_terms(id, class_name, reader, options)
  72             priority_queue = create_queue(term_freq_map, reader, options)
  73             create_query(id, class_name, priority_queue, options)
  74           end
  75         end
  76
  77         protected
  78
  79         def create_query(id, class_name, priority_queue, options={})
  80           query = Ferret::Search::BooleanQuery.new
  81           qterms = 0
  82           best_score = nil
  83           while(cur = priority_queue.pop)
  84             term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)
  85
  86             if options[:boost]
  87               # boost term according to relative score
  88               # TODO untested
  89               best_score ||= cur.score
  90               term_query.boost = cur.score / best_score
  91             end
  92             begin
  93               query.add_query(term_query, :should)
  94             rescue Ferret::Search::BooleanQuery::TooManyClauses
  95               break
  96             end
  97             qterms += 1
  98             break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
  99           end
 100           # exclude the original record
 101           query.add_query(query_for_record(id, class_name), :must_not)
 102           return query
 103         end
 104
 105
 106
 107         # creates a term/term_frequency map for terms from the fields
 108         # given in options[:field_names]
 109         def retrieve_terms(id, class_name, reader, options)
 110           raise "more_like_this atm only works on saved records" if id.nil?
 111           document_number = document_number(id, class_name) rescue nil
 112           field_names = options[:field_names]
 113           max_num_tokens = options[:max_num_tokens]
 114           term_freq_map = Hash.new(0)
 115           doc = nil
 116           record = nil
 117           field_names.each do |field|
 118             #puts "field: #{field}"
 119             term_freq_vector = reader.term_vector(document_number, field) if document_number
 120             #if false
 121             if term_freq_vector
 122               # use stored term vector
 123               # puts 'using stored term vector'
 124               term_freq_vector.terms.each do |term|
 125                 term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
 126               end
 127             else
 128               # puts 'no stored term vector'
 129               # no term vector stored, but we have stored the contents in the index
 130               # -> extract terms from there
 131               content = nil
 132               if document_number
 133                 doc = reader[document_number]
 134                 content = doc[field]
 135               end
 136               unless content
 137                 # no term vector, no stored content, so try content from this instance
 138                 record ||= options[:base_class].constantize.find(id)
 139                 content = record.content_for_field_name(field.to_s)
 140               end
 141               puts "have doc: #{doc[:id]} with #{field} == #{content}"
 142               token_count = 0
 143
 144               ts = options[:analyzer].token_stream(field, content)
 145               while token = ts.next
 146                 break if (token_count+=1) > max_num_tokens
 147                 next if noise_word?(token.text, options)
 148                 term_freq_map[token.text] += 1
 149               end
 150             end
 151           end
 152           term_freq_map
 153         end
 154
 155         # create an ordered(by score) list of word,fieldname,score
 156         # structures
 157         def create_queue(term_freq_map, reader, options)
 158           pq = Array.new(term_freq_map.size)
 159
 160           similarity = options[:similarity]
 161           num_docs = reader.num_docs
 162           term_freq_map.each_pair do |word, tf|
 163             # filter out words that don't occur enough times in the source
 164             next if options[:min_term_freq] && tf < options[:min_term_freq]
 165
 166             # go through all the fields and find the largest document frequency
 167             top_field = options[:field_names].first
 168             doc_freq = 0
 169             options[:field_names].each do |field_name|
 170               freq = reader.doc_freq(field_name, word)
 171               if freq > doc_freq
 172                 top_field = field_name
 173                 doc_freq = freq
 174               end
 175             end
 176             # filter out words that don't occur in enough docs
 177             next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
 178             next if doc_freq == 0 # index update problem ?
 179
 180             idf = similarity.idf(doc_freq, num_docs)
 181             score = tf * idf
 182             pq << FrequencyQueueItem.new(word, top_field, score)
 183           end
 184           pq.compact!
 185           pq.sort! { |a,b| a.score<=>b.score }
 186           return pq
 187         end
 188
 189         def noise_word?(text, options)
 190           len = text.length
 191           (
 192             (options[:min_word_length] > 0 && len < options[:min_word_length]) ||
 193             (options[:max_word_length] > 0 && len > options[:max_word_length]) ||
 194             (options[:stop_words] && options.include?(text))
 195           )
 196         end
 197
 198       end
 199
 200       class DefaultAAFSimilarity
 201         def idf(doc_freq, num_docs)
 202           return 0.0 if num_docs == 0
 203           return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
 204         end
 205       end
 206
 207
 208       class FrequencyQueueItem
 209         attr_reader :word, :field, :score
 210         def initialize(word, field, score)
 211           @word = word; @field = field; @score = score
 212         end
 213       end
 214
 215     end
 216 end
 217