lib/more_like_this.rb

   1 module FerretMixin
   2   module Acts #:nodoc:
   3     module ARFerret #:nodoc:
   4
   5       module MoreLikeThis
   6
   7         class DefaultAAFSimilarity
   8           def idf(doc_freq, num_docs)
   9             return 0.0 if num_docs == 0
  10             return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
  11           end
  12         end
  13
  14         # returns other instances of this class, which have similar contents
  15         # like this one. Basically works like this: find out n most interesting
  16         # (i.e. characteristic) terms from this document, and then build a
  17         # query from those which is run against the whole index. Which terms
  18         # are interesting is decided on variour criteria which can be
  19         # influenced by the given options.
  20         #
  21         # The algorithm used here is a quite straight port of the MoreLikeThis class
  22         # from Apache Lucene.
  23         #
  24         # options are:
  25         # :field_names : Array of field names to use for similarity search (mandatory)
  26         # :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
  27         # :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
  28         # :min_word_length => nil, # Ignore words if less than this len (longer
  29         # words tend to be more characteristic for the document they occur in).
  30         # :max_word_length => nil, # Ignore words if greater than this len.
  31         # :max_query_terms => 25,  # maximum number of terms in the query built
  32         # :max_num_tokens => 5000, # maximum number of tokens to examine in a
  33         # single field
  34         # :boost => false,         # when true, a boost according to the
  35         # relative score of a term is applied to this Term's TermQuery.
  36         # :similarity => Ferret::Search::Similarity.default, # the similarity
  37         # implementation to use
  38         # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to
  39         # use
  40         # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
  41         # find_options : options handed over to find_by_contents
  42         def more_like_this(options = {}, find_options = {})
  43           options = {
  44             :field_names => nil,  # Default field names
  45             :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
  46             :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
  47             :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
  48             :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
  49             :max_query_terms => 25,  # maximum number of terms in the query built
  50             :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
  51             :boost => false,
  52             :similarity => DefaultAAFSimilarity.new,
  53             :analyzer => Ferret::Analysis::StandardAnalyzer.new,
  54             :append_to_query => nil,
  55             :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_by_contents can be used to retrieve results from other classes, too
  56           }.update(options)
  57           index = self.class.ferret_index
  58           #index.search_each('id:*') do |doc, score|
  59           #  puts "#{doc} == #{index[doc][:description]}"
  60           #end
  61           index.synchronize do # avoid that concurrent writes close our reader
  62             index.send(:ensure_reader_open)
  63             reader = index.send(:reader)
  64             doc_number = self.document_number
  65             term_freq_map = retrieve_terms(document_number, reader, options)
  66             priority_queue = create_queue(term_freq_map, reader, options)
  67             query = create_query(priority_queue, options)
  68             logger.debug "morelikethis-query: #{query}"
  69             options[:append_to_query].call(query) if options[:append_to_query]
  70             options[:base_class].find_by_contents(query, find_options)
  71           end
  72         end
  73
  74
  75         def create_query(priority_queue, options={})
  76           query = Ferret::Search::BooleanQuery.new
  77           qterms = 0
  78           best_score = nil
  79           while(cur = priority_queue.pop)
  80             term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)
  81
  82             if options[:boost]
  83               # boost term according to relative score
  84               # TODO untested
  85               best_score ||= cur.score
  86               term_query.boost = cur.score / best_score
  87             end
  88             begin
  89               query.add_query(term_query, :should)
  90             rescue Ferret::Search::BooleanQuery::TooManyClauses
  91               break
  92             end
  93             qterms += 1
  94             break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
  95           end
  96           # exclude ourselves
  97           query.add_query(Ferret::Search::TermQuery.new(:id, self.id.to_s), :must_not)
  98           return query
  99         end
 100
 101
 102
 103         # creates a term/term_frequency map for terms from the fields
 104         # given in options[:field_names]
 105         def retrieve_terms(doc_number, reader, options)
 106           field_names = options[:field_names]
 107           max_num_tokens = options[:max_num_tokens]
 108           term_freq_map = Hash.new(0)
 109           doc = nil
 110           field_names.each do |field|
 111             #puts "field: #{field}"
 112             term_freq_vector = reader.term_vector(document_number, field)
 113             #if false
 114             if term_freq_vector
 115               # use stored term vector
 116               # puts 'using stored term vector'
 117               term_freq_vector.terms.each do |term|
 118                 term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
 119               end
 120             else
 121               # puts 'no stored term vector'
 122               # no term vector stored, but we have stored the contents in the index
 123               # -> extract terms from there
 124               doc = reader[doc_number]
 125               content = doc[field]
 126               unless content
 127                 # no term vector, no stored content, so try content from this instance
 128                 content = content_for_field_name(field.to_s)
 129               end
 130               puts "have doc: #{doc[:id]} with #{field} == #{content}"
 131               token_count = 0
 132
 133               ts = options[:analyzer].token_stream(field, content)
 134               while token = ts.next
 135                 break if (token_count+=1) > max_num_tokens
 136                 next if noise_word?(token.text, options)
 137                 term_freq_map[token.text] += 1
 138               end
 139             end
 140           end
 141           term_freq_map
 142         end
 143
 144         # create an ordered(by score) list of word,fieldname,score
 145         # structures
 146         def create_queue(term_freq_map, reader, options)
 147           pq = Array.new(term_freq_map.size)
 148
 149           similarity = options[:similarity]
 150           num_docs = reader.num_docs
 151           term_freq_map.each_pair do |word, tf|
 152             # filter out words that don't occur enough times in the source
 153             next if options[:min_term_freq] && tf < options[:min_term_freq]
 154
 155             # go through all the fields and find the largest document frequency
 156             top_field = options[:field_names].first
 157             doc_freq = 0
 158             options[:field_names].each do |field_name|
 159               freq = reader.doc_freq(field_name, word)
 160               if freq > doc_freq
 161                 top_field = field_name
 162                 doc_freq = freq
 163               end
 164             end
 165             # filter out words that don't occur in enough docs
 166             next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
 167             next if doc_freq == 0 # index update problem ?
 168
 169             idf = similarity.idf(doc_freq, num_docs)
 170             score = tf * idf
 171             pq << FrequencyQueueItem.new(word, top_field, score)
 172           end
 173           pq.compact!
 174           pq.sort! { |a,b| a.score<=>b.score }
 175           return pq
 176         end
 177
 178         def noise_word?(text, options)
 179           len = text.length
 180           (
 181             (options[:min_word_length] > 0 && len < options[:min_word_length]) ||
 182             (options[:max_word_length] > 0 && len > options[:max_word_length]) ||
 183             (options[:stop_words] && options.include?(text))
 184           )
 185         end
 186
 187         def content_for_field_name(field)
 188           self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.send(field.to_sym)
 189         end
 190
 191       end
 192
 193       class FrequencyQueueItem
 194         attr_reader :word, :field, :score
 195         def initialize(word, field, score)
 196           @word = word; @field = field; @score = score
 197         end
 198       end
 199
 200     end
 201   end
 202 end
 203