module ActsAsFerret #:nodoc:

  module MoreLikeThis

    module InstanceMethods
      # Returns other instances of this class with contents similar to this
      # record's. It basically works like this: find the n most interesting
      # (i.e. characteristic) terms of this document, build a query from
      # those, and run it against the whole index. Which terms count as
      # interesting is decided by various criteria that can be influenced
      # via the given options.
      #
      # The algorithm used here is a fairly direct port of the MoreLikeThis
      # class from Apache Lucene.
      #
      # options are:
      # :field_names     : Array of field names to use for the similarity search (mandatory).
      # :min_term_freq   => 2,     # Ignore terms with less than this frequency in the source doc.
      # :min_doc_freq    => 5,     # Ignore words which do not occur in at least this many docs.
      # :min_word_length => nil,   # Ignore words shorter than this length (longer words tend to
      #                            # be more characteristic of the document they occur in).
      # :max_word_length => nil,   # Ignore words longer than this length.
      # :max_query_terms => 25,    # Maximum number of terms in the query built.
      # :max_num_tokens  => 5000,  # Maximum number of tokens to examine in a single field.
      # :boost           => false, # When true, a boost according to the relative score of a term
      #                            # is applied to that term's TermQuery.
      # :similarity      => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity',
      #                            # Class name of the similarity implementation to use (the
      #                            # default equals Ferret's internal similarity implementation).
      # :analyzer        => 'Ferret::Analysis::StandardAnalyzer',
      #                            # Class name of the analyzer to use.
      # :append_to_query => nil,   # A proc taking the generated query object as its argument,
      #                            # called after the query has been built. Can be used to further
      #                            # manipulate the query used to find related documents, e.g. to
      #                            # constrain the search to a given class in single table
      #                            # inheritance scenarios.
      #
      # ferret_options : Ferret options handed over to find_by_contents (e.g. for limits and sorting).
      # ar_options     : options handed over to find_by_contents for ActiveRecord scoping.
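      #
      # A minimal usage sketch (hypothetical Article model indexed on :title
      # and :body via acts_as_ferret; all names here are illustrative):
      #
      #   article = Article.find(42)
      #   related = article.more_like_this(
      #     { :field_names => [:title, :body], :max_query_terms => 10 },
      #     { :limit => 5 }
      #   )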
      def more_like_this(options = {}, ferret_options = {}, ar_options = {})
        options = {
          :field_names => nil,     # mandatory; there is no sensible default
          :min_term_freq => 2,     # Ignore terms with less than this frequency in the source doc.
          :min_doc_freq => 5,      # Ignore words which do not occur in at least this many docs.
          :min_word_length => 0,   # Ignore words shorter than this length. Default is not to ignore any words.
          :max_word_length => 0,   # Ignore words longer than this length. Default is not to ignore any words.
          :max_query_terms => 25,  # maximum number of terms in the query built
          :max_num_tokens => 5000, # maximum number of tokens to analyze per field
          :boost => false,         # when true, boost each term's query by its relative score
          :similarity => 'ActsAsFerret::MoreLikeThis::DefaultAAFSimilarity', # class name of the similarity implementation to use
          :analyzer => 'Ferret::Analysis::StandardAnalyzer', # class name of the analyzer to use
          :append_to_query => nil,
          :base_class => self.class # base class to use for querying; useful in STI scenarios where
                                    # BaseClass.find_by_contents can retrieve results from other classes, too
        }.update(options)
        clazz = options[:base_class]
        options[:base_class] = clazz.name
        query = clazz.aaf_index.build_more_like_this_query(self.id, self.class.name, options)
        options[:append_to_query].call(query) if options[:append_to_query]
        clazz.find_by_contents(query, ferret_options, ar_options)
      end

    end

    # these methods are called on the index object (see aaf_index above)
    module IndexMethods
      # TODO: to allow more_like_this for unsaved records, we would have to pass
      # the unsaved record's data to this method. Check how this would work out
      # via DRb.
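      #
      # This method is normally invoked through InstanceMethods#more_like_this
      # above, which supplies the full set of default options.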
      def build_more_like_this_query(id, class_name, options)
        [:similarity, :analyzer].each { |sym| options[sym] = options[sym].constantize.new }
        ferret_index.synchronize do # prevent concurrent writes from closing our reader
          ferret_index.send(:ensure_reader_open)
          reader = ferret_index.send(:reader)
          term_freq_map = retrieve_terms(id, class_name, reader, options)
          priority_queue = create_queue(term_freq_map, reader, options)
          create_query(id, class_name, priority_queue, options)
        end
      end

      protected
      def create_query(id, class_name, priority_queue, options = {})
        query = Ferret::Search::BooleanQuery.new
        qterms = 0
        best_score = nil
        while (cur = priority_queue.pop)
          term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)
          if options[:boost]
            # boost the term according to its score relative to the best term
            best_score ||= cur.score
            term_query.boost = cur.score / best_score
          end
          begin
            query.add_query(term_query, :should)
          rescue Ferret::Search::BooleanQuery::TooManyClauses
            break
          end
          qterms += 1
          break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
        end
        # exclude the original record from the results
        query.add_query(query_for_record(id, class_name), :must_not)
        query
      end
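
      # The query built above marks each characteristic term as an optional
      # (:should) clause, so documents match by sharing any of the terms, and
      # the :must_not clause keeps the source record out of its own results.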
      # creates a term => term_frequency map for terms from the fields
      # given in options[:field_names]
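      # The returned map has the shape { term => frequency }, e.g.
      # { 'ferret' => 3, 'lucene' => 2 } (illustrative values).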
      def retrieve_terms(id, class_name, reader, options)
        raise "more_like_this only works on saved records at the moment" if id.nil?
        document_number = document_number(id, class_name) rescue nil
        field_names = options[:field_names]
        max_num_tokens = options[:max_num_tokens]
        term_freq_map = Hash.new(0)
        record = nil # lazily loaded only if we need content from the model instance
        field_names.each do |field|
          term_freq_vector = reader.term_vector(document_number, field) if document_number
          if term_freq_vector
            # use the stored term vector
            term_freq_vector.terms.each do |term|
              term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
            end
          else
            # no term vector stored, but the contents may be stored in the index
            # -> extract terms from there
            content = reader[document_number][field] if document_number
            unless content
              # no term vector and no stored content, so take the content from
              # this record instance
              record ||= options[:base_class].constantize.find(id)
              content = record.content_for_field_name(field.to_s)
            end
            token_count = 0
            ts = options[:analyzer].token_stream(field, content)
            while token = ts.next
              break if (token_count += 1) > max_num_tokens
              next if noise_word?(token.text, options)
              term_freq_map[token.text] += 1
            end
          end
        end
        term_freq_map
      end
      # creates a list of (word, field name, score) items, ordered by score
      def create_queue(term_freq_map, reader, options)
        pq = []
        similarity = options[:similarity]
        num_docs = reader.num_docs
        term_freq_map.each_pair do |word, tf|
          # filter out words that don't occur often enough in the source document
          next if options[:min_term_freq] && tf < options[:min_term_freq]

          # go through all the fields and find the largest document frequency
          top_field = options[:field_names].first
          doc_freq = 0
          options[:field_names].each do |field_name|
            freq = reader.doc_freq(field_name, word)
            if freq > doc_freq
              top_field = field_name
              doc_freq = freq
            end
          end
          # filter out words that don't occur in enough documents
          next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
          next if doc_freq == 0 # index update problem?

          idf = similarity.idf(doc_freq, num_docs)
          score = tf * idf
          pq << FrequencyQueueItem.new(word, top_field, score)
        end
        # sort ascending by score: Array#pop in create_query then yields the
        # highest-scoring term first
        pq.sort! { |a, b| a.score <=> b.score }
        pq
      end
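
      # For example (hypothetical stop word list):
      #   noise_word?('the', :min_word_length => 0, :max_word_length => 0,
      #               :stop_words => %w(a an the)) #=> true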
      def noise_word?(text, options)
        len = text.length
        (options[:min_word_length] > 0 && len < options[:min_word_length]) ||
          (options[:max_word_length] > 0 && len > options[:max_word_length]) ||
          (options[:stop_words] && options[:stop_words].include?(text))
      end

    end
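
    # Worked example (hypothetical numbers): with num_docs = 1000 and
    # doc_freq = 5, idf returns Math.log(1000.0 / 6) + 1.0, roughly 6.12,
    # so rarer terms receive a higher weight.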
    class DefaultAAFSimilarity
      def idf(doc_freq, num_docs)
        return 0.0 if num_docs == 0
        Math.log(num_docs.to_f / (doc_freq + 1)) + 1.0
      end
    end
    class FrequencyQueueItem
      attr_reader :word, :field, :score
      def initialize(word, field, score)
        @word = word; @field = field; @score = score
      end
    end

  end
end