3 module ARFerret #:nodoc:
# Default similarity implementation used by more_like_this to weight
# candidate terms. Only supplies the inverse-document-frequency factor.
7 class DefaultAAFSimilarity
# Inverse document frequency of a term.
#
# doc_freq - number of documents in the index containing the term
# num_docs - total number of documents in the index
#
# Returns 0.0 for an empty index (guards the log of zero); otherwise
# log(num_docs / (doc_freq + 1)) + 1.0, so rarer terms score higher.
# The +1 in the denominator avoids division by zero for doc_freq == 0.
8 def idf(doc_freq, num_docs)
9 return 0.0 if num_docs == 0
10 return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
14 # returns other instances of this class, which have contents similar
15 # to this one's. Basically works like this: find out n most interesting
16 # (i.e. characteristic) terms from this document, and then build a
17 # query from those which is run against the whole index. Which terms
18 # are interesting is decided on various criteria which can be
19 # influenced by the given options.
21 # The algorithm used here is a quite straight port of the MoreLikeThis class
25 # :field_names : Array of field names to use for similarity search (mandatory)
26 # :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
27 # :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
28 # :min_word_length => nil, # Ignore words if less than this len (longer
29 # words tend to be more characteristic for the document they occur in).
30 # :max_word_length => nil, # Ignore words if greater than this len.
31 # :max_query_terms => 25, # maximum number of terms in the query built
32 # :max_num_tokens => 5000, # maximum number of tokens to examine in a
34 # :boost => false, # when true, a boost according to the
35 # relative score of a term is applied to this Term's TermQuery.
36 # :similarity => Ferret::Search::Similarity.default, # the similarity
37 # implementation to use
38 # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to
40 # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios
41 # find_options : options handed over to find_by_contents
# Find documents similar to this record (see the option docs above).
# Builds a term-frequency map from this document's fields, turns the
# highest-scoring terms into a boolean query, and runs it via
# options[:base_class].find_by_contents.
42 def more_like_this(options = {}, find_options = {})
# Defaults below are merged with the caller-supplied options hash.
44 :field_names => nil, # Default field names
45 :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
46 :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
47 :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
48 :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
49 :max_query_terms => 25, # maximum number of terms in the query built
50 :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
52 :similarity => DefaultAAFSimilarity.new,
53 :analyzer => Ferret::Analysis::StandardAnalyzer.new,
54 :append_to_query => nil,
55 :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_by_contents can be used to retrieve results from other classes, too
57 index = self.class.ferret_index
58 #index.search_each('id:*') do |doc, score|
59 # puts "#{doc} == #{index[doc][:description]}"
# The whole read path runs inside the index monitor so a concurrent
# writer cannot close the IndexReader out from under us.
61 index.synchronize do # avoid that concurrent writes close our reader
# ensure_reader_open / reader are private on the index; send is used
# deliberately to reach them.
62 index.send(:ensure_reader_open)
63 reader = index.send(:reader)
64 doc_number = self.document_number
# NOTE(review): the local doc_number above appears unused — the call
# below passes document_number (presumably the instance method) again.
# Confirm whether doc_number was meant to be passed here.
65 term_freq_map = retrieve_terms(document_number, reader, options)
66 priority_queue = create_queue(term_freq_map, reader, options)
67 query = create_query(priority_queue, options)
68 logger.debug "morelikethis-query: #{query}"
# Give the caller a chance to further constrain/alter the query
# (e.g. restrict to one STI subclass) before it is executed.
69 options[:append_to_query].call(query) if options[:append_to_query]
70 options[:base_class].find_by_contents(query, find_options)
# Builds the boolean OR query from the score-ordered queue produced by
# create_queue. Each interesting term becomes a :should TermQuery; this
# record itself is excluded via a :must_not clause on its id.
75 def create_query(priority_queue, options={})
76 query = Ferret::Search::BooleanQuery.new
# Queue is sorted ascending by score, so pop yields best terms first.
79 while(cur = priority_queue.pop)
80 term_query = Ferret::Search::TermQuery.new(cur.field, cur.word)
83 # boost term according to relative score
# First popped item has the highest score; remember it so later terms
# get a boost relative to that maximum (cur.score / best_score <= 1).
85 best_score ||= cur.score
86 term_query.boost = cur.score / best_score
89 query.add_query(term_query, :should)
# Ferret caps the number of boolean clauses; stop adding terms rather
# than failing the whole similarity search.
90 rescue Ferret::Search::BooleanQuery::TooManyClauses
# Respect the caller-configured term cap (0 means unlimited).
# NOTE(review): qterms is initialized/incremented on lines not visible
# in this excerpt — confirm the counter when editing.
94 break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
# Exclude the source record from its own similarity results.
97 query.add_query(Ferret::Search::TermQuery.new(:id, self.id.to_s), :must_not)
103 # creates a term/term_frequency map for terms from the fields
104 # given in options[:field_names]
#
# doc_number - the Ferret document number of this record
# reader     - an open Ferret IndexReader
# options    - the merged more_like_this options hash
#
# Returns a Hash mapping term text => occurrence count, skipping terms
# rejected by noise_word?. Tries, in order: a stored term vector, the
# stored field contents in the index, and finally this instance's own
# attribute value (re-analyzed with options[:analyzer]).
105 def retrieve_terms(doc_number, reader, options)
106 field_names = options[:field_names]
107 max_num_tokens = options[:max_num_tokens]
# Default of 0 lets us use += without checking key presence.
108 term_freq_map = Hash.new(0)
110 field_names.each do |field|
111 #puts "field: #{field}"
# NOTE(review): the parameter is named doc_number, but this call uses
# document_number — presumably the instance method, which should hold
# the same value; confirm and unify when editing.
112 term_freq_vector = reader.term_vector(document_number, field)
115 # use stored term vector
116 # puts 'using stored term vector'
117 term_freq_vector.terms.each do |term|
# positions.size == number of occurrences of the term in this field.
118 term_freq_map[term.text] += term.positions.size unless noise_word?(term.text, options)
121 # puts 'no stored term vector'
122 # no term vector stored, but we have stored the contents in the index
123 # -> extract terms from there
124 doc = reader[doc_number]
127 # no term vector, no stored content, so try content from this instance
128 content = content_for_field_name(field.to_s)
# NOTE(review): leftover debug output — consider removing or routing
# through logger.debug.
130 puts "have doc: #{doc[:id]} with #{field} == #{content}"
# Re-analyze the raw content to count term occurrences ourselves.
133 ts = options[:analyzer].token_stream(field, content)
134 while token = ts.next
# Cap the work done on very large documents.
135 break if (token_count+=1) > max_num_tokens
136 next if noise_word?(token.text, options)
137 term_freq_map[token.text] += 1
144 # create an ordered(by score) list of word,fieldname,score
# Scores each candidate term as tf * idf (idf from options[:similarity]),
# filters out terms that are too rare in the source document or in the
# index, and returns the items sorted ascending by score so the consumer
# (create_query) can pop the best terms off the end.
146 def create_queue(term_freq_map, reader, options)
# NOTE(review): Array.new(size) pre-fills the array with nils, and items
# are then appended with << — so the array starts with size leading nil
# entries. Confirm they are compacted before the sort below, otherwise
# nil.score would raise.
147 pq = Array.new(term_freq_map.size)
149 similarity = options[:similarity]
150 num_docs = reader.num_docs
151 term_freq_map.each_pair do |word, tf|
152 # filter out words that don't occur enough times in the source
153 next if options[:min_term_freq] && tf < options[:min_term_freq]
155 # go through all the fields and find the largest document frequency
156 top_field = options[:field_names].first
158 options[:field_names].each do |field_name|
159 freq = reader.doc_freq(field_name, word)
# Remember the field in which the term is most frequent.
161 top_field = field_name
165 # filter out words that don't occur in enough docs
166 next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
167 next if doc_freq == 0 # index update problem ?
# tf*idf-style weighting; the score assignment itself is on a line not
# visible in this excerpt.
169 idf = similarity.idf(doc_freq, num_docs)
171 pq << FrequencyQueueItem.new(word, top_field, score)
# Ascending sort: best (highest-score) terms end up at the back, where
# create_query's pop loop consumes them first.
174 pq.sort! { |a,b| a.score<=>b.score }
# true if the given term should be ignored for similarity scoring:
# shorter than :min_word_length, longer than :max_word_length (each
# only enforced when > 0), or contained in the stop-word list.
178 def noise_word?(text, options)
181 (options[:min_word_length] > 0 && len < options[:min_word_length]) ||
182 (options[:max_word_length] > 0 && len > options[:max_word_length]) ||
# NOTE(review): this checks whether text is a KEY of the options hash,
# not a member of the stop-word list — almost certainly meant to be
# options[:stop_words].include?(text). Fix and add a test.
183 (options[:stop_words] && options.include?(text))
# Fetches this record's content for the given field name, trying in
# order: the AR attribute (self[field]), the instance variable @field,
# and finally a plain method call — covering virtual/derived fields
# that are indexed but not stored as attributes.
187 def content_for_field_name(field)
188 self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.send(field.to_sym)
193 class FrequencyQueueItem
194 attr_reader :word, :field, :score
195 def initialize(word, field, score)
196 @word = word; @field = field; @score = score