1 # Copyright (c) 2006 Kasper Weibel Nielsen-Refs, Thomas Lockney, Jens Krämer
3 # Permission is hereby granted, free of charge, to any person obtaining a copy
4 # of this software and associated documentation files (the "Software"), to deal
5 # in the Software without restriction, including without limitation the rights
6 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 # copies of the Software, and to permit persons to whom the Software is
8 # furnished to do so, subject to the following conditions:
10 # The above copyright notice and this permission notice shall be included in all
11 # copies or substantial portions of the Software.
13 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 require 'active_record'
23 # Ferret 0.3.2 is considered the most reliable ferret version for now, all unit
24 # tests should pass (with or w/o the C extension). Speed is not as good as with the
25 # C-only Ferret 0.9.1, but still fast enough for common scenarios and work
26 # loads. Until Ferret 0.9.x stabilizes, you should consider this
27 # version for production scenarios.
28 require_gem 'ferret', '=0.3.2'
30 # Ferret >=0.9, Ruby-only, is much slower than 0.3.2 with its small C
31 # extension, so it's not really an option.
32 # some tests related to searching multiple indexes at once are failing here
33 # (returning more results than expected)
36 # This will use the most recent installed ferret version, usually this will be
37 # 0.9.1 in the C-flavour. Difficult topic, as some parts of the API are not
38 # accessible yet. Several tests fail with this version, but basic single-index
39 # functionality is there and working.
42 # Yet another Ferret Mixin.
44 # This mixin adds full text search capabilities to any Rails model.
46 # It is heavily based on the original acts_as_ferret plugin done by
47 # Kasper Weibel and a modified version done by Thomas Lockney, which
48 # both can be found on
49 # http://ferret.davebalmain.com/trac/wiki/FerretOnRails
51 # Changes I did to the original version include:
53 # - automatic creation of missing index directories
54 # - I took out the storage of class names in the index, as I prefer
55 # the 'one model, one index'-approach. If needed, multiple models
56 # can share one index by using a common superclass for these.
57 # - separate index directories for different Rails environments, so
58 # unit tests don't mess up the production/development indexes.
59 # - default to AND queries, as this is the behaviour most users expect
60 # - index searcher instances are kept as class variables and will be re-used
61 # until an index change is detected, as opening a searcher is quite expensive
62 # this should improve search performance
63 # - query parser is kept as a class variable
66 # include the following in your model class (specifying the fields you want to get indexed):
67 # acts_as_ferret :fields => [ 'title', 'description' ]
69 # now you can use ModelClass.find_by_contents(query) to find instances of your model
70 # whose indexed fields match a given query. All query terms are required by default, but
71 # explicit OR queries are possible. This differs from the ferret default, but imho is the more
72 # often needed/expected behaviour (more query terms result in fewer results).
74 # Released under the MIT license.
77 # Kasper Weibel Nielsen-Refs (original author)
78 # Jens Kraemer <jk@jkraemer.net>
82 module ARFerret #:nodoc:
# Makes sure +dir+ exists, creating it together with any missing parent
# directories.
#
# FileUtils.mkdir_p does nothing for a directory that already exists, so no
# separate File.directory? pre-check is needed (the pre-check only opened a
# window between testing and creating).
#
# dir:: path of the directory that must exist afterwards
def self.ensure_directory(dir)
  FileUtils.mkdir_p(dir)
end
88 # make sure the default index base dir exists. by default, all indexes are created
89 # under RAILS_ROOT/index/RAILS_ENV
# Prepares the default index location: "#{RAILS_ROOT}/index" and, below it,
# one directory per Rails environment. The environment-specific path is
# remembered in @@index_dir.
def self.init_index_basedir
  base_dir = "#{RAILS_ROOT}/index"
  env_dir = "#{base_dir}/#{RAILS_ENV}"
  ensure_directory base_dir
  @@index_dir = env_dir
  ensure_directory @@index_dir
end
97 mattr_accessor :index_dir
# Module inclusion hook: when this module is mixed into +base+, the class
# level API defined in ClassMethods is added to +base+ itself via extend.
#
# base:: the class or module this mixin is being included into
100 def self.append_features(base)
102 base.extend(ClassMethods)
105 # declare the class level helper methods
106 # which will load the relevant instance methods defined below when invoked
109 # helper that defines a method that adds the given field to a lucene document
# Registers +field+ for indexing and defines the instance method
# "<field>_to_ferret", which wraps the field's current value in a
# Ferret::Document::Field.
#
# field::   name of the attribute, instance variable or method to index
# options:: per-field overrides merged over the defaults below; ignored
#           unless it is a Hash. The keys read later on are :store, :index,
#           :term_vector, :binary and :boost.
111 def define_to_field_method(field, options = {})
112 default_opts = { :store => Ferret::Document::Field::Store::NO,
113 :index => Ferret::Document::Field::Index::TOKENIZED,
114 :term_vector => Ferret::Document::Field::TermVector::NO,
118 default_opts.update(options) if options.is_a?(Hash)
# remember the field in the class-level list of indexed fields
119 fields_for_ferret << field
120 define_method("#{field}_to_ferret".to_sym) do
# value lookup order: record attribute, then instance variable of the same
# name, then the result of calling the method named +field+; a nil/false
# result falls through to the next source
122 val = self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.method(field).call
# $! holds the exception that was raised while retrieving the value
124 logger.debug("Error retrieving value for field #{field}: #{$!}")
127 logger.debug("Adding field #{field} with value '#{val}' to index")
# default_opts is captured by this block, so each generated method keeps
# the options that were in effect when it was defined
128 Ferret::Document::Field.new(field.to_s, val,
129 default_opts[:store],
130 default_opts[:index],
131 default_opts[:term_vector],
132 default_opts[:binary],
133 default_opts[:boost])
137 # TODO: do we need to define this at this level ? Maybe it's
138 # sufficient to do this only in classes calling acts_as_ferret ?
# Always answers false.
# NOTE(review): presumably consulted by Rails' class reloading — confirm.
def reloadable?
  false
end
# Registry of already opened index instances; ferret_index stores them here
# keyed by index directory.
@@ferret_indexes = {}

# Returns the registry of opened index instances.
def ferret_indexes
  @@ferret_indexes
end
# Registry of multi-model indexes; multi_index stores them here under a key
# built from the names of the participating model classes.
@@multi_indexes = {}

# Returns the registry of multi-model indexes.
def multi_indexes
  @@multi_indexes
end
147 # declares a class as ferret-searchable.
151 # fields:: names all fields to include in the index. If not given,
152 # all attributes of the class will be indexed. You may also give
153 # symbols pointing to instance methods of your model here, i.e.
154 # to retrieve and index data from a related model.
156 # index_dir:: declares the directory where to put the index for this class.
157 # The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
158 # The index directory will be created if it doesn't exist.
160 # store_class_name:: to make search across multiple models useful, set
161 # this to true. the model class name will be stored in a keyword field
164 # ferret_options may be:
165 # occur_default:: - whether query terms are required by
166 # default (the default), or not. Specify one of
167 # Ferret::Search::BooleanClause::Occur::MUST or
168 # Ferret::Search::BooleanClause::Occur::SHOULD
170 # analyzer:: the analyzer to use for query parsing (default: nil,
171 # which means the ferret default Analyzer gets used)
# Effects visible in this method: mixes InstanceMethods into the calling
# class, registers the ferret_* ActiveRecord callbacks, declares the class
# level accessors fields_for_ferret / configuration / ferret_configuration,
# defines one "<field>_to_ferret" method per configured field and makes sure
# the index directory exists.
#
# options::        plugin options (:fields, :index_dir, :store_class_name);
#                  ignored unless it is a Hash
# ferret_options:: options merged into the Ferret configuration; ignored
#                  unless it is a Hash
173 def acts_as_ferret(options={}, ferret_options={})
# plugin-level defaults; the default index directory is derived from the
# underscored class name
176 :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
177 :store_class_name => false
# defaults handed to Ferret
179 ferret_configuration = {
180 :occur_default => Ferret::Search::BooleanClause::Occur::MUST,
181 :handle_parse_errors => true,
182 :default_search_field => '*',
183 # :analyzer => Analysis::StandardAnalyzer.new,
184 # :wild_lower => true
# user supplied values win over the defaults above
186 configuration.update(options) if options.is_a?(Hash)
187 ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
188 # these properties are somewhat vital to the plugin and shouldn't
189 # be overwritten by the user:
190 ferret_configuration.update(
192 :path => configuration[:index_dir],
194 :create_if_missing => true
198 include FerretMixin::Acts::ARFerret::InstanceMethods
# keep the index in sync with the database through model callbacks
200 before_create :ferret_before_create
201 before_update :ferret_before_update
202 after_create :ferret_create
203 after_update :ferret_update
204 after_destroy :ferret_destroy
206 cattr_accessor :fields_for_ferret
207 cattr_accessor :configuration
208 cattr_accessor :ferret_configuration
# starts out empty; define_to_field_method appends every declared field
210 @@fields_for_ferret = Array.new
211 @@configuration = configuration
212 @@ferret_configuration = ferret_configuration
# :fields given as a Hash-like object: field name => per-field options
214 if configuration[:fields].respond_to?(:each_pair)
215 configuration[:fields].each_pair do |key,val|
216 define_to_field_method(key,val)
# :fields given as a plain list of field names: default options per field
218 elsif configuration[:fields].respond_to?(:each)
219 configuration[:fields].each do |field|
220 define_to_field_method(field)
# no usable :fields option: nil marks "no explicit field list"
# (presumably to_doc then indexes all attributes — confirm)
223 @@fields_for_ferret = nil
226 FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
230 configuration[:index_dir]
233 # rebuild the index from all data stored for this model.
234 # This is called automatically when no index exists yet.
236 # TODO: the automatic index initialization only works if
237 # every model class has its
238 # own index, otherwise the index will get populated only
239 # with instances from the first model loaded
241 index = Ferret::Index::Index.new(ferret_configuration.merge(:create => true))
242 self.find_all.each { |content| index << content.to_doc }
243 logger.debug("Created Ferret index in: #{class_index_dir}")
249 # Retrieve the Ferret::Index::Index instance for this model class.
251 # Index instances are stored in a hash, using the index directory
252 # as the key. So model classes sharing a single index will share their
255 ferret_indexes[class_index_dir] ||= create_index_instance
258 # creates a new Index::Index instance. Before that, a check is done
259 # to see if the index exists in the file system. If not, index rebuild
260 # from all model data retrieved by find(:all) is triggered.
# Opens the index of this model class. If the index directory does not yet
# contain a "segments" file, the index is rebuilt first.
#
# Returns a new Ferret::Index::Index built from ferret_configuration.
def create_index_instance
  segments_file = "#{class_index_dir}/segments"
  rebuild_index unless File.file?(segments_file)
  Ferret::Index::Index.new(ferret_configuration)
end
266 # Finds instances by contents. Terms are ANDed by default, can be circumvented
267 # by using OR between terms.
269 # :first_doc - first hit to retrieve (useful for paging)
270 # :num_docs - number of hits to retrieve
272 # find_options is a hash passed on to active_record's find when
273 # retrieving the data from db, useful to i.e. prefetch relationships.
# Finds model instances whose indexed contents match +q+.
#
# q::            query handed on to find_id_by_contents
# options::      search options, e.g. :first_doc (first hit to retrieve,
#                useful for paging) and :num_docs (number of hits to retrieve)
# find_options:: hash passed on to ActiveRecord's find when loading the
#                records, useful to e.g. prefetch relationships
#
# Returns the matching records ordered by descending score. If loading the
# records fails (typically because the index references ids that no longer
# exist) the problem is logged and an empty array is returned.
def find_by_contents(q, options = {}, find_options = {})
  id_array = []
  scores_by_id = {}
  find_id_by_contents(q, options) do |element|
    id = element[:id].to_i
    id_array << id
    scores_by_id[id] = element[:score]
  end

  begin
    if self.superclass == ActiveRecord::Base
      result = self.find(id_array, find_options)
    else
      # no direct subclass of Base --> STI
      # TODO: AR will filter out hits from other classes for us, but this
      # will lead to less results retrieved --> scoping of ferret query
      # to self.class is still needed.
      result = self.find(:all,
                         find_options.merge(:conditions => ["id in (?)", id_array]))
    end
  rescue StandardError
    logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
    # Without this fallback +result+ would still be nil here and the sort
    # below would fail with a NoMethodError.
    result = []
  end

  # sort results by score (descending); a copy is sorted so the array
  # returned by find is left alone
  result = result.sort { |a, b| scores_by_id[b.id] <=> scores_by_id[a.id] }

  logger.debug "Query: #{q}\nResult id_array: #{id_array.inspect},\nresult: #{result},\nscores: #{scores_by_id.inspect}"
  result
end
303 # Finds instance model name, ids and scores by contents.
304 # Useful if you want to search across models
305 # Terms are ANDed by default, can be circumvented by using OR between terms.
307 # Example controller code (not tested):
308 # def multi_search(query)
310 # result << (Model1.find_id_by_contents query)
311 # result << (Model2.find_id_by_contents query)
312 # result << (Model3.find_id_by_contents query)
314 # result.sort! {|element| element[:score]}
315 # # Figure out for yourself how to retrieve and present the data from modelname and id
318 # Note that the scores retrieved this way aren't normalized across
319 # indexes, so that the order of results after sorting by score will
320 # differ from the order you would get when running the same query
321 # on a single index containing all the data from Model1, Model2
325 # :first_doc - first hit to retrieve (useful for paging)
326 # :num_docs - number of hits to retrieve
328 # a block can be given too, it will be executed with every result hash:
329 # find_id_by_contents(q, options) do |element|
330 # id_array << id = element[:id].to_i
331 # scores_by_id[id] = element[:score]
# Searches this model's index and reports every hit as a hash with the keys
# :model (name of this class), :id (the id stored in the indexed document)
# and :score.
#
# q::       query passed to the index' search method
# options:: search options, e.g. :first_doc and :num_docs
#
# If a block is given it is called with each hit hash as soon as the hash
# has been built. Returns the array of all hit hashes.
def find_id_by_contents(q, options = {})
  # resolve the index once instead of once per hit
  index = ferret_index
  result = []
  index.search(q, options).each do |hit, score|
    entry = { :model => self.name, :id => index[hit][:id], :score => score }
    result << entry
    yield entry if block_given?
  end
  logger.debug "id_score_model array: #{result.inspect}"
  result
end
345 # requires the store_class_name option of acts_as_ferret to be true
346 # for all models queried this way.
348 # TODO: not optimal as each instance is fetched in a db call for its
# Searches this model's index together with the indexes of
# +additional_models+ and loads the record behind every hit.
#
# query::             query passed on to id_multi_search
# additional_models:: further model classes to search
# options::           search options passed on to id_multi_search
#
# Returns an array of model instances, one per hit, in hit order. Each
# record is loaded with its own find call.
def multi_search(query, additional_models = [], options = {})
  id_multi_search(query, additional_models, options).map do |hit|
    # Resolve the class name one namespace segment at a time so that names
    # such as "Admin::User" work as well.
    segments = hit[:model].to_s.split('::').reject { |segment| segment.empty? }
    model = segments.inject(Object) { |scope, const_name| scope.const_get(const_name) }
    model.find(hit[:id].to_i)
  end
end
358 # returns an array of hashes, each containing :class_name,
359 # :id and :score for a hit.
# Searches this model's index together with the indexes of
# +additional_models+ without touching the database.
#
# query::             query passed to the multi index' search method
# additional_models:: further model classes to search; the receiver is
#                     always searched as well
# options::           search options passed to the multi index
#
# Returns an array of hashes, each containing :model (the class name stored
# in the document), :id and :score for a hit.
def id_multi_search(query, additional_models = [], options = {})
  # Build a fresh list so the caller's array is never modified, and drop
  # duplicates so no index is searched twice.
  models = (Array(additional_models) + [self]).uniq
  searcher = multi_index(models)
  result = []
  searcher.search(query, options).each do |hit, score|
    doc = searcher.doc(hit)
    result << { :model => doc[:class_name], :id => doc[:id], :score => score }
  end
  result
end
373 # returns a MultiIndex instance operating on a MultiReader
# Returns the MultiIndex for the given combination of model classes,
# creating and caching it on first use.
#
# model_classes:: the model classes to search together; the array itself is
#                 left unchanged
#
# The cache key is the sorted, comma separated list of class names, so the
# order in which the classes are given does not matter and different
# combinations cannot produce the same key.
def multi_index(model_classes)
  sorted = model_classes.sort_by { |clazz| clazz.name }
  key = sorted.map { |clazz| clazz.name }.join(',')
  multi_indexes[key] ||= MultiIndex.new(sorted, ferret_configuration)
end
388 # todo: check for necessary index rebuilds in this place, too
389 # idea - each class gets a create_reader method that does this
# model_classes:: the model classes whose indexes are combined; each one is
#                 asked for its class_index_dir when the readers are opened
# options::       NOTE(review): presumably merged over the two defaults
#                 listed below — the merging line is not visible, confirm
390 def initialize(model_classes, options = {})
391 @model_classes = model_classes
393 :default_search_field => '*',
394 :analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new
# Runs +query+ against the combined indexes.
#
# The query first goes through process_query (which parses String queries);
# +options+ are handed to the underlying searcher unchanged.
def search(query, options={})
  parsed_query = process_query(query)
  searcher.search(parsed_query, options)
end
405 create_new_multi_reader unless @reader
406 unless @reader.latest?
408 @searcher.close # will close the multi_reader and all sub_readers as well
410 @reader.close # just close the reader
412 create_new_multi_reader
419 @searcher ||= Ferret::Search::IndexSearcher.new(@reader)
427 @query_parser ||= Ferret::QueryParser.new(@options[:default_search_field], @options)
# Normalizes a query before it is handed to the searcher.
#
# query:: either a query String, which is parsed with query_parser, or any
#         other object, which is passed through untouched
#
# Returns the parsed query for String input and +query+ itself otherwise.
# Both cases are spelled out so that the result never depends on the value
# of a trailing modifier-if, which is nil when the condition is false.
def process_query(query)
  query.is_a?(String) ? query_parser.parse(query) : query
end
435 # creates a new MultiReader to search the given Models
# Opens one IndexReader per model class (using each class' class_index_dir),
# combines them into a MultiReader stored in @reader and updates the query
# parser's field list from the fields known to that reader.
436 def create_new_multi_reader
437 sub_readers = @model_classes.map { |clazz|
438 Ferret::Index::IndexReader.open(clazz.class_index_dir)
440 @reader = Ferret::Index::MultiReader.new(sub_readers)
# let the query parser know every field name present in the combined reader
441 query_parser.fields = @reader.get_field_names.to_a
446 module InstanceMethods
448 @ferret_reindex = true
# Callback (registered by acts_as_ferret as before_update and, through the
# alias below, as before_create): flags the record for re-indexing. The flag
# is checked before the record's document is added to the index.
450 def ferret_before_update
451 @ferret_reindex = true
453 alias :ferret_before_create :ferret_before_update
457 logger.debug "ferret_create/update: #{self.class.name} : #{self.id}"
458 self.class.ferret_index << self.to_doc if @ferret_reindex
459 @ferret_reindex = true
462 alias :ferret_update :ferret_create
467 self.class.ferret_index.query_delete("+id:#{self.id}")
469 logger.warn("Could not find indexed value for this object")
474 # convert instance to ferret document
476 logger.debug "creating doc for class: #{self.class.name}"
477 # Churn through the complete Active Record and add it to the Ferret document
478 doc = Ferret::Document::Document.new
479 # store the id of each item
480 doc << Ferret::Document::Field.new( "id", self.id,
481 Ferret::Document::Field::Store::YES,
482 Ferret::Document::Field::Index::UNTOKENIZED )
483 # store the class name if configured to do so
484 if configuration[:store_class_name]
485 doc << Ferret::Document::Field.new( "class_name", self.class.name,
486 Ferret::Document::Field::Store::YES,
487 Ferret::Document::Field::Index::UNTOKENIZED )
489 # iterate through the fields and add them to the document
491 # have user defined fields
492 fields_for_ferret.each do |field|
493 doc << self.send("#{field}_to_ferret")
497 self.attributes.each_pair do |key,val|
499 logger.debug "add field #{key} with value #{val}"
500 doc << Ferret::Document::Field.new(
503 Ferret::Document::Field::Store::NO,
504 Ferret::Document::Field::Index::TOKENIZED)
511 # BIG TODO: this file really gets too big. need to refactor a bit...
512 # maybe extract the more like this stuff, could be useful somewhere
516 # returns other instances of this class, which have similar contents
517 # like this one. Basically works like this: find out n most interesting
518 # (i.e. characteristic) terms from this document, and then build a
519 # query from those which is run against the whole index. Which terms
520 # are interesting is decided on various criteria which can be
521 # influenced by the given options.
523 # The algorithm used here is a quite straight port of the MoreLikeThis class
524 # from Apache Lucene.
527 # :field_names : Array of field names to use for similarity search (mandatory)
528 # :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
529 # :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
530 # :min_word_length => nil, # Ignore words if less than this len (longer
531 # words tend to be more characteristic for the document they occur in).
532 # :max_word_length => nil, # Ignore words if greater than this len.
533 # :max_query_terms => 25, # maximum number of terms in the query built
534 # :max_num_tokens => 5000, # maximum number of tokens to examine in a
536 # :boost => false, # when true, a boost according to the
537 # relative score of a term is applied to this Term's TermQuery.
538 # :similarity => Ferret::Search::Similarity.default, # the similarity
539 # implementation to use
540 # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to
# Steps visible below: obtain an index reader, collect term frequencies for
# this record's document (retrieve_terms), rank the terms (create_queue),
# build a query from the ranking (create_query) and run it through
# find_by_contents of the model class.
542 def more_like_this(options={})
544 :field_names => nil, # Default field names
545 :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
546 :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
547 :min_word_length => nil, # Ignore words if less than this len.
548 :max_word_length => nil, # Ignore words if greater than this len.
549 :max_query_terms => 25, # maximum number of terms in the query built
550 :max_num_tokens => 5000,
552 :similarity => Ferret::Search::Similarity.default,
553 :analyzer => Ferret::Analysis::StandardAnalyzer.new
555 index = self.class.ferret_index
# reader is not reached through a public call here, hence send
557 reader = index.send(:reader)
559 # ferret >=0.9, C-Version doesn't allow access to Index#reader
560 reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false))
562 doc_number = self.document_number
# NOTE(review): the next line calls document_number again instead of using
# the doc_number local assigned above, which is otherwise unused here
563 term_freq_map = retrieve_terms(document_number, reader, options)
564 priority_queue = create_queue(term_freq_map, reader, options)
565 query = create_query(priority_queue, options)
566 self.class.find_by_contents(query)
# Builds a BooleanQuery from the ranked terms in +priority_queue+: every
# term becomes an optional (SHOULD) TermQuery, and a MUST_NOT clause on this
# record's own id is added.
#
# priority_queue:: array of items responding to to_term and score; items are
#                  taken from the end with pop
# options::        reads :max_query_terms
570 def create_query(priority_queue, options={})
571 query = Ferret::Search::BooleanQuery.new
# pop returns nil once the queue is empty, which ends the loop
574 while(cur = priority_queue.pop)
575 term_query = Ferret::Search::TermQuery.new(cur.to_term)
578 # boost term according to relative score
# the first item popped sets best_score; later boosts are relative to it
580 best_score ||= cur.score
581 term_query.boost = cur.score / best_score
584 query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD)
585 rescue Ferret::Search::BooleanQuery::TooManyClauses
# a :max_query_terms of 0 or less disables the limit
589 break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
# exclude this record itself from the result
592 t = Ferret::Index::Term.new('id', self.id.to_s)
593 query.add_query(Ferret::Search::TermQuery.new(t),
594 Ferret::Search::BooleanClause::Occur::MUST_NOT)
600 hits = self.class.ferret_index.search("id:#{self.id}")
601 hits.each { |hit, score| return hit }
604 # creates a term/term_frequency map for terms from the fields
605 # given in options[:field_names]
# Creates a term => term frequency map for the document +doc_number+, using
# the fields named in options[:field_names].
#
# doc_number:: number of the document inside the index
# reader::     index reader used to fetch term vectors and documents
# options::    reads :field_names, :max_num_tokens and :analyzer
#
# For every field the stored term vector is used when there is one;
# otherwise the stored field content is tokenized with options[:analyzer],
# looking at no more than :max_num_tokens tokens per field. Terms rejected
# by noise_word? are skipped.
#
# Returns a Hash (default value 0) mapping term text to its frequency.
def retrieve_terms(doc_number, reader, options)
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  field_names.each do |field|
    # use the document number we were given instead of looking it up again
    # with another index search for every field
    term_freq_vector = reader.get_term_vector(doc_number, field)
    if term_freq_vector
      # use stored term vector
      term_freq_vector.terms.each_with_index do |term, i|
        term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term)
      end
    else
      # no term vector stored, extract terms from document content
      # TODO: if no content stored, maybe use content from self ?
      doc = reader.get_document(doc_number)
      token_count = 0

      # C-Ferret >=0.9 again, no #each in tokenstream :-(
      ts = options[:analyzer].token_stream(field, doc[field])
      while (token = ts.next)
        break if (token_count += 1) > max_num_tokens
        text = token_text(token)
        next if noise_word?(text)
        term_freq_map[text] += 1
      end
    end
  end
  term_freq_map
end
638 # extract textual value of a token
# Returns the text of +token+: token.text when the token offers it,
# otherwise token.term_text (the accessor used by ferret 0.3.2).
def token_text(token)
  return token.text if token.respond_to?(:text)

  token.term_text
end
644 # create an ordered(by score) list of word,fieldname,score
# term_freq_map:: term => frequency map of the source document
# reader::        index reader, asked for num_docs and per-term doc_freq
# options::       reads :similarity, :field_names, :min_term_freq and
#                 :min_doc_freq
#
# The list is sorted ascending by score, so the best item is the last one.
646 def create_queue(term_freq_map, reader, options)
# NOTE(review): Array.new(n) pre-fills the array with n nils and the items
# below are appended after them; unless the nils are removed in a line not
# visible here, the sort at the end would hit nil — confirm
647 pq = Array.new(term_freq_map.size)
649 similarity = options[:similarity]
650 num_docs = reader.num_docs
651 term_freq_map.each_pair do |word, tf|
652 # filter out words that don't occur enough times in the source
653 next if options[:min_term_freq] && tf < options[:min_term_freq]
655 # go through all the fields and find the largest document frequency
656 top_field = options[:field_names].first
658 options[:field_names].each do |field_name|
659 freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word))
661 top_field = field_name
665 # filter out words that don't occur in enough docs
666 next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
667 next if doc_freq == 0 # index update problem ?
# inverse document frequency as computed by the configured similarity
669 idf = similarity.idf(doc_freq, num_docs)
671 pq << FrequencyQueueItem.new(word, top_field, score)
674 pq.sort! { |a,b| a.score<=>b.score }
678 def noise_word?(text)
# Value object for one ranked term: the word, the field it is attributed to
# and its score. Instances are created by create_queue.
684 class FrequencyQueueItem
685 attr_reader :word, :field, :score
686 def initialize(word, field, score)
687 @word = word; @field = field; @score = score
# builds the index term (field, word) for this item
# NOTE(review): presumably the body of the to_term method that create_query
# calls — its def line is not visible here, confirm
690 Ferret::Index::Term.new(self.field, self.word)
698 # reopen ActiveRecord and include all the above to make
699 # them available to all our models if they want it
700 ActiveRecord::Base.class_eval do
701 include FerretMixin::Acts::ARFerret
704 class Ferret::Index::MultiReader
706 # TODO: Exception handling added to resolve ticket #6.
707 # It should be clarified whether this is a bug in Ferret
708 # in which case a bug report should be posted on the Ferret Trac.
710 @sub_readers.each { |r| return false unless r.latest? }
718 # END acts_as_ferret.rb