1 # Copyright (c) 2006 Kasper Weibel Nielsen-Refs, Thomas Lockney, Jens Krämer
3 # Permission is hereby granted, free of charge, to any person obtaining a copy
4 # of this software and associated documentation files (the "Software"), to deal
5 # in the Software without restriction, including without limitation the rights
6 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 # copies of the Software, and to permit persons to whom the Software is
8 # furnished to do so, subject to the following conditions:
10 # The above copyright notice and this permission notice shall be included in all
11 # copies or substantial portions of the Software.
13 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 require 'active_record'
24 # Yet another Ferret Mixin.
26 # This mixin adds full text search capabilities to any Rails model.
28 # It is heavily based on the original acts_as_ferret plugin done by
29 # Kasper Weibel and a modified version done by Thomas Lockney, which
30 # both can be found on
31 # http://ferret.davebalmain.com/trac/wiki/FerretOnRails
33 # Changes I did to the original version include:
35 # - automatic creation of missing index directories
36 # - I took out the storage of class names in the index, as I prefer
37 # the 'one model, one index'-approach. If needed, multiple models
38 # can share one index by using a common superclass for these.
39 # - separate index directories for different Rails environments, so
40 # unit tests don't mess up the production/development indexes.
41 # - default to AND queries, as this is the behaviour most users expect
42 # - index searcher instances are kept as class variables and will be re-used
43 # until an index change is detected, as opening a searcher is quite expensive
44 # this should improve search performance
45 # - query parser is kept as a class variable
48 # include the following in your model class (specifying the fields you want to get indexed):
49 # acts_as_ferret :fields => [ 'title', 'description' ]
51 # now you can use ModelClass.find_by_contents(query) to find instances of your model
52 # whose indexed fields match a given query. All query terms are required by default, but
53 # explicit OR queries are possible. This differs from the ferret default, but imho is the more
54 # often needed/expected behaviour (more query terms result in fewer results).
56 # Released under the MIT license.
59 # Kasper Weibel Nielsen-Refs (original author)
60 # Jens Kraemer <jk@jkraemer.net>
64 module ARFerret #:nodoc:
# Creates the directory +dir+ unless a directory already exists at that path.
# Dir.mkdir creates a single directory level only, so the parent of +dir+
# has to exist already.
66 def self.ensure_directory(dir)
67 Dir.mkdir dir unless File.directory? dir
70 # Make sure the default index base dir exists. By default, all indexes are created
71 # under RAILS_ROOT/index/RAILS_ENV.
# The resulting directory is remembered in @@index_dir.
72 def self.init_index_basedir
73 index_base = "#{RAILS_ROOT}/index"
# create the parent first, as Dir.mkdir only creates one directory level
74 ensure_directory index_base
75 @@index_dir = "#{index_base}/#{RAILS_ENV}"
76 ensure_directory @@index_dir
# module-level accessor for the index base directory
# (presumably backed by the @@index_dir set above - confirm)
79 mattr_accessor :index_dir
# Hook that Ruby invokes when this module gets included into +base+.
# Extends +base+ with ClassMethods, so that the methods of that module
# become class-level methods of +base+.
82 def self.append_features(base)
84 base.extend(ClassMethods)
87 # declare the class level helper methods
88 # which will load the relevant instance methods defined below when invoked
92 # helper that defines a "<field>_to_ferret" instance method which wraps the
# value of the given field in a Ferret::Document::Field.
#
# field::   name of the attribute, instance variable or method to index
# options:: Hash overriding the default field settings (:store, :index,
#           :term_vector, :binary, :boost); a non-Hash value is ignored
94 def define_to_field_method(field, options = {})
# defaults visible here: value is not stored, is tokenized, no term vectors
95 default_opts = { :store => Ferret::Document::Field::Store::NO,
96 :index => Ferret::Document::Field::Index::TOKENIZED,
97 :term_vector => Ferret::Document::Field::TermVector::NO,
101 default_opts.update(options) if options.is_a?(Hash)
# remember the field, so it gets picked up when a document is built
102 fields_for_ferret << field
103 define_method("#{field}_to_ferret".to_sym) do
# lookup order: record attribute, then instance variable of that name,
# then the result of calling the method named after the field
105 val = self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.method(field).call
# $! is the most recently raised exception.
# NOTE(review): presumably this sits in a rescue branch around the lookup
# above; the begin/rescue lines are not visible here - confirm
107 logger.debug("Error retrieving value for field #{field}: #{$!}")
110 logger.debug("Adding field #{field} with value '#{val}' to index")
111 Ferret::Document::Field.new(field.to_s, val,
112 default_opts[:store],
113 default_opts[:index],
114 default_opts[:term_vector],
115 default_opts[:binary],
116 default_opts[:boost])
120 # TODO: do we need to define this at this level ? Maybe it's
121 # sufficient to do this only in classes calling acts_as_ferret ?
# Always answers false.
# NOTE(review): presumably consulted by Rails' class reloading to keep the
# extended classes from being reloaded - confirm.
122 def reloadable?; false end
# cache of index instances, keyed by index directory (filled by ferret_index)
124 @@ferret_indexes = Hash.new
125 def ferret_indexes; @@ferret_indexes end
# cache of MultiIndex instances, keyed by the concatenated names of the
# model classes they search (filled by multi_index)
127 @@multi_indexes = Hash.new
128 def multi_indexes; @@multi_indexes end
130 # declares a class as ferret-searchable.
#
# options may be:
134 # fields:: names all fields to include in the index. If not given,
135 # all attributes of the class will be indexed. You may also give
136 # symbols pointing to instance methods of your model here, e.g.
137 # to retrieve and index data from a related model.
# Instead of a list, a Hash mapping each field name to a Hash of
# field options (see define_to_field_method) may be given.
139 # index_dir:: declares the directory where to put the index for this class.
140 # The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
141 # The index directory will be created if it doesn't exist.
143 # store_class_name:: to make search across multiple models useful, set
144 # this to true. The model class name will then be stored in a keyword field
# named class_name.
147 # ferret_options may be:
148 # occur_default:: - whether query terms are required by
149 # default (the default), or not. Specify one of
150 # Ferret::Search::BooleanClause::Occur::MUST or
151 # Ferret::Search::BooleanClause::Occur::SHOULD
153 # analyzer:: the analyzer to use for query parsing (default: nil,
154 # which means the ferret default Analyzer gets used)
156 def acts_as_ferret(options={}, ferret_options={})
159 :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name}",
160 :store_class_name => false
# defaults for the options handed to Ferret when the index is opened
162 ferret_configuration = {
163 :occur_default => Search::BooleanClause::Occur::MUST,
164 :handle_parse_errors => true,
165 :default_search_field => '*',
166 # :analyzer => Analysis::StandardAnalyzer.new,
167 # :wild_lower => true
# user supplied settings override the defaults; non-Hash arguments are ignored
169 configuration.update(options) if options.is_a?(Hash)
170 ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
171 # these properties are somewhat vital to the plugin and shouldn't
172 # be overwritten by the user:
173 ferret_configuration.update(
175 :path => configuration[:index_dir],
177 :create_if_missing => true
# mix in the per-record indexing methods ...
181 include FerretMixin::Acts::ARFerret::InstanceMethods
# ... and hook them into the life cycle of the model
183 before_create :ferret_before_create
184 before_update :ferret_before_update
185 after_create :ferret_create
186 after_update :ferret_update
187 after_destroy :ferret_destroy
# class-level accessors for the field list and both configuration hashes
189 cattr_accessor :fields_for_ferret
190 cattr_accessor :configuration
191 cattr_accessor :ferret_configuration
193 @@fields_for_ferret = Array.new
194 @@configuration = configuration
195 @@ferret_configuration = ferret_configuration
# :fields given as a Hash: each key is a field name, each value its field options
197 if configuration[:fields].respond_to?(:each_pair)
198 configuration[:fields].each_pair do |key,val|
199 define_to_field_method(key,val)
# :fields given as a plain list: every field gets the default field options
201 elsif configuration[:fields].respond_to?(:each)
202 configuration[:fields].each do |field|
203 define_to_field_method(field)
# no usable :fields given: nil signals that all attributes are to be
# indexed (see the fields:: note above)
206 @@fields_for_ferret = nil
# make sure the index directory of this class exists
209 FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
# NOTE(review): presumably the body of class_index_dir (its def line is not
# visible here): answers the index directory configured for this class.
213 configuration[:index_dir]
216 # rebuild the index from all data stored for this model.
217 # This is called automatically when no index exists yet.
219 # TODO: the automatic index initialization only works if
220 # every model class has its
221 # own index, otherwise the index will get populated only
222 # with instances from the first model loaded
# open the index with :create => true on top of the regular ferret options
224 index = Index::Index.new(ferret_configuration.merge(:create => true))
# add one document per record of this model
225 self.find_all.each { |content| index << content.to_doc }
226 logger.debug("Created Ferret index in: #{class_index_dir}")
232 # Retrieve the Ferret::Index::Index instance for this model class.
234 # Index instances are stored in a hash, using the index directory
235 # as the key. So model classes sharing a single index directory will also
# share one Index instance. The instance is created on first access.
238 ferret_indexes[class_index_dir] ||= create_index_instance
241 # creates a new Index::Index instance. Before that, a check is done
242 # to see if the index exists in the file system. If not, index rebuild
243 # from all model data retrieved by find_all is triggered.
244 def create_index_instance
# a missing "segments" file in the index directory is taken to mean that
# no index has been built yet
245 rebuild_index unless File.file? "#{class_index_dir}/segments"
246 Index::Index.new(ferret_configuration)
249 # Finds instances by contents. Terms are ANDed by default, can be circumvented
250 # by using OR between terms.
#
# q::       the query, handed on to find_id_by_contents
# options:: handed on to find_id_by_contents; the comments here mention:
252 # :first_doc - first hit to retrieve (useful for paging)
253 # :num_docs - number of hits to retrieve
254 def find_by_contents(q, options = {})
# collect the ids of all hits
256 find_id_by_contents(q, options).each do |element|
257 id_array << element[:id]
259 logger.debug "id_array: #{id_array.inspect}"
# direct subclasses of ActiveRecord::Base are looked up by id
261 if self.superclass == ActiveRecord::Base
262 result = self.find(id_array)
264 # no direct subclass of Base --> STI
265 # TODO: AR will filter out hits from other classes for us, but this
266 # will lead to less results retrieved --> scoping of ferret query
267 # to self.class is still needed.
268 result = self.find(:all, :conditions => ["id in (?)",id_array])
# NOTE(review): presumably logged from a rescue branch (not visible here)
# when the lookup above raises - confirm
271 logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
273 logger.debug "Result id_array: #{id_array.inspect}, result: #{result}"
277 # Finds instance model name, ids and scores by contents.
278 # Useful if you want to search across models
279 # Terms are ANDed by default, can be circumvented by using OR between terms.
281 # Example controller code (not tested):
282 # def multi_search(query)
284 # result << (Model1.find_id_by_contents query)
285 # result << (Model2.find_id_by_contents query)
286 # result << (Model3.find_id_by_contents query)
288 # result.sort! {|element| element[:score]}
289 # # Figure out for yourself how to retrieve and present the data from modelname and id
292 # Note that the scores retrieved this way aren't normalized across
293 # indexes, so that the order of results after sorting by score will
294 # differ from the order you would get when running the same query
295 # on a single index containing all the data from Model1, Model2
#
# q::       the query, handed on to the search method of the index
# options:: handed on to the search method of the index unchanged; the
#           comments here mention:
299 # :first_doc - first hit to retrieve (useful for paging)
300 # :num_docs - number of hits to retrieve
301 def find_id_by_contents(q, options = {})
303 hits = ferret_index.search(q, options)
# one Hash per hit: name of this class, id read from the hit's indexed
# document, and the score of the hit
304 hits.each do |hit, score|
305 result << {:model => self.name, :id => ferret_index[hit][:id], :score => score}
307 logger.debug "id_score_model array: #{result.inspect}"
# Searches across this model and the given additional models and collects
# the matching records.
311 # requires the store_class_name option of acts_as_ferret to be true
312 # for all models queried this way.
314 # TODO: not optimal, as each instance is fetched in a db call of its own.
316 def multi_search(query, additional_models = [], options = {})
# resolve each hit's class by name and load the record by its id
318 id_multi_search(query, additional_models, options).each { |hit|
319 result << Object.const_get(hit[:model]).find(hit[:id].to_i)
324 # returns an array of hashes, each containing :model (the class name stored
325 # in the index), :id and :score for a hit.
327 def id_multi_search(query, additional_models = [], options = {})
# NOTE(review): this appends to the array object passed in by the caller
328 additional_models << self
329 searcher = multi_index(additional_models)
331 hits = searcher.search(query, options)
332 hits.each { |hit, score|
# class name and id are read from the fields of the hit's document
333 doc = searcher.doc(hit)
334 result << { :model => doc[:class_name], :id => doc[:id], :score => score }
339 # returns a MultiIndex instance operating on a MultiReader
# Instances are cached, so the same set of model classes gets the same
# MultiIndex again.
340 def multi_index(model_classes)
# sort (in place) by class name, so the cache key does not depend on the
# order the classes were given in
341 model_classes.sort! { |a, b| a.name <=> b.name }
# NOTE(review): names are concatenated without a separator, so different
# sets of classes could in theory produce the same key
342 key = model_classes.inject("") { |s, clazz| s << clazz.name }
343 @@multi_indexes[key] ||= MultiIndex.new(model_classes, ferret_configuration)
355 # todo: check for necessary index rebuilds in this place, too
356 # idea - each class gets a create_reader method that does this
#
# model_classes:: the model classes whose indexes are to be searched together
# options::       NOTE(review): presumably merged over the defaults listed
#                 below; the surrounding lines are not visible here - confirm
357 def initialize(model_classes, options = {})
358 @model_classes = model_classes
360 :default_search_field => '*',
361 :analyzer => Analysis::WhiteSpaceAnalyzer.new
# Runs +query+ on the searcher; +query+ is passed through process_query
# first, +options+ are handed on to the searcher unchanged.
366 def search(query, options={})
367 query = process_query(query)
368 searcher.search(query, options)
# NOTE(review): the def lines these statements belong to are not visible
# here; a method named searcher is called from search.
# Opens the multi reader if there is none yet, and replaces it when it
# reports that it is no longer the latest.
372 create_new_multi_reader unless @reader
373 unless @reader.latest?
375 @searcher.close # will close the multi_reader and all sub_readers as well
377 @reader.close # just close the reader
379 create_new_multi_reader
# build the searcher lazily on top of the current reader
386 @searcher ||= Search::IndexSearcher.new(@reader)
# query parser, created on first use from the :default_search_field option
# and the complete @options hash
# NOTE(review): presumably the body of query_parser (def not visible here)
394 @query_parser ||= QueryParser.new(@options[:default_search_field], @options)
# Parses +query+ with the query parser if it is given as a String.
397 def process_query(query)
398 query = query_parser.parse(query) if query.is_a?(String)
402 # creates a new MultiReader to search the given Models
# and stores it in @reader
403 def create_new_multi_reader
# one IndexReader per model class, opened on the index directory of that class
404 sub_readers = @model_classes.map { |clazz|
405 Index::IndexReader.open(clazz.class_index_dir)
407 @reader = Index::MultiReader.new(sub_readers)
# hand the field names reported by the new reader to the query parser
408 query_parser.fields = @reader.get_field_names.to_a
# Instance methods that acts_as_ferret includes into the model class.
413 module InstanceMethods
# NOTE(review): if this assignment sits directly in the module body, it sets
# an instance variable on the module object, not on model instances - confirm
416 @ferret_reindex = true
# Flags the record for (re)indexing. Registered by acts_as_ferret as
# before_update and, through the alias below, as before_create callback.
418 def ferret_before_update
419 @ferret_reindex = true
421 alias :ferret_before_create :ferret_before_update
# NOTE(review): presumably the body of ferret_create (its def line is not
# visible here; the alias below refers to it).
# Adds the document built from this record to the index of its class, but
# only if the reindex flag is set; the flag is set again afterwards.
425 logger.debug "ferret_create/update: #{self.class.name} : #{self.id}"
426 self.class.ferret_index << self.to_doc if @ferret_reindex
427 @ferret_reindex = true
# the after_update callback runs the same code as the after_create one
430 alias :ferret_update :ferret_create
# NOTE(review): presumably part of ferret_destroy (registered as
# after_destroy callback by acts_as_ferret; its def line is not visible here).
# Deletes the documents matching this record's id from the index.
# NOTE(review): the query matches on the id field only; in an index shared
# by several models it would also match other models' records having the
# same id - confirm
435 self.class.ferret_index.query_delete("+id:#{self.id}")
# NOTE(review): presumably reached from a rescue branch (not visible here)
437 logger.warn("Could not find indexed value for this object")
442 # convert instance to ferret document
# NOTE(review): presumably the body of to_doc (its def line is not visible here)
444 logger.debug "creating doc for class: #{self.class.name}"
445 # Churn through the complete Active Record and add it to the Ferret document
446 doc = Document::Document.new
447 # store the id of each item
# (stored and untokenized)
448 doc << Document::Field.new( "id", self.id,
449 Document::Field::Store::YES,
450 Document::Field::Index::UNTOKENIZED )
451 # store the class name if configured to do so
452 if configuration[:store_class_name]
453 doc << Document::Field.new( "class_name", self.class.name,
454 Document::Field::Store::YES,
455 Document::Field::Index::UNTOKENIZED )
457 # iterate through the fields and add them to the document
459 # have user defined fields
# each configured field is added via its generated <field>_to_ferret method
460 fields_for_ferret.each do |field|
461 doc << self.send("#{field}_to_ferret")
# index every attribute of the record, not stored and tokenized
# NOTE(review): presumably the branch taken when no fields were configured;
# the if/else lines are not visible here - confirm
465 self.attributes.each_pair do |key,val|
467 logger.debug "add field #{key} with value #{val}"
468 doc << Document::Field.new(
471 Ferret::Document::Field::Store::NO,
472 Ferret::Document::Field::Index::TOKENIZED)
484 # reopen ActiveRecord and include all the above to make
485 # them available to all our models if they want it
# Including the module runs ARFerret.append_features with ActiveRecord::Base
# as argument, which extends it with ClassMethods.
486 ActiveRecord::Base.class_eval do
487 include FerretMixin::Acts::ARFerret
# Reopens Ferret's MultiReader class.
490 class Ferret::Index::MultiReader
492 # TODO: Exception handling added to resolve ticket #6.
493 # It should be clarified whether this is a bug in Ferret
494 # in which case a bug report should be posted on the Ferret Trac.
# NOTE(review): presumably the body of latest? (its def line is not visible
# here): answers false as soon as one of the sub readers is not the latest.
496 @sub_readers.each { |r| return false unless r.latest? }
504 # END acts_as_ferret.rb