3 module ARFerret #:nodoc:
5 # declare the class level helper methods
6 # which will load the relevant instance methods defined below when invoked
9 # helper that defines a method that adds the given field to a lucene
# Registers +field+ together with its effective index options in
# fields_for_ferret and defines an instance method named "<field>_to_ferret".
# field::   name of the field; also used to build the generated method name
# options:: per-field settings; they override the defaults below via Hash#update
11 def define_to_field_method(field, options = {})
16 :term_vector => :with_positions_offsets,
17 :boost => 1.0 }.update(options)
# remember the merged options; rebuild_index reads them from fields_for_ferret
18 fields_for_ferret[field] = options
19 define_method("#{field}_to_ferret".to_sym) do
# the generated method obtains the field's value through content_for_field_name
21 val = content_for_field_name(field)
# $! is the most recently raised exception; the failure is only logged as a warning
23 logger.warn("Error retrieving value for field #{field}: #{$!}")
26 logger.debug("Adding field #{field} with value '#{val}' to index")
# Calls define_to_field_method for every field described by +field_config+.
# field_config:: either a hash-like object (responds to each_pair) mapping
#                field names to option hashes, or any other enumerable of
#                field names, which are registered with the default options.
31 def add_fields(field_config)
# hash-like input: each key is a field name, each value its options
32 if field_config.respond_to?(:each_pair)
33 field_config.each_pair do |key,val|
34 define_to_field_method(key,val)
# plain list of names: no per-field options are passed
36 elsif field_config.respond_to?(:each)
37 field_config.each do |field|
38 define_to_field_method(field)
# NOTE(review): presumably consulted by Rails' dependency reloading so that this
# module is not reloaded between requests - confirm
43 def reloadable?; false end
# cache of index instances; entries are stored under class_index_dir
45 @@ferret_indexes = Hash.new
46 def ferret_indexes; @@ferret_indexes end
# cache of MultiIndex instances; multi_index stores them under a key built
# from the names of the participating model classes
48 @@multi_indexes = Hash.new
49 def multi_indexes; @@multi_indexes end
51 # declares a class as ferret-searchable.
55 # fields:: names all fields to include in the index. If not given,
56 # all attributes of the class will be indexed. You may also give
57 # symbols pointing to instance methods of your model here, i.e.
58 # to retrieve and index data from a related model.
60 # additional_fields:: names fields to include in the index, in addition
61 # to those derived from the db scheme. use if you want to add
62 # custom fields derived from methods to the db fields (which will be picked
63 # by aaf). This option will be ignored when the fields option is given, in
64 # that case additional fields get specified there.
66 # index_dir:: declares the directory where to put the index for this class.
67 # The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
68 # The index directory will be created if it doesn't exist.
70 # single_index:: set this to true to let this class use a Ferret
71 # index that is shared by all classes having :single_index set to true.
72 # :store_class_name is set to true implicitly, as well as index_dir, so
73 # don't bother setting these when using this option. the shared index
74 # will be located in index/<RAILS_ENV>/shared .
76 # store_class_name:: to make search across multiple models useful, set
77 # this to true. the model class name will be stored in a keyword field
80 # max_results:: number of results to retrieve for :num_docs => :all,
81 # default value is 1000
83 # ferret_options may be:
84 # or_default:: - whether query terms are required by
85 # default (the default, false), or not (true)
87 # analyzer:: the analyzer to use for query parsing (default: nil,
88 # which means the ferret StandardAnalyzer gets used)
# Entry point that makes the calling model class searchable through Ferret.
# options::        plugin settings (see the option list above); merged over the
#                  defaults only when a Hash is given
# ferret_options:: settings for the Ferret index; merged only when a Hash is
#                  given. :key, :path and :create_if_missing are applied by the
#                  plugin afterwards and therefore win over user-supplied values.
90 def acts_as_ferret(options={}, ferret_options={})
# default index location: <plugin index_dir>/<underscored class name>
92 :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
93 :store_class_name => false,
94 :single_index => false,
97 ferret_configuration = {
99 :handle_parser_errors => true
100 #:max_clauses => 512,
101 #:default_field => '*',
102 #:analyzer => Ferret::Analysis::StandardAnalyzer.new,
103 # :wild_card_downcase => true
105 configuration.update(options) if options.is_a?(Hash)
107 # apply appropriate settings for shared index
108 if configuration[:single_index]
# a shared index overrides any user-supplied :index_dir and forces class names to be stored
109 configuration[:index_dir] = "#{FerretMixin::Acts::ARFerret::index_dir}/shared"
110 configuration[:store_class_name] = true
112 ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
113 # these properties are somewhat vital to the plugin and shouldn't
114 # be overwritten by the user:
115 ferret_configuration.update(
# in a shared index the id alone is not unique, so the key is the pair [:id, :class_name]
117 :key => (configuration[:single_index] ? [:id, :class_name] : :id),
118 :path => configuration[:index_dir],
120 :create_if_missing => true
124 include FerretMixin::Acts::ARFerret::InstanceMethods
127 after_create :ferret_create
128 after_update :ferret_update
129 after_destroy :ferret_destroy
131 cattr_accessor :fields_for_ferret
132 cattr_accessor :configuration
133 cattr_accessor :ferret_configuration
135 @@fields_for_ferret = Hash.new
136 @@configuration = configuration
137 @@ferret_configuration = ferret_configuration
139 if configuration[:fields]
140 add_fields(configuration[:fields])
142 add_fields(self.new.attributes.keys.map { |k| k.to_sym })
143 add_fields(configuration[:additional_fields])
147 FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
151 configuration[:index_dir]
154 # rebuild the index from all data stored for this model.
155 # This is called automatically when no index exists yet.
157 # TODO: the automatic index initialization only works if
158 # every model class has its
159 # own index, otherwise the index will get populated only
160 # with instances from the first model loaded
162 # When calling this method manually, you can give any additional
163 # model classes that should also go into this index as parameters.
164 # Useful when using the :single_index option.
165 # Note that attributes named the same in different models will share
166 # the same field options in the shared index.
# Creates the index at ferret_configuration[:path] from the field definitions
# of the given models and then indexes their records batch by batch.
# models:: model classes whose fields_for_ferret and records go into the index
167 def rebuild_index(*models)
169 # default attributes for fields
170 fi = Ferret::Index::FieldInfos.new(:store => :no,
# the id is stored in the index and indexed as a single untokenized term
175 fi.add_field(:id, :store => :yes, :index => :untokenized)
177 if configuration[:store_class_name]
178 fi.add_field(:class_name, :store => :yes, :index => :untokenized)
180 # collect field options from all models
182 models.each do |model|
# Hash#update: for equally named fields the model processed later wins
183 fields.update(model.fields_for_ferret)
185 logger.debug("class #{self.name}: fields for index: #{fields.keys.join(',')}")
186 fields.each_pair do |field, options|
# per-field options override the :store / :index defaults given here
187 fi.add_field(field, { :store => :no,
188 :index => :yes }.update(options))
190 fi.create_index(ferret_configuration[:path])
# auto_flush is switched off on a copy, leaving ferret_configuration itself untouched
192 index = Ferret::Index::Index.new(ferret_configuration.dup.update(:auto_flush => false))
194 models.each do |model|
195 # index in batches of 1000 to limit memory consumption (fixes #24)
# NOTE(review): find is called with :limit/:offset but without :order, so the
# batches rely on the database returning rows in a stable order - confirm
197 0.step(model.count, batch_size) do |i|
198 model.find(:all, :limit => batch_size, :offset => i).each do |rec|
204 logger.debug("Created Ferret index in: #{class_index_dir}")
210 # Retrieve the Ferret::Index::Index instance for this model class.
212 # Index instances are stored in a hash, using the index directory
213 # as the key. So model classes sharing a single index will share their
216 ferret_indexes[class_index_dir] ||= create_index_instance
219 # creates a new Index::Index instance. Before that, a check is done
220 # to see if the index exists in the file system. If not, index rebuild
221 # from all model data retrieved by find(:all) is triggered.
# Returns a new Ferret::Index::Index built from ferret_configuration,
# rebuilding the index first when none is found on disk.
222 def create_index_instance
# a file named "segments" inside the index directory is taken as the sign
# that an index already exists there
223 rebuild_index unless File.file? "#{class_index_dir}/segments"
224 Ferret::Index::Index.new(ferret_configuration)
227 # Finds instances by contents. Terms are ANDed by default, can be circumvented
228 # by using OR between terms.
230 # offset:: first hit to retrieve (useful for paging)
231 # limit:: number of hits to retrieve, or :all to retrieve
232 # max_results results, which by default is 1000
233 # and can be changed in the call to acts_as_ferret
234 # or on demand like this:
235 # Model.configuration[:max_results] = 1000000
237 # find_options is a hash passed on to active_record's find when
238 # retrieving the data from db, useful to i.e. prefetch relationships.
240 # this method returns a SearchResults instance, which really is an Array that has
241 # been decorated with a total_hits accessor that delivers the total
242 # number of hits (including those not fetched because of a low num_docs
# Searches this model's index for +q+ and loads the matching records.
# q::            the query handed on to find_id_by_contents
# options::      search options handed on to find_id_by_contents
# find_options:: options merged into the ActiveRecord find call
# Returns a SearchResults built from the loaded records and the total hit count.
244 def find_by_contents(q, options = {}, find_options = {})
245 # handle shared index
246 return single_index_find_by_contents(q, options, find_options) if configuration[:single_index]
249 total_hits = find_id_by_contents(q, options) do |model, id, score|
251 # store index of this id for later ordering of results
252 id_positions[id] = id_array.size
255 # TODO: in case of STI AR will filter out hits from other
256 # classes for us, but this
257 # will lead to less results retrieved --> scoping of ferret query
258 # to self.class is still needed.
# restrict the database query to the ids returned by the index search
262 conditions = [ "#{self.table_name}.id in (?)", id_array ]
263 # combine our conditions with those given by user, if any
264 if find_options[:conditions]
# NOTE(review): this treats :conditions as an array of [sql, *bind_values]; a
# plain String does not respond to shift. The user's SQL is also appended
# without parentheses, so an OR inside it would change the meaning of the
# combined condition - confirm with callers.
265 cust_opts = find_options[:conditions].dup
266 conditions.first << " and " << cust_opts.shift
267 conditions.concat(cust_opts)
269 result = self.find(:all,
270 find_options.merge(:conditions => conditions))
273 logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
276 # order results as they were found by ferret, unless an AR :order
278 unless find_options[:order]
# id_positions maps each id to the position at which the search yielded it
279 result.sort! { |a, b| id_positions[a.id] <=> id_positions[b.id] }
282 logger.debug "Query: #{q}\nResult id_array: #{id_array.inspect},\nresult: #{result}"
283 return SearchResults.new(result, total_hits)
286 # determine all field names in the shared index
# Returns the field names of the shared index, without 'id' and 'class_name'.
# models:: model classes whose content column names serve as the fallback
#          when the index reader cannot report its field names
# NOTE(review): the result is memoized in @single_index_field_names, so a later
# call with a different +models+ list returns the first result - confirm that
# this is intended.
287 def single_index_field_names(models)
288 @single_index_field_names ||= (
289 searcher = Ferret::Search::Searcher.new(class_index_dir)
# get_field_names is probed with respond_to? and invoked through send
290 if searcher.reader.respond_to?(:get_field_names)
291 (searcher.reader.send(:get_field_names) - ['id', 'class_name']).to_a
294 unable to retrieve field names for class #{self.name}, please
295 consider naming all indexed fields in your call to acts_as_ferret!
297 models.map { |m| m.content_columns.map { |col| col.name } }.flatten
303 # TODO (continue here): check whether it is a ferret bug that we have to build
304 # the queries ourselves like this - is the query parser's downcasing the cause? -
305 # see whether it works with ferret now (content_cols) and ask dave for access to the qp, or
# find_by_contents variant for models that use the shared index.
# q::            query string or query object
# options::      search options; :models is either :all or a list of model
#                classes to restrict the search to (this class is always added)
# find_options:: options passed (as a copy) to every per-record find call
# Returns a SearchResults built from the loaded records and the total hit count.
# NOTE(review): options[:models] is modified in place, so an array passed by
# the caller gets this class appended - confirm callers do not rely on it
# staying unchanged.
307 def single_index_find_by_contents(q, options = {}, find_options = {})
310 unless options[:models] == :all # search needs to be restricted by one or more class names
311 options[:models] ||= []
312 # add this class to the list of given models
313 options[:models] << self unless options[:models].include?(self)
314 # keep original query
317 # work around ferret bug in #process_query (doesn't ensure the
319 ferret_index.synchronize do
320 ferret_index.send(:ensure_reader_open)
321 original_query = ferret_index.process_query(q)
322 end if q.is_a? String
# the original query must match AND at least one of the class_name terms must match
324 q = Ferret::Search::BooleanQuery.new
325 q.add_query(original_query, :must)
326 model_query = Ferret::Search::BooleanQuery.new
327 options[:models].each do |model|
328 model_query.add_query(Ferret::Search::TermQuery.new(:class_name, model.name), :should)
330 q.add_query(model_query, :must)
# every hit is loaded with its own find call on the class named in the hit
334 total_hits = find_id_by_contents(q, options) do |model, id, score|
335 result << Object.const_get(model).find(id, find_options.dup)
337 return SearchResults.new(result, total_hits)
339 protected :single_index_find_by_contents
341 # Finds instance model name, ids and scores by contents.
342 # Useful if you want to search across models
343 # Terms are ANDed by default, can be circumvented by using OR between terms.
345 # Example controller code (not tested):
346 # def multi_search(query)
348 # result << (Model1.find_id_by_contents query)
349 # result << (Model2.find_id_by_contents query)
350 # result << (Model3.find_id_by_contents query)
352 # result.sort! {|element| element[:score]}
353 # # Figure out for yourself how to retrieve and present the data from modelname and id
356 # Note that the scores retrieved this way aren't normalized across
357 # indexes, so that the order of results after sorting by score will
358 # differ from the order you would get when running the same query
359 # on a single index containing all the data from Model1, Model2
363 # :first_doc - first hit to retrieve (useful for paging)
364 # :num_docs - number of hits to retrieve, or :all to retrieve
365 # max_results results, which by default is 1000 and can be changed in
366 # the call to acts_as_ferret or on demand like this:
367 # Model.configuration[:max_results] = 1000000
369 # a block can be given too, it will be executed with every result:
370 # find_id_by_contents(q, options) do |model, id, score|
372 # scores_by_id[id] = score
374 # NOTE: in case a block is given, the total_hits value will be returned
375 # instead of the result list!
# Runs +q+ against this model's index through search_each.
# With a block: yields model name, integer id and score for every hit and
# returns the total hit count. Without a block: returns the collected array of
# { :model, :id, :score } hashes.
# NOTE(review): the yielded id is converted with to_i, while the id stored in
# the result hashes is left exactly as read from the index document - confirm
# that this asymmetry is intended.
377 def find_id_by_contents(q, options = {})
378 deprecated_options_support(options)
# :all is replaced by the configured :max_results before searching
379 options[:limit] = configuration[:max_results] if options[:limit] == :all
382 index = self.ferret_index
383 #hits = index.search(q, options)
384 #hits.each do |hit, score|
385 total_hits = index.search_each(q, options) do |hit, score|
386 # only collect result data if we intend to return it
# the class name is read from the index document only when store_class_name
# is set; otherwise this class's own name is used
388 model = configuration[:store_class_name] ? doc[:class_name] : self.name
390 yield model, doc[:id].to_i, score
392 result << { :model => model, :id => doc[:id], :score => score }
395 logger.debug "id_score_model array: #{result.inspect}"
396 return block_given? ? total_hits : result
399 # requires the store_class_name option of acts_as_ferret to be true
400 # for all models queried this way.
402 # TODO: not optimal as each instance is fetched in a db call for its
# Searches this model together with +additional_models+ and loads every hit.
# query::             query handed on to id_multi_search
# additional_models:: further model classes to search besides this one
# options::           search options handed on to id_multi_search
# Returns a SearchResults built from the loaded records and the total hit count.
404 def multi_search(query, additional_models = [], options = {})
406 total_hits = id_multi_search(query, additional_models, options) do |model, id, score|
# one find call per hit, issued on the class whose name was yielded for the hit
407 result << Object.const_get(model).find(id)
409 SearchResults.new(result, total_hits)
412 # returns an array of hashes, each containing :class_name,
413 # :id and :score for a hit.
415 # if a block is given, class_name, id and score of each hit will
416 # be yielded, and the total number of hits is returned.
418 def id_multi_search(query, additional_models = [], options = {})
419 deprecated_options_support(options)
420 # TODO remove this, ferret supports :all by itself now
421 options[:limit] = configuration[:max_results] if options[:limit] == :all
422 additional_models << self
423 searcher = multi_index(additional_models)
425 total_hits = searcher.search_each (query, options) do |hit, score|
428 yield doc[:class_name], doc[:id].to_i, score
430 result << { :model => doc[:class_name], :id => doc[:id], :score => score }
433 return block_given? ? total_hits : result
436 # returns a MultiIndex instance operating on a MultiReader
437 def multi_index(model_classes)
438 model_classes.sort! { |a, b| a.name <=> b.name }
439 key = model_classes.inject("") { |s, clazz| s << clazz.name }
440 @@multi_indexes[key] ||= MultiIndex.new(model_classes, ferret_configuration)
# Maps the deprecated search options onto their replacements, in place:
# :num_docs becomes :limit and :first_doc becomes :offset. A warning is logged
# for each deprecated key that is present.
# options:: the search options hash; modified by this method
443 def deprecated_options_support(options)
444 if options[:num_docs]
445 logger.warn ":num_docs is deprecated, use :limit instead!"
# ||= keeps an explicitly given :limit and only fills it in when absent
446 options[:limit] ||= options[:num_docs]
448 if options[:first_doc]
449 logger.warn ":first_doc is deprecated, use :offset instead!"
# likewise, an explicitly given :offset takes precedence
450 options[:offset] ||= options[:first_doc]