lib/class_methods.rb

   1 module FerretMixin
   2   module Acts #:nodoc:
   3     module ARFerret #:nodoc:
   4
   5       # declare the class level helper methods
   6       # which will load the relevant instance methods defined below when invoked
   7       module ClassMethods
   8
   9         # helper that defines a method that adds the given field to a lucene
  10         # document instance
  11         def define_to_field_method(field, options = {})
  12           options = {
  13             :store => :no,
  14             :highlight => :yes,
  15             :index => :yes,
  16             :term_vector => :with_positions_offsets,
  17             :boost => 1.0 }.update(options)
  18           fields_for_ferret[field] = options
  19           define_method("#{field}_to_ferret".to_sym) do
  20             begin
  21               val = content_for_field_name(field)
  22             rescue
  23               logger.warn("Error retrieving value for field #{field}: #{$!}")
  24               val = ''
  25             end
  26             logger.debug("Adding field #{field} with value '#{val}' to index")
  27             val
  28           end
  29         end
  30
  31         def add_fields(field_config)
  32           if field_config.respond_to?(:each_pair)
  33             field_config.each_pair do |key,val|
  34               define_to_field_method(key,val)
  35             end
  36           elsif field_config.respond_to?(:each)
  37             field_config.each do |field|
  38               define_to_field_method(field)
  39             end
  40           end
  41         end
  42
  43         def reloadable?; false end
  44
  45         @@ferret_indexes = Hash.new
  46         def ferret_indexes; @@ferret_indexes end
  47
  48         @@multi_indexes = Hash.new
  49         def multi_indexes; @@multi_indexes end
  50
  51         # declares a class as ferret-searchable.
  52         #
  53         # options are:
  54         #
  55         # fields:: names all fields to include in the index. If not given,
  56         #   all attributes of the class will be indexed. You may also give
  57         #   symbols pointing to instance methods of your model here, i.e.
  58         #   to retrieve and index data from a related model.
  59         #
  60         # additional_fields:: names fields to include in the index, in addition
  61         #   to those derived from the db scheme. use if you want to add
  62         #   custom fields derived from methods to the db fields (which will be picked
  63         #   by aaf). This option will be ignored when the fields option is given, in
  64         #   that case additional fields get specified there.
  65         #
  66         # index_dir:: declares the directory where to put the index for this class.
  67         #   The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
  68         #   The index directory will be created if it doesn't exist.
  69         #
  70         # single_index:: set this to true to let this class use a Ferret
  71         # index that is shared by all classes having :single_index set to true.
  72         # :store_class_name is set to true implicitly, as well as index_dir, so
  73         # don't bother setting these when using this option. the shared index
  74         # will be located in index/<RAILS_ENV>/shared .
  75         #
  76         # store_class_name:: to make search across multiple models useful, set
  77         # this to true. the model class name will be stored in a keyword field
  78         # named class_name
  79         #
  80         # max_results:: number of results to retrieve for :limit => :all,
  81         # default value is 1000
  82         #
  83         # ferret_options may be:
  84         # or_default:: - whether query terms are required by
  85         #   default (the default, false), or not (true)
  86         #
  87         # analyzer:: the analyzer to use for query parsing (default: nil,
  88         #   which means the ferret StandardAnalyzer gets used)
  89         #
        def acts_as_ferret(options={}, ferret_options={})
          # plugin-level defaults; entries given in +options+ override them below
          configuration = {
            :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
            :store_class_name => false,
            :single_index => false,
            :max_results => 1000
          }
          # defaults for the Ferret index; entries given in +ferret_options+
          # override them below
          ferret_configuration = {
            :or_default => false,
            :handle_parser_errors => true
            #:max_clauses => 512,
            #:default_field => '*',
            #:analyzer => Ferret::Analysis::StandardAnalyzer.new,
            # :wild_card_downcase => true
          }
          # an +options+ argument that is not a Hash is silently ignored
          configuration.update(options) if options.is_a?(Hash)

          # apply appropriate settings for shared index
          if configuration[:single_index]
            configuration[:index_dir] = "#{FerretMixin::Acts::ARFerret::index_dir}/shared"
            configuration[:store_class_name] = true
          end
          # a +ferret_options+ argument that is not a Hash is silently ignored
          ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
          # these properties are somewhat vital to the plugin and shouldn't
          # be overwritten by the user:
          ferret_configuration.update(

            :key               => (configuration[:single_index] ? [:id, :class_name] : :id),
            :path              => configuration[:index_dir],
            :auto_flush        => true,
            :create_if_missing => true
          )

          # The string below is evaluated in the context of the calling class: it
          # mixes in the instance methods, registers the index maintenance
          # callbacks, stores both configuration hashes in class-level accessors
          # and registers the fields to index.
          # NOTE(review): the evaluated code refers to `configuration` and
          # `ferret_configuration` by name; presumably these resolve to the local
          # variables of this method - keep the names in sync when editing.
          class_eval <<-EOV
              include FerretMixin::Acts::ARFerret::InstanceMethods


              after_create :ferret_create
              after_update :ferret_update
              after_destroy :ferret_destroy

              cattr_accessor :fields_for_ferret
              cattr_accessor :configuration
              cattr_accessor :ferret_configuration

              @@fields_for_ferret = Hash.new
              @@configuration = configuration
              @@ferret_configuration = ferret_configuration

              if configuration[:fields]
                add_fields(configuration[:fields])
              else
                add_fields(self.new.attributes.keys.map { |k| k.to_sym })
                add_fields(configuration[:additional_fields])
              end

            EOV
          # done last, once the final index directory (own or shared) is known
          FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
        end
 149
 150         def class_index_dir
 151           configuration[:index_dir]
 152         end
 153
 154         # rebuild the index from all data stored for this model.
 155         # This is called automatically when no index exists yet.
 156         #
 157         # TODO: the automatic index initialization only works if
 158         # every model class has its
 159         # own index, otherwise the index will get populated only
 160         # with instances from the first model loaded
 161         #
 162         # When calling this method manually, you can give any additional
 163         # model classes that should also go into this index as parameters.
 164         # Useful when using the :single_index option.
 165         # Note that attributes named the same in different models will share
 166         # the same field options in the shared index.
 167         def rebuild_index(*models)
 168           models << self
 169           # default attributes for fields
 170           fi = Ferret::Index::FieldInfos.new(:store => :no,
 171                                              :index => :yes,
 172                                              :term_vector => :no,
 173                                              :boost => 1.0)
 174           # primary key
 175           fi.add_field(:id, :store => :yes, :index => :untokenized)
 176           # class_name
 177           if configuration[:store_class_name]
 178             fi.add_field(:class_name, :store => :yes, :index => :untokenized)
 179           end
 180           # collect field options from all models
 181           fields = {}
 182           models.each do |model|
 183             fields.update(model.fields_for_ferret)
 184           end
 185           logger.debug("class #{self.name}: fields for index: #{fields.keys.join(',')}")
 186           fields.each_pair do |field, options|
 187             fi.add_field(field, { :store => :no,
 188                                   :index => :yes }.update(options))
 189           end
 190           fi.create_index(ferret_configuration[:path])
 191
 192           index = Ferret::Index::Index.new(ferret_configuration.dup.update(:auto_flush => false))
 193           batch_size = 1000
 194           models.each do |model|
 195             # index in batches of 1000 to limit memory consumption (fixes #24)
 196             model.transaction do
 197               0.step(model.count, batch_size) do |i|
 198                 model.find(:all, :limit => batch_size, :offset => i).each do |rec|
 199                   index << rec.to_doc
 200                 end
 201               end
 202             end
 203           end
 204           logger.debug("Created Ferret index in: #{class_index_dir}")
 205           index.flush
 206           index.optimize
 207           index.close
 208         end
 209
 210         # Retrieve the Ferret::Index::Index instance for this model class.
 211         #
 212         # Index instances are stored in a hash, using the index directory
 213         # as the key. So model classes sharing a single index will share their
 214         # Index object, too.
 215         def ferret_index
 216           ferret_indexes[class_index_dir] ||= create_index_instance
 217         end
 218
 219         # creates a new Index::Index instance. Before that, a check is done
 220         # to see if the index exists in the file system. If not, index rebuild
 221         # from all model data retrieved by find(:all) is triggered.
 222         def create_index_instance
 223           rebuild_index unless File.file? "#{class_index_dir}/segments"
 224           Ferret::Index::Index.new(ferret_configuration)
 225         end
 226
 227         # Finds instances by contents. Terms are ANDed by default, can be circumvented
 228         # by using OR between terms.
 229         # options:
 230         # offset::      first hit to retrieve (useful for paging)
 231         # limit::       number of hits to retrieve, or :all to retrieve
 232         #               max_results results, which by default is 1000
 233         #               and can be changed in the call to acts_as_ferret
 234         #               or on demand like this:
 235         #               Model.configuration[:max_results] = 1000000
 236         #
 237         # find_options is a hash passed on to active_record's find when
 238         # retrieving the data from db, useful to i.e. prefetch relationships.
 239         #
 240         # this method returns a SearchResults instance, which really is an Array that has
 241         # been decorated with a total_hits accessor that delivers the total
 242         # number of hits (including those not fetched because of a low limit
 243         # value).
 244         def find_by_contents(q, options = {}, find_options = {})
 245           # handle shared index
 246           return single_index_find_by_contents(q, options, find_options) if configuration[:single_index]
 247           id_array = []
 248           id_positions = {}
 249           total_hits = find_id_by_contents(q, options) do |model, id, score|
 250             id_array << id
 251             # store index of this id for later ordering of results
 252             id_positions[id] = id_array.size
 253           end
 254           begin
 255             # TODO: in case of STI AR will filter out hits from other
 256             # classes for us, but this
 257             # will lead to less results retrieved --> scoping of ferret query
 258             # to self.class is still needed.
 259             if id_array.empty?
 260               result = []
 261             else
 262               conditions = [ "#{self.table_name}.id in (?)", id_array ]
 263               # combine our conditions with those given by user, if any
 264               if find_options[:conditions]
 265                 cust_opts = find_options[:conditions].dup
 266                 conditions.first << " and " << cust_opts.shift
 267                 conditions.concat(cust_opts)
 268               end
 269               result = self.find(:all,
 270                                  find_options.merge(:conditions => conditions))
 271             end
 272           rescue
 273             logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
 274           end
 275
 276           # order results as they were found by ferret, unless an AR :order
 277           # option was given
 278           unless find_options[:order]
 279             result.sort! { |a, b| id_positions[a.id] <=> id_positions[b.id] }
 280           end
 281
 282           logger.debug "Query: #{q}\nResult id_array: #{id_array.inspect},\nresult: #{result}"
 283           return SearchResults.new(result, total_hits)
 284         end
 285
 286         # determine all field names in the shared index
 287         def single_index_field_names(models)
 288           @single_index_field_names ||= (
 289               searcher = Ferret::Search::Searcher.new(class_index_dir)
 290               if searcher.reader.respond_to?(:get_field_names)
 291                 (searcher.reader.send(:get_field_names) - ['id', 'class_name']).to_a
 292               else
 293                 puts <<-END
 294   unable to retrieve field names for class #{self.name}, please
 295   consider naming all indexed fields in your call to acts_as_ferret!
 296                 END
 297                 models.map { |m| m.content_columns.map { |col| col.name } }.flatten
 298               end
 299           )
 300
 301         end
 302
 303         # TODO: check whether it is a ferret bug that forces us to build the
 304         # queries ourselves like this - is the query parser's downcasing the
 305         # cause? - see if it works with ferret now (content_cols) and ask dave
 306         # for access to the query parser, or to the reader
 307         def single_index_find_by_contents(q, options = {}, find_options = {})
 308           result = []
 309
 310           unless options[:models] == :all # search needs to be restricted by one or more class names
 311             options[:models] ||= []
 312             # add this class to the list of given models
 313             options[:models] << self unless options[:models].include?(self)
 314             # keep original query
 315             original_query = q
 316
 317             # work around ferret bug in #process_query (doesn't ensure the
 318             # reader is open)
 319             ferret_index.synchronize do
 320               ferret_index.send(:ensure_reader_open)
 321               original_query = ferret_index.process_query(q)
 322             end if q.is_a? String
 323
 324             q = Ferret::Search::BooleanQuery.new
 325             q.add_query(original_query, :must)
 326             model_query = Ferret::Search::BooleanQuery.new
 327             options[:models].each do |model|
 328               model_query.add_query(Ferret::Search::TermQuery.new(:class_name, model.name), :should)
 329             end
 330             q.add_query(model_query, :must)
 331             #end
 332           end
 333           #puts q.to_s
 334           total_hits = find_id_by_contents(q, options) do |model, id, score|
 335             result << Object.const_get(model).find(id, find_options.dup)
 336           end
 337           return SearchResults.new(result, total_hits)
 338         end
        # restrict visibility of the shared-index search helper; find_by_contents
        # calls it (with an implicit receiver) when :single_index is configured
        protected :single_index_find_by_contents
 340
 341         # Finds instance model name, ids and scores by contents.
 342         # Useful if you want to search across models
 343         # Terms are ANDed by default, can be circumvented by using OR between terms.
 344         #
 345         # Example controller code (not tested):
 346         # def multi_search(query)
 347         #   result = []
 348         #   result << (Model1.find_id_by_contents query)
 349         #   result << (Model2.find_id_by_contents query)
 350         #   result << (Model3.find_id_by_contents query)
 351         #   result.flatten!
 352         #   result.sort! {|element| element[:score]}
 353         #   # Figure out for yourself how to retrieve and present the data from modelname and id
 354         # end
 355         #
 356         # Note that the scores retrieved this way aren't normalized across
 357         # indexes, so that the order of results after sorting by score will
 358         # differ from the order you would get when running the same query
 359         # on a single index containing all the data from Model1, Model2
 360         # and Model3.
 361         #
 362         # options:
 363         # :offset - first hit to retrieve (useful for paging; :first_doc is deprecated)
 364         # :limit - number of hits to retrieve (:num_docs is deprecated), or :all to retrieve
 365         # max_results results, which by default is 1000 and can be changed in
 366         # the call to acts_as_ferret or on demand like this:
 367         # Model.configuration[:max_results] = 1000000
 368         #
 369         # a block can be given too, it will be executed with every result:
 370         # find_id_by_contents(q, options) do |model, id, score|
 371         #    id_array << id
 372         #    scores_by_id[id] = score
 373         # end
 374         # NOTE: in case a block is given, the total_hits value will be returned
 375         # instead of the result list!
 376         #
 377         def find_id_by_contents(q, options = {})
 378           deprecated_options_support(options)
 379           options[:limit] = configuration[:max_results] if options[:limit] == :all
 380
 381           result = []
 382           index = self.ferret_index
 383           #hits = index.search(q, options)
 384           #hits.each do |hit, score|
 385           total_hits = index.search_each(q, options) do |hit, score|
 386             # only collect result data if we intend to return it
 387             doc = index[hit]
 388             model = configuration[:store_class_name] ? doc[:class_name] : self.name
 389             if block_given?
 390               yield model, doc[:id].to_i, score
 391             else
 392               result << { :model => model, :id => doc[:id], :score => score }
 393             end
 394           end
 395           logger.debug "id_score_model array: #{result.inspect}"
 396           return block_given? ? total_hits : result
 397         end
 398
 399         # requires the store_class_name option of acts_as_ferret to be true
 400         # for all models queried this way.
 401         #
 402         # TODO: not optimal as each instance is fetched in a db call of its
 403         # own.
 404         def multi_search(query, additional_models = [], options = {})
 405           result = []
 406           total_hits = id_multi_search(query, additional_models, options) do |model, id, score|
 407             result << Object.const_get(model).find(id)
 408           end
 409           SearchResults.new(result, total_hits)
 410         end
 411
 412         # returns an array of hashes, each containing :class_name,
 413         # :id and :score for a hit.
 414         #
 415         # if a block is given, class_name, id and score of each hit will
 416         # be yielded, and the total number of hits is returned.
 417         #
 418         def id_multi_search(query, additional_models = [], options = {})
 419           deprecated_options_support(options)
 420           # TODO remove this, ferret supports :all by itself now
 421           options[:limit] = configuration[:max_results] if options[:limit] == :all
 422           additional_models << self
 423           searcher = multi_index(additional_models)
 424           result = []
 425           total_hits = searcher.search_each (query, options) do |hit, score|
 426             doc = searcher[hit]
 427             if block_given?
 428               yield doc[:class_name], doc[:id].to_i, score
 429             else
 430               result << { :model => doc[:class_name], :id => doc[:id], :score => score }
 431             end
 432           end
 433           return block_given? ? total_hits : result
 434         end
 435
 436         # returns a MultiIndex instance operating on a MultiReader
 437         def multi_index(model_classes)
 438           model_classes.sort! { |a, b| a.name <=> b.name }
 439           key = model_classes.inject("") { |s, clazz| s << clazz.name }
 440           @@multi_indexes[key] ||= MultiIndex.new(model_classes, ferret_configuration)
 441         end
 442
 443         def deprecated_options_support(options)
 444           if options[:num_docs]
 445             logger.warn ":num_docs is deprecated, use :limit instead!"
 446             options[:limit] ||= options[:num_docs]
 447           end
 448           if options[:first_doc]
 449             logger.warn ":first_doc is deprecated, use :offset instead!"
 450             options[:offset] ||= options[:first_doc]
 451           end
 452         end
 453
 454       end
 455
 456     end
 457   end
 458 end
 459