lib/acts_as_ferret.rb

   1 # Copyright (c) 2006 Kasper Weibel Nielsen-Refs, Thomas Lockney, Jens Krämer
   2 #
   3 # Permission is hereby granted, free of charge, to any person obtaining a copy
   4 # of this software and associated documentation files (the "Software"), to deal
   5 # in the Software without restriction, including without limitation the rights
   6 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7 # copies of the Software, and to permit persons to whom the Software is
   8 # furnished to do so, subject to the following conditions:
   9 #
  10 # The above copyright notice and this permission notice shall be included in all
  11 # copies or substantial portions of the Software.
  12 #
  13 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19 # SOFTWARE.
  20
require 'fileutils'

require 'active_record'
require 'ferret'
  23
  24 # Yet another Ferret Mixin.
  25 #
  26 # This mixin adds full text search capabilities to any Rails model.
  27 #
  28 # It is heavily based on the original acts_as_ferret plugin done by
  29 # Kasper Weibel and a modified version done by Thomas Lockney, which
  30 # both can be found on
  31 # http://ferret.davebalmain.com/trac/wiki/FerretOnRails
  32 #
  33 # Changes I did to the original version include:
  34 #
  35 # - automatic creation of missing index directories
  36 # - I took out the storage of class names in the index, as I prefer
  37 #   the 'one model, one index'-approach. If needed, multiple models
  38 #   can share one index by using a common superclass for these.
  39 # - separate index directories for different Rails environments, so
  40 #   unit tests don't mess up the production/development indexes.
  41 # - default to AND queries, as this is the behaviour most users expect
  42 # - index searcher instances are kept as class variables and will be re-used
  43 #   until an index change is detected, as opening a searcher is quite expensive
  44 #   this should improve search performance
  45 # - query parser is kept as a class variable
  46 #
  47 # usage:
  48 # include the following in your model class (specifying the fields you want to get indexed):
  49 # acts_as_ferret :fields => [ 'title', 'description' ]
  50 #
  51 # now you can use ModelClass.find_by_contents(query) to find instances of your model
  52 # whose indexed fields match a given query. All query terms are required by default, but
  53 # explicit OR queries are possible. This differs from the ferret default, but imho is the more
  54 # often needed/expected behaviour (more query terms result in less results).
  55 #
  56 # Released under the MIT license.
  57 #
  58 # Authors:
  59 # Kasper Weibel Nielsen-Refs (original author)
  60 # Jens Kraemer <jk@jkraemer.net>
  61 #
  62 module FerretMixin
  63   module Acts #:nodoc:
  64     module ARFerret #:nodoc:
  65
  66       def self.ensure_directory(dir)
  67         Dir.mkdir dir unless File.directory? dir
  68       end
  69
  70       # make sure the default index base dir exists. by default, all indexes are created
  71       # under RAILS_ROOT/index/RAILS_ENV
  72       def self.init_index_basedir
  73         index_base = "#{RAILS_ROOT}/index"
  74         ensure_directory index_base
  75         @@index_dir = "#{index_base}/#{RAILS_ENV}"
  76         ensure_directory @@index_dir
  77       end
  78
      # Module-level accessor for @@index_dir; acts_as_ferret reads it as
      # FerretMixin::Acts::ARFerret::index_dir to build the default index path.
      # (mattr_accessor is not defined in this file - presumably it comes
      # from ActiveSupport.)
      mattr_accessor :index_dir
      # Runs while this module body is evaluated, so the base directories
      # exist (and @@index_dir is set) before any model calls acts_as_ferret.
      init_index_basedir
  81
  82       def self.append_features(base)
  83         super
  84         base.extend(ClassMethods)
  85       end
  86
  87       # declare the class level helper methods
  88       # which will load the relevant instance methods defined below when invoked
  89       module ClassMethods
  90         include Ferret
  91
  92         # helper that defines a method that adds the given field to a lucene
  93         # document instance
  94         def define_to_field_method(field, options = {})
  95           default_opts = { :store => Ferret::Document::Field::Store::NO,
  96             :index => Ferret::Document::Field::Index::TOKENIZED,
  97             :term_vector => Ferret::Document::Field::TermVector::NO,
  98             :binary => false,
  99             :boost => 1.0
 100           }
 101           default_opts.update(options) if options.is_a?(Hash)
 102           fields_for_ferret << field
 103           define_method("#{field}_to_ferret".to_sym) do
 104             begin
 105               val = self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.method(field).call
 106             rescue
 107               logger.debug("Error retrieving value for field #{field}: #{$!}")
 108               val = ''
 109             end
 110             logger.debug("Adding field #{field} with value '#{val}' to index")
 111             Ferret::Document::Field.new(field.to_s, val,
 112                                         default_opts[:store],
 113                                         default_opts[:index],
 114                                         default_opts[:term_vector],
 115                                         default_opts[:binary],
 116                                         default_opts[:boost])
 117           end
 118         end
 119
 120         # TODO: do we need to define this at this level ? Maybe it's
 121         # sufficient to do this only in classes calling acts_as_ferret ?
 122         def reloadable?; false end
 123
 124         @@ferret_indexes = Hash.new
 125         def ferret_indexes; @@ferret_indexes end
 126
 127         @@multi_indexes = Hash.new
 128         def multi_indexes; @@multi_indexes end
 129
 130         # declares a class as ferret-searchable.
 131         #
 132         # options are:
 133         #
 134         # fields:: names all fields to include in the index. If not given,
 135         #   all attributes of the class will be indexed. You may also give
 136         #   symbols pointing to instance methods of your model here, i.e.
 137         #   to retrieve and index data from a related model.
 138         #
 139         # index_dir:: declares the directory where to put the index for this class.
 140         #   The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
 141         #   The index directory will be created if it doesn't exist.
 142         #
 143         # store_class_name:: to make search across multiple models useful, set
 144         # this to true. the model class name will be stored in a keyword field
 145         # named class_name
 146         #
 147         # ferret_options may be:
 148         # occur_default:: - whether query terms are required by
 149         #   default (the default), or not. Specify one of
 150         #   Ferret::Search::BooleanClause::Occur::MUST or
 151         #   Ferret::Search::BooleanClause::Occur::SHOULD
 152         #
 153         # analyzer:: the analyzer to use for query parsing (default: nil,
 154         #   which means the ferret default Analyzer gets used)
 155         #
        # options::        Hash merged over the defaults for :fields, :index_dir
        #                  and :store_class_name; ignored if it is not a Hash.
        # ferret_options:: Hash merged over the Ferret defaults below; :key,
        #                  :path, :auto_flush and :create_if_missing are always
        #                  overwritten afterwards.
        #
        # Besides storing both hashes on the calling class, this includes
        # InstanceMethods, registers the ferret_* callbacks and makes sure the
        # index directory exists.
        def acts_as_ferret(options={}, ferret_options={})
          configuration = {
            :fields => nil,
            :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name}",
            :store_class_name => false
          }
          ferret_configuration = {
            :occur_default => Search::BooleanClause::Occur::MUST,
            :handle_parse_errors => true,
            :default_search_field => '*',
            # :analyzer => Analysis::StandardAnalyzer.new,
            # :wild_lower => true
          }
          configuration.update(options) if options.is_a?(Hash)
          ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
          # these properties are somewhat vital to the plugin and shouldn't
          # be overwritten by the user:
          ferret_configuration.update(
                                      :key               => 'id',
          :path              => configuration[:index_dir],
          :auto_flush        => true,
          :create_if_missing => true
          )

          # NOTE: the code below is evaluated from a string and refers to the
          # locals +configuration+ and +ferret_configuration+ by name, so those
          # two variables must keep their names. A Hash given as :fields maps
          # field names to per-field options, any other enumerable is a plain
          # list of field names; without :fields, fields_for_ferret is reset
          # to nil, which makes to_doc index every attribute.
          class_eval <<-EOV
              include FerretMixin::Acts::ARFerret::InstanceMethods

              before_create :ferret_before_create
              before_update :ferret_before_update
              after_create :ferret_create
              after_update :ferret_update
              after_destroy :ferret_destroy

              cattr_accessor :fields_for_ferret
              cattr_accessor :configuration
              cattr_accessor :ferret_configuration

              @@fields_for_ferret = Array.new
              @@configuration = configuration
              @@ferret_configuration = ferret_configuration

              if configuration[:fields].respond_to?(:each_pair)
                configuration[:fields].each_pair do |key,val|
                  define_to_field_method(key,val)
                end
              elsif configuration[:fields].respond_to?(:each)
                configuration[:fields].each do |field|
                  define_to_field_method(field)
                end
              else
                @@fields_for_ferret = nil
              end
            EOV
          # create the (possibly user supplied) index directory right away
          FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
        end
 211
 212         def class_index_dir
 213           configuration[:index_dir]
 214         end
 215
 216         # rebuild the index from all data stored for this model.
 217         # This is called automatically when no index exists yet.
 218         #
 219         # TODO: the automatic index initialization only works if
 220         # every model class has its
 221         # own index, otherwise the index will get populated only
 222         # with instances from the first model loaded
 223         def rebuild_index
 224           index = Index::Index.new(ferret_configuration.merge(:create => true))
 225           self.find_all.each { |content| index << content.to_doc }
 226           logger.debug("Created Ferret index in: #{class_index_dir}")
 227           index.flush
 228           index.optimize
 229           index.close
 230         end
 231
 232         # Retrieve the Ferret::Index::Index instance for this model class.
 233         #
 234         # Index instances are stored in a hash, using the index directory
 235         # as the key. So model classes sharing a single index will share their
 236         # Index object, too.
 237         def ferret_index
 238           ferret_indexes[class_index_dir] ||= create_index_instance
 239         end
 240
 241         # creates a new Index::Index instance. Before that, a check is done
 242         # to see if the index exists in the file system. If not, index rebuild
 243         # from all model data retrieved by find(:all) is triggered.
 244         def create_index_instance
 245           rebuild_index unless File.file? "#{class_index_dir}/segments"
 246           Index::Index.new(ferret_configuration)
 247         end
 248
 249         # Finds instances by contents. Terms are ANDed by default, can be circumvented
 250         # by using OR between terms.
 251         # options:
 252         # :first_doc - first hit to retrieve (useful for paging)
 253         # :num_docs - number of hits to retrieve
 254         def find_by_contents(q, options = {})
 255           id_array = []
 256           find_id_by_contents(q, options).each do |element|
 257             id_array << element[:id]
 258           end
 259           logger.debug "id_array: #{id_array.inspect}"
 260           begin
 261             if self.superclass == ActiveRecord::Base
 262               result = self.find(id_array)
 263             else
 264               # no direct subclass of Base --> STI
 265               # TODO: AR will filter out hits from other classes for us, but this
 266               # will lead to less results retrieved --> scoping of ferret query
 267               # to self.class is still needed.
 268               result = self.find(:all, :conditions => ["id in (?)",id_array])
 269             end
 270           rescue
 271             logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
 272           end
 273           logger.debug "Result id_array: #{id_array.inspect}, result: #{result}"
 274           return result
 275         end
 276
 277         # Finds instance model name, ids and scores by contents.
 278         # Useful if you want to search across models
 279         # Terms are ANDed by default, can be circumvented by using OR between terms.
 280         #
 281         # Example controller code (not tested):
 282         # def multi_search(query)
 283         #   result = []
 284         #   result << (Model1.find_id_by_contents query)
 285         #   result << (Model2.find_id_by_contents query)
 286         #   result << (Model3.find_id_by_contents query)
 287         #   result.flatten!
 288         #   result.sort! {|element| element[:score]}
 289         #   # Figure out for yourself how to retrieve and present the data from modelname and id
 290         # end
 291         #
 292         # Note that the scores retrieved this way aren't normalized across
 293         # indexes, so that the order of results after sorting by score will
 294         # differ from the order you would get when running the same query
 295         # on a single index containing all the data from Model1, Model2
 296         # and Model3.
 297         #
 298         # options:
 299         # :first_doc - first hit to retrieve (useful for paging)
 300         # :num_docs - number of hits to retrieve
 301         def find_id_by_contents(q, options = {})
 302           result = []
 303           hits = ferret_index.search(q, options)
 304           hits.each do |hit, score|
 305             result << {:model => self.name, :id => ferret_index[hit][:id], :score => score}
 306           end
 307           logger.debug "id_score_model array: #{result.inspect}"
 308           result
 309         end
 310
 311         # requires the store_class_name option of acts_as_ferret to be true
 312         # for all models queried this way.
 313         #
 314         # TODO: not optimal as each instance is fetched in a db call of its
 315         # own.
 316         def multi_search(query, additional_models = [], options = {})
 317           result = []
 318           id_multi_search(query, additional_models, options).each { |hit|
 319             result << Object.const_get(hit[:model]).find(hit[:id].to_i)
 320           }
 321           result
 322         end
 323
 324         # returns an array of hashes, each containing :model,
 325         # :id and :score for a hit.
 326         #
 327         def id_multi_search(query, additional_models = [], options = {})
 328           additional_models << self
 329           searcher = multi_index(additional_models)
 330           result = []
 331           hits = searcher.search(query, options)
 332           hits.each { |hit, score|
 333             doc = searcher.doc(hit)
 334             result << { :model => doc[:class_name], :id => doc[:id], :score => score }
 335           }
 336           result
 337         end
 338
 339         # returns a MultiIndex instance operating on a MultiReader
 340         def multi_index(model_classes)
 341           model_classes.sort! { |a, b| a.name <=> b.name }
 342           key = model_classes.inject("") { |s, clazz| s << clazz.name }
 343           @@multi_indexes[key] ||= MultiIndex.new(model_classes, ferret_configuration)
 344         end
 345
 346       end
 347
 348
 349       # not threadsafe
 350       class MultiIndex
 351         include Ferret
 352
 353         attr_reader :reader
 354
 355         # todo: check for necessary index rebuilds in this place, too
 356         # idea - each class gets a create_reader method that does this
 357         def initialize(model_classes, options = {})
 358           @model_classes = model_classes
 359           @options = {
 360             :default_search_field => '*',
 361             :analyzer => Analysis::WhiteSpaceAnalyzer.new
 362           }.update(options)
 363           ensure_reader
 364         end
 365
 366         def search(query, options={})
 367           query = process_query(query)
 368           searcher.search(query, options)
 369         end
 370
 371         def ensure_reader
 372           create_new_multi_reader unless @reader
 373           unless @reader.latest?
 374             if @searcher
 375               @searcher.close # will close the multi_reader and all sub_readers as well
 376             else
 377               @reader.close # just close the reader
 378             end
 379             create_new_multi_reader
 380             @searcher = nil
 381           end
 382         end
 383
 384         def searcher
 385           ensure_reader
 386           @searcher ||= Search::IndexSearcher.new(@reader)
 387         end
 388
 389         def doc(i)
 390           searcher.doc(i)
 391         end
 392
 393         def query_parser
 394           @query_parser ||= QueryParser.new(@options[:default_search_field], @options)
 395         end
 396
 397         def process_query(query)
 398           query = query_parser.parse(query) if query.is_a?(String)
 399           return query
 400         end
 401
 402         # creates a new MultiReader to search the given Models
 403         def create_new_multi_reader
 404           sub_readers = @model_classes.map { |clazz|
 405             Index::IndexReader.open(clazz.class_index_dir)
 406           }
 407           @reader = Index::MultiReader.new(sub_readers)
 408           query_parser.fields = @reader.get_field_names.to_a
 409         end
 410
 411       end
 412
 413       module InstanceMethods
 414         include Ferret
 415         attr_reader :reindex
 416         @ferret_reindex = true
 417
 418         def ferret_before_update
 419           @ferret_reindex = true
 420         end
 421         alias :ferret_before_create :ferret_before_update
 422
 423         # add to index
 424         def ferret_create
 425           logger.debug "ferret_create/update: #{self.class.name} : #{self.id}"
 426           self.class.ferret_index << self.to_doc if @ferret_reindex
 427           @ferret_reindex = true
 428           true
 429         end
 430         alias :ferret_update :ferret_create
 431
 432         # remove from index
 433         def ferret_destroy
 434           begin
 435             self.class.ferret_index.query_delete("+id:#{self.id}")
 436           rescue
 437             logger.warn("Could not find indexed value for this object")
 438           end
 439           true
 440         end
 441
 442         # convert instance to ferret document
 443         def to_doc
 444           logger.debug "creating doc for class: #{self.class.name}"
 445           # Churn through the complete Active Record and add it to the Ferret document
 446           doc = Document::Document.new
 447           # store the id of each item
 448           doc << Document::Field.new( "id", self.id,
 449           Document::Field::Store::YES,
 450           Document::Field::Index::UNTOKENIZED )
 451           # store the class name if configured to do so
 452           if configuration[:store_class_name]
 453             doc << Document::Field.new( "class_name", self.class.name,
 454             Document::Field::Store::YES,
 455             Document::Field::Index::UNTOKENIZED )
 456           end
 457           # iterate through the fields and add them to the document
 458           if fields_for_ferret
 459             # have user defined fields
 460             fields_for_ferret.each do |field|
 461               doc << self.send("#{field}_to_ferret")
 462             end
 463           else
 464             # take all fields
 465             self.attributes.each_pair do |key,val|
 466               unless key == :id
 467                 logger.debug "add field #{key} with value #{val}"
 468                 doc << Document::Field.new(
 469                                            key,
 470                                            val.to_s,
 471                                            Ferret::Document::Field::Store::NO,
 472                                            Ferret::Document::Field::Index::TOKENIZED)
 473               end
 474             end
 475           end
 476           return doc
 477         end
 478
 479       end
 480     end
 481   end
 482 end
 483
 484 # reopen ActiveRecord and include all the above to make
 485 # them available to all our models if they want it
 486 ActiveRecord::Base.class_eval do
 487   include FerretMixin::Acts::ARFerret
 488 end
 489
 490 class Ferret::Index::MultiReader
 491   def latest?
 492     # TODO: Exception handling added to resolve ticket #6.
 493     # It should be clarified wether this is a bug in Ferret
 494     # in which case a bug report should be posted on the Ferret Trac.
 495     begin
 496       @sub_readers.each { |r| return false unless r.latest? }
 497     rescue
 498       return false
 499     end
 500     true
 501   end
 502 end
 503
 504 # END acts_as_ferret.rb