lib/acts_as_ferret.rb

   1 # Copyright (c) 2006 Kasper Weibel Nielsen-Refs, Thomas Lockney, Jens Krämer
   2 #
   3 # Permission is hereby granted, free of charge, to any person obtaining a copy
   4 # of this software and associated documentation files (the "Software"), to deal
   5 # in the Software without restriction, including without limitation the rights
   6 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7 # copies of the Software, and to permit persons to whom the Software is
   8 # furnished to do so, subject to the following conditions:
   9 #
  10 # The above copyright notice and this permission notice shall be included in all
  11 # copies or substantial portions of the Software.
  12 #
  13 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19 # SOFTWARE.
  20
  21 require 'active_record'
  22
  23 # Ferret 0.3.2 is considered the most reliable ferret version for now, all unit
  24 # tests should pass (with or w/o the C extension). Speed is not as good as with the
  25 # C-only Ferret 0.9.1, but still fast enough for common scenarios and work
  26 # loads. Until Ferret 0.9.x stabilizes, you should consider this
  27 # version for production scenarios.
  28 require_gem 'ferret', '=0.3.2'
  29
# Ferret >=0.9, Ruby-only, is much slower than 0.3.2 with its small C
# extension, so it's not really an option.
  32 # some tests related to searching multiple indexes at once are failing here
  33 # (returning more results than expected)
  34 #require 'rferret'
  35
# This will use the most recent installed ferret version, usually this will be
# 0.9.1 in the C-flavour. Difficult topic, as some parts of the API are not
# accessible yet. Several tests fail with this version, but basic single-index
# functionality is there and working.
  40 #require 'ferret'
  41
  42 # Yet another Ferret Mixin.
  43 #
  44 # This mixin adds full text search capabilities to any Rails model.
  45 #
  46 # It is heavily based on the original acts_as_ferret plugin done by
  47 # Kasper Weibel and a modified version done by Thomas Lockney, which
  48 # both can be found on
  49 # http://ferret.davebalmain.com/trac/wiki/FerretOnRails
  50 #
  51 # Changes I did to the original version include:
  52 #
  53 # - automatic creation of missing index directories
  54 # - I took out the storage of class names in the index, as I prefer
  55 #   the 'one model, one index'-approach. If needed, multiple models
  56 #   can share one index by using a common superclass for these.
  57 # - separate index directories for different Rails environments, so
  58 #   unit tests don't mess up the production/development indexes.
  59 # - default to AND queries, as this is the behaviour most users expect
  60 # - index searcher instances are kept as class variables and will be re-used
  61 #   until an index change is detected, as opening a searcher is quite expensive
  62 #   this should improve search performance
  63 # - query parser is kept as a class variable
  64 #
  65 # usage:
# include the following in your model class (specifying the fields you want to get indexed):
  67 # acts_as_ferret :fields => [ 'title', 'description' ]
  68 #
  69 # now you can use ModelClass.find_by_contents(query) to find instances of your model
  70 # whose indexed fields match a given query. All query terms are required by default, but
  71 # explicit OR queries are possible. This differs from the ferret default, but imho is the more
  72 # often needed/expected behaviour (more query terms result in less results).
  73 #
  74 # Released under the MIT license.
  75 #
  76 # Authors:
  77 # Kasper Weibel Nielsen-Refs (original author)
  78 # Jens Kraemer <jk@jkraemer.net>
  79 #
  80 module FerretMixin
  81   module Acts #:nodoc:
  82     module ARFerret #:nodoc:
  83
  84       def self.ensure_directory(dir)
  85         FileUtils.mkdir_p dir unless File.directory? dir
  86       end
  87
  88       # make sure the default index base dir exists. by default, all indexes are created
  89       # under RAILS_ROOT/index/RAILS_ENV
  90       def self.init_index_basedir
  91         index_base = "#{RAILS_ROOT}/index"
  92         ensure_directory index_base
  93         @@index_dir = "#{index_base}/#{RAILS_ENV}"
  94         ensure_directory @@index_dir
  95       end
  96
  97       mattr_accessor :index_dir
  98       init_index_basedir
  99
 100       def self.append_features(base)
 101         super
 102         base.extend(ClassMethods)
 103       end
 104
 105       # declare the class level helper methods
 106       # which will load the relevant instance methods defined below when invoked
 107       module ClassMethods
 108
 109         # helper that defines a method that adds the given field to a lucene
 110         # document instance
 111         def define_to_field_method(field, options = {})
 112           default_opts = { :store => Ferret::Document::Field::Store::NO,
 113             :index => Ferret::Document::Field::Index::TOKENIZED,
 114             :term_vector => Ferret::Document::Field::TermVector::NO,
 115             :binary => false,
 116             :boost => 1.0
 117           }
 118           default_opts.update(options) if options.is_a?(Hash)
 119           fields_for_ferret << field
 120           define_method("#{field}_to_ferret".to_sym) do
 121             begin
 122               val = self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.method(field).call
 123             rescue
 124               logger.debug("Error retrieving value for field #{field}: #{$!}")
 125               val = ''
 126             end
 127             logger.debug("Adding field #{field} with value '#{val}' to index")
 128             Ferret::Document::Field.new(field.to_s, val,
 129                                         default_opts[:store],
 130                                         default_opts[:index],
 131                                         default_opts[:term_vector],
 132                                         default_opts[:binary],
 133                                         default_opts[:boost])
 134           end
 135         end
 136
 137         # TODO: do we need to define this at this level ? Maybe it's
 138         # sufficient to do this only in classes calling acts_as_ferret ?
 139         def reloadable?; false end
 140
 141         @@ferret_indexes = Hash.new
 142         def ferret_indexes; @@ferret_indexes end
 143
 144         @@multi_indexes = Hash.new
 145         def multi_indexes; @@multi_indexes end
 146
 147         # declares a class as ferret-searchable.
 148         #
 149         # options are:
 150         #
 151         # fields:: names all fields to include in the index. If not given,
 152         #   all attributes of the class will be indexed. You may also give
 153         #   symbols pointing to instance methods of your model here, i.e.
 154         #   to retrieve and index data from a related model.
 155         #
 156         # index_dir:: declares the directory where to put the index for this class.
 157         #   The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
 158         #   The index directory will be created if it doesn't exist.
 159         #
 160         # store_class_name:: to make search across multiple models useful, set
 161         # this to true. the model class name will be stored in a keyword field
 162         # named class_name
 163         #
 164         # ferret_options may be:
 165         # occur_default:: - whether query terms are required by
 166         #   default (the default), or not. Specify one of
 167         #   Ferret::Search::BooleanClause::Occur::MUST or
 168         #   Ferret::Search::BooleanClause::Occur::SHOULD
 169         #
        # analyzer:: the analyzer to use for query parsing (default: nil,
        #   which means the ferret default Analyzer gets used)
 172         #
 173         def acts_as_ferret(options={}, ferret_options={})
 174           configuration = {
 175             :fields => nil,
 176             :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
 177             :store_class_name => false
 178           }
 179           ferret_configuration = {
 180             :occur_default => Ferret::Search::BooleanClause::Occur::MUST,
 181             :handle_parse_errors => true,
 182             :default_search_field => '*',
 183             # :analyzer => Analysis::StandardAnalyzer.new,
 184             # :wild_lower => true
 185           }
 186           configuration.update(options) if options.is_a?(Hash)
 187           ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
 188           # these properties are somewhat vital to the plugin and shouldn't
 189           # be overwritten by the user:
 190           ferret_configuration.update(
 191                                       :key               => 'id',
 192           :path              => configuration[:index_dir],
 193           :auto_flush        => true,
 194           :create_if_missing => true
 195           )
 196
 197           class_eval <<-EOV
 198               include FerretMixin::Acts::ARFerret::InstanceMethods
 199
 200               before_create :ferret_before_create
 201               before_update :ferret_before_update
 202               after_create :ferret_create
 203               after_update :ferret_update
 204               after_destroy :ferret_destroy
 205
 206               cattr_accessor :fields_for_ferret
 207               cattr_accessor :configuration
 208               cattr_accessor :ferret_configuration
 209
 210               @@fields_for_ferret = Array.new
 211               @@configuration = configuration
 212               @@ferret_configuration = ferret_configuration
 213
 214               if configuration[:fields].respond_to?(:each_pair)
 215                 configuration[:fields].each_pair do |key,val|
 216                   define_to_field_method(key,val)
 217                 end
 218               elsif configuration[:fields].respond_to?(:each)
 219                 configuration[:fields].each do |field|
 220                   define_to_field_method(field)
 221                 end
 222               else
 223                 @@fields_for_ferret = nil
 224               end
 225             EOV
 226           FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
 227         end
 228
 229         def class_index_dir
 230           configuration[:index_dir]
 231         end
 232
 233         # rebuild the index from all data stored for this model.
 234         # This is called automatically when no index exists yet.
 235         #
        # TODO: the automatic index initialization only works if
        # every model class has its
        # own index, otherwise the index will get populated only
        # with instances from the first model loaded
 240         def rebuild_index
 241           index = Ferret::Index::Index.new(ferret_configuration.merge(:create => true))
 242           self.find_all.each { |content| index << content.to_doc }
 243           logger.debug("Created Ferret index in: #{class_index_dir}")
 244           index.flush
 245           index.optimize
 246           index.close
 247         end
 248
 249         # Retrieve the Ferret::Index::Index instance for this model class.
 250         #
 251         # Index instances are stored in a hash, using the index directory
 252         # as the key. So model classes sharing a single index will share their
 253         # Index object, too.
 254         def ferret_index
 255           ferret_indexes[class_index_dir] ||= create_index_instance
 256         end
 257
 258         # creates a new Index::Index instance. Before that, a check is done
 259         # to see if the index exists in the file system. If not, index rebuild
 260         # from all model data retrieved by find(:all) is triggered.
 261         def create_index_instance
 262           rebuild_index unless File.file? "#{class_index_dir}/segments"
 263           Ferret::Index::Index.new(ferret_configuration)
 264         end
 265
 266         # Finds instances by contents. Terms are ANDed by default, can be circumvented
 267         # by using OR between terms.
 268         # options:
 269         # :first_doc - first hit to retrieve (useful for paging)
 270         # :num_docs - number of hits to retrieve
 271         #
 272         # find_options is a hash passed on to active_record's find when
 273         # retrieving the data from db, useful to i.e. prefetch relationships.
 274         def find_by_contents(q, options = {}, find_options = {})
 275           id_array = []
 276           scores_by_id = {}
 277           find_id_by_contents(q, options) do |element|
 278             id_array << id = element[:id].to_i
 279             scores_by_id[id] = element[:score]
 280           end
 281           begin
 282             if self.superclass == ActiveRecord::Base
 283               result = self.find(id_array, find_options)
 284             else
 285               # no direct subclass of Base --> STI
 286               # TODO: AR will filter out hits from other classes for us, but this
 287               # will lead to less results retrieved --> scoping of ferret query
 288               # to self.class is still needed.
 289               result = self.find(:all,
 290                                  find_options.merge(:conditions => ["id in (?)",id_array]))
 291             end
 292           rescue
 293             logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
 294           end
 295
 296           # sort results by score (descending)
 297           result.sort! { |b, a| scores_by_id[a.id] <=> scores_by_id[b.id] }
 298
 299           logger.debug "Query: #{q}\nResult id_array: #{id_array.inspect},\nresult: #{result},\nscores: #{scores_by_id.inspect}"
 300           return result
 301         end
 302
 303         # Finds instance model name, ids and scores by contents.
 304         # Useful if you want to search across models
 305         # Terms are ANDed by default, can be circumvented by using OR between terms.
 306         #
 307         # Example controller code (not tested):
 308         # def multi_search(query)
 309         #   result = []
 310         #   result << (Model1.find_id_by_contents query)
 311         #   result << (Model2.find_id_by_contents query)
 312         #   result << (Model3.find_id_by_contents query)
 313         #   result.flatten!
 314         #   result.sort! {|element| element[:score]}
        #   # Figure out for yourself how to retrieve and present the data from modelname and id
 316         # end
 317         #
        # Note that the scores retrieved this way aren't normalized across
        # indexes, so that the order of results after sorting by score will
        # differ from the order you would get when running the same query
        # on a single index containing all the data from Model1, Model2
        # and Model3.
 323         #
 324         # options:
 325         # :first_doc - first hit to retrieve (useful for paging)
 326         # :num_docs - number of hits to retrieve
 327         #
 328         # a block can be given too, it will be executed with every result hash:
 329         # find_id_by_contents(q, options) do |element|
 330         #    id_array << id = element[:id].to_i
 331         #    scores_by_id[id] = element[:score]
 332         # end
 333         #
 334         def find_id_by_contents(q, options = {})
 335           result = []
 336           hits = ferret_index.search(q, options)
 337           hits.each do |hit, score|
 338             result << {:model => self.name, :id => ferret_index[hit][:id], :score => score}
 339             yield result.last if block_given?
 340           end
 341           logger.debug "id_score_model array: #{result.inspect}"
 342           result
 343         end
 344
 345         # requires the store_class_name option of acts_as_ferret to be true
 346         # for all models queried this way.
 347         #
        # TODO: not optimal as each instance is fetched in a db call of its
        # own.
 350         def multi_search(query, additional_models = [], options = {})
 351           result = []
 352           id_multi_search(query, additional_models, options).each { |hit|
 353             result << Object.const_get(hit[:model]).find(hit[:id].to_i)
 354           }
 355           result
 356         end
 357
 358         # returns an array of hashes, each containing :class_name,
 359         # :id and :score for a hit.
 360         #
 361         def id_multi_search(query, additional_models = [], options = {})
 362           additional_models << self
 363           searcher = multi_index(additional_models)
 364           result = []
 365           hits = searcher.search(query, options)
 366           hits.each { |hit, score|
 367             doc = searcher.doc(hit)
 368             result << { :model => doc[:class_name], :id => doc[:id], :score => score }
 369           }
 370           result
 371         end
 372
 373         # returns a MultiIndex instance operating on a MultiReader
 374         def multi_index(model_classes)
 375           model_classes.sort! { |a, b| a.name <=> b.name }
 376           key = model_classes.inject("") { |s, clazz| s << clazz.name }
 377           @@multi_indexes[key] ||= MultiIndex.new(model_classes, ferret_configuration)
 378         end
 379
 380       end
 381
 382
 383       # not threadsafe
 384       class MultiIndex
 385
 386         attr_reader :reader
 387
 388         # todo: check for necessary index rebuilds in this place, too
 389         # idea - each class gets a create_reader method that does this
 390         def initialize(model_classes, options = {})
 391           @model_classes = model_classes
 392           @options = {
 393             :default_search_field => '*',
 394             :analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new
 395           }.update(options)
 396           ensure_reader
 397         end
 398
 399         def search(query, options={})
 400           query = process_query(query)
 401           searcher.search(query, options)
 402         end
 403
 404         def ensure_reader
 405           create_new_multi_reader unless @reader
 406           unless @reader.latest?
 407             if @searcher
 408               @searcher.close # will close the multi_reader and all sub_readers as well
 409             else
 410               @reader.close # just close the reader
 411             end
 412             create_new_multi_reader
 413             @searcher = nil
 414           end
 415         end
 416
 417         def searcher
 418           ensure_reader
 419           @searcher ||= Ferret::Search::IndexSearcher.new(@reader)
 420         end
 421
 422         def doc(i)
 423           searcher.doc(i)
 424         end
 425
 426         def query_parser
 427           @query_parser ||= Ferret::QueryParser.new(@options[:default_search_field], @options)
 428         end
 429
 430         def process_query(query)
 431           query = query_parser.parse(query) if query.is_a?(String)
 432           return query
 433         end
 434
 435         # creates a new MultiReader to search the given Models
 436         def create_new_multi_reader
 437           sub_readers = @model_classes.map { |clazz|
 438             Ferret::Index::IndexReader.open(clazz.class_index_dir)
 439           }
 440           @reader = Ferret::Index::MultiReader.new(sub_readers)
 441           query_parser.fields = @reader.get_field_names.to_a
 442         end
 443
 444       end
 445
 446       module InstanceMethods
 447         attr_reader :reindex
 448         @ferret_reindex = true
 449
 450         def ferret_before_update
 451           @ferret_reindex = true
 452         end
 453         alias :ferret_before_create :ferret_before_update
 454
 455         # add to index
 456         def ferret_create
 457           logger.debug "ferret_create/update: #{self.class.name} : #{self.id}"
 458           self.class.ferret_index << self.to_doc if @ferret_reindex
 459           @ferret_reindex = true
 460           true
 461         end
 462         alias :ferret_update :ferret_create
 463
 464         # remove from index
 465         def ferret_destroy
 466           begin
 467             self.class.ferret_index.query_delete("+id:#{self.id}")
 468           rescue
 469             logger.warn("Could not find indexed value for this object")
 470           end
 471           true
 472         end
 473
 474         # convert instance to ferret document
 475         def to_doc
 476           logger.debug "creating doc for class: #{self.class.name}"
 477           # Churn through the complete Active Record and add it to the Ferret document
 478           doc = Ferret::Document::Document.new
 479           # store the id of each item
 480           doc << Ferret::Document::Field.new( "id", self.id,
 481           Ferret::Document::Field::Store::YES,
 482           Ferret::Document::Field::Index::UNTOKENIZED )
 483           # store the class name if configured to do so
 484           if configuration[:store_class_name]
 485             doc << Ferret::Document::Field.new( "class_name", self.class.name,
 486             Ferret::Document::Field::Store::YES,
 487             Ferret::Document::Field::Index::UNTOKENIZED )
 488           end
 489           # iterate through the fields and add them to the document
 490           if fields_for_ferret
 491             # have user defined fields
 492             fields_for_ferret.each do |field|
 493               doc << self.send("#{field}_to_ferret")
 494             end
 495           else
 496             # take all fields
 497             self.attributes.each_pair do |key,val|
 498               unless key == :id
 499                 logger.debug "add field #{key} with value #{val}"
 500                 doc << Ferret::Document::Field.new(
 501                                            key,
 502                                            val.to_s,
 503                                            Ferret::Document::Field::Store::NO,
 504                                            Ferret::Document::Field::Index::TOKENIZED)
 505               end
 506             end
 507           end
 508           return doc
 509         end
 510
 511         # BIG TODO: this file really gets too big. need to refactor a bit...
 512         # maybe extract the more like this stuff, could be useful somewhere
 513         # else, too...
 514
 515
        # returns other instances of this class, which have similar contents
        # like this one. Basically works like this: find out n most interesting
        # (i.e. characteristic) terms from this document, and then build a
        # query from those which is run against the whole index. Which terms
        # are interesting is decided on various criteria which can be
        # influenced by the given options.
 522         #
 523         # The algorithm used here is a quite straight port of the MoreLikeThis class
 524         # from Apache Lucene.
 525         #
 526         # options are:
 527         # :field_names : Array of field names to use for similarity search (mandatory)
 528         # :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
 529         # :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
 530         # :min_word_length => nil, # Ignore words if less than this len (longer
 531         # words tend to be more characteristic for the document they occur in).
 532         # :max_word_length => nil, # Ignore words if greater than this len.
 533         # :max_query_terms => 25,  # maximum number of terms in the query built
 534         # :max_num_tokens => 5000, # maximum number of tokens to examine in a
 535         # single field
 536         # :boost => false,         # when true, a boost according to the
 537         # relative score of a term is applied to this Term's TermQuery.
 538         # :similarity => Ferret::Search::Similarity.default, # the similarity
 539         # implementation to use
 540         # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to
 541         # use
 542         def more_like_this(options={})
 543           options = {
 544             :field_names => nil,  # Default field names
 545             :min_term_freq => 2,  # Ignore terms with less than this frequency in the source doc.
 546             :min_doc_freq => 5,   # Ignore words which do not occur in at least this many docs
 547             :min_word_length => nil, # Ignore words if less than this len.
 548             :max_word_length => nil, # Ignore words if greater than this len.
 549             :max_query_terms => 25,  # maximum number of terms in the query built
 550             :max_num_tokens => 5000,
 551             :boost => false,
 552             :similarity => Ferret::Search::Similarity.default,
 553             :analyzer => Ferret::Analysis::StandardAnalyzer.new
 554           }.update(options)
 555           index = self.class.ferret_index
 556           begin
 557             reader = index.send(:reader)
 558           rescue
 559             # ferret >=0.9, C-Version doesn't allow access to Index#reader
 560             reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false))
 561           end
 562           doc_number = self.document_number
 563           term_freq_map = retrieve_terms(document_number, reader, options)
 564           priority_queue = create_queue(term_freq_map, reader, options)
 565           query = create_query(priority_queue, options)
 566           self.class.find_by_contents(query)
 567         end
 568
 569
 570         def create_query(priority_queue, options={})
 571           query = Ferret::Search::BooleanQuery.new
 572           qterms = 0
 573           best_score = 0
 574           while(cur = priority_queue.pop)
 575             term_query = Ferret::Search::TermQuery.new(cur.to_term)
 576
 577             if options[:boost]
 578               # boost term according to relative score
 579               # TODO untested
 580               best_score ||= cur.score
 581               term_query.boost = cur.score / best_score
 582             end
 583             begin
 584               query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD)
 585             rescue Ferret::Search::BooleanQuery::TooManyClauses
 586               break
 587             end
 588             qterms += 1
 589             break if options[:max_query_terms] > 0 && qterms >= options[:max_query_terms]
 590           end
 591           # exclude ourselves
 592           t = Ferret::Index::Term.new('id', self.id.to_s)
 593           query.add_query(Ferret::Search::TermQuery.new(t),
 594                           Ferret::Search::BooleanClause::Occur::MUST_NOT)
 595           return query
 596         end
 597
 598
 599         def document_number
 600           hits = self.class.ferret_index.search("id:#{self.id}")
 601           hits.each { |hit, score| return hit }
 602         end
 603
 604         # creates a term/term_frequency map for terms from the fields
 605         # given in options[:field_names]
 606         def retrieve_terms(doc_number, reader, options)
 607           field_names = options[:field_names]
 608           max_num_tokens = options[:max_num_tokens]
 609           term_freq_map = Hash.new(0)
 610           field_names.each do |field|
 611             term_freq_vector = reader.get_term_vector(document_number, field)
 612             if term_freq_vector
 613               # use stored term vector
 614               # TODO untested
 615               term_freq_vector.terms.each_with_index do |term, i|
 616                 term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term)
 617               end
 618             else
 619               # no term vector stored, extract terms from document content
 620               # TODO: if no content stored, maybe use content from self ?
 621               doc = reader.get_document(doc_number)
 622               token_count = 0
 623
 624               # C-Ferret >=0.9 again, no #each in tokenstream :-(
 625               ts = options[:analyzer].token_stream(field, doc[field])
 626               while token = ts.next
 627               #options[:analyzer].token_stream(field, doc[field]).each do |token|
 628                 break if (token_count+=1) > max_num_tokens
 629
 630                 next if noise_word?(token_text(token))
 631                 term_freq_map[token_text(token)] += 1
 632               end
 633             end
 634           end
 635           term_freq_map
 636         end
 637
 638         # extract textual value of a token
 639         def token_text(token)
 640           # token.term_text is for ferret 0.3.2
 641           token.respond_to?(:text) ? token.text : token.term_text
 642         end
 643
 644         # create an ordered(by score) list of word,fieldname,score
 645         # structures
 646         def create_queue(term_freq_map, reader, options)
 647           pq = Array.new(term_freq_map.size)
 648
 649           similarity = options[:similarity]
 650           num_docs = reader.num_docs
 651           term_freq_map.each_pair do |word, tf|
 652             # filter out words that don't occur enough times in the source
 653             next if options[:min_term_freq] && tf < options[:min_term_freq]
 654
 655             # go through all the fields and find the largest document frequency
 656             top_field = options[:field_names].first
 657             doc_freq = 0
 658             options[:field_names].each do |field_name|
 659               freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word))
 660               if freq > doc_freq
 661                 top_field = field_name
 662                 doc_freq = freq
 663               end
 664             end
 665             # filter out words that don't occur in enough docs
 666             next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
 667             next if doc_freq == 0 # index update problem ?
 668
 669             idf = similarity.idf(doc_freq, num_docs)
 670             score = tf * idf
 671             pq << FrequencyQueueItem.new(word, top_field, score)
 672           end
 673           pq.compact!
 674           pq.sort! { |a,b| a.score<=>b.score }
 675           return pq
 676         end
 677
 678         def noise_word?(text)
 679           false
 680         end
 681
 682       end
 683
 684       class FrequencyQueueItem
 685         attr_reader :word, :field, :score
 686         def initialize(word, field, score)
 687           @word = word; @field = field; @score = score
 688         end
 689         def to_term
 690           Ferret::Index::Term.new(self.field, self.word)
 691         end
 692       end
 693
 694     end
 695   end
 696 end
 697
 698 # reopen ActiveRecord and include all the above to make
 699 # them available to all our models if they want it
 700 ActiveRecord::Base.class_eval do
 701   include FerretMixin::Acts::ARFerret
 702 end
 703
 704 class Ferret::Index::MultiReader
 705   def latest?
 706     # TODO: Exception handling added to resolve ticket #6.
 707     # It should be clarified wether this is a bug in Ferret
 708     # in which case a bug report should be posted on the Ferret Trac.
 709     begin
 710       @sub_readers.each { |r| return false unless r.latest? }
 711     rescue
 712       return false
 713     end
 714     true
 715   end
 716 end
 717
 718 # END acts_as_ferret.rb