From d10bb77f17172a51b8de546087373e0eba77ce9d Mon Sep 17 00:00:00 2001 From: Abhay Kumar Date: Wed, 30 Jan 2008 02:38:13 -0800 Subject: [PATCH] major refactoring * turned Calais into a Module * added Response/Client objects * Response::Name/Response::Relationship objects also added * made things generally awesome --- CHANGELOG | 2 +- README | 16 +++--- lib/calais.rb | 150 ++++++++++++++++--------------------------------- lib/calais/client.rb | 62 ++++++++++++++++++++ lib/calais/response.rb | 73 ++++++++++++++++++++++++ spec/calais_spec.rb | 45 ++++++--------- 6 files changed, 209 insertions(+), 139 deletions(-) rewrite lib/calais.rb (84%) create mode 100644 lib/calais/client.rb create mode 100644 lib/calais/response.rb diff --git a/CHANGELOG b/CHANGELOG index 6b83d9f..8c07990 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ == UNRELEASED * Access to Open Calais's Enlighten action -* Convenience method to all of the names in a document \ No newline at end of file +* Single point of access to process a document diff --git a/README b/README index 15f1471..fca6b7e 100644 --- a/README +++ b/README @@ -8,9 +8,10 @@ A Ruby interface to the Open Calais API (http://opencalais.com) == FEATURES/PROBLEMS: -* Basic access to the Open Calais API's Enlighten action. * Accepts documents in text/plain, text/xml and text/html format. -* Output is RDF representation of input document. +* Basic access to the Open Calais API's Enlighten action. + * Output is RDF representation of input document. +* Single function ability to tag a document and receive a response in RDF format, names in the document, and their relationships. == SYNOPSIS: @@ -20,18 +21,17 @@ This is a very basic wrapper to the Open Calais API. It uses the POST endpoint a This is the easiest way to get the RDF-formated response from the OpenCalais service. -If you want to do something more fun like getting all the names from a body of text, try this: - - Calais.names(:content => "The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.", :content_type => :text) +If you want to do something more fun like getting all sorts of fun information about a document, you can try this: -This will return an hash of arrays that would look like this: + Calais.process_document(:content => "The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.", :content_type => :text) - {"Country"=>["United Kingdom"], "Company"=>["McDonald's"], "IndustryTerm"=>["food chain"]} +This will return an object containing the RDF representation of the text, the names in the text, and any relationships that exist there. == REQUIREMENTS: * Ruby 1.8.5 or better - * Uses the following standard libraries: digest/sha1, net/http, rexml/document, yaml + * Uses the following standard libraries: digest/sha1, net/http, yaml, cgi +* Hpricot == INSTALL: diff --git a/lib/calais.rb b/lib/calais.rb dissimilarity index 84% index 92bf799..2c03010 100644 --- a/lib/calais.rb +++ b/lib/calais.rb @@ -1,103 +1,47 @@ -require 'digest/sha1' -require 'net/http' -require 'rexml/document' -require 'yaml' - -$KCODE = "UTF8" - -class Calais - POST_URL = "http://api.opencalais.com" - - AVAILABLE_OUTPUT_FORMATS = { - :rdf => "XML/RDF" - } - DEFAULT_OUTPUT_FORMAT = :rdf - - AVAILABLE_CONTENT_TYPES = { - :xml => "TEXT/XML", - :html => "TEXT/HTML", - :text => "TEXT/TXT" - } - DEFAULT_CONTENT_TYPE = :xml - - DEFAULT_SUBMITTER = "calais.rb" - - AVAILABLE_METHODS = { - :enlighten => "/enlighten/calais.asmx/Enlighten" - } - - MAX_RETRIES = 5 - - class << self - def enlighten(*args, &block) Calais.new(*args, &block).call(:enlighten) end - def names(*args, &block) Calais.get_names(Calais.enlighten(*args, &block)) end - end - - attr_accessor :license_id - attr_accessor :content - attr_accessor :content_type, :output_format - attr_accessor :allow_distribution, :allow_search, :submitter, :external_id - attr_accessor :external_metadata - - def initialize(options={}, &block) - @license_id = YAML.load(File.read(File.join(File.dirname(__FILE__), '..', 'conf', 'calais.yml')))['key'] - options.each {|k,v| send("#{k}=", v)} - yield(self) if block_given? - end - - def call(method, times=1) - method = method.intern unless method.is_a?(Symbol) - raise ArgumentError.new("Unknown method: #{method}") unless AVAILABLE_METHODS.keys.include? method - - post_args = { - "licenseID" => @license_id, - "content" => @content, - "paramsXML" => params_xml - } - - url = URI.parse(POST_URL + AVAILABLE_METHODS[method]) - resp, data = Net::HTTP.post_form(url, post_args) - handle_response(resp, data, post_args, method, times) - end - - def handle_response(resp, data, post_args, method, times) - if resp.is_a?(Net::HTTPOK) - REXML::Document.new(data).root.text - elsif times >= MAX_RETRIES - %[\nToo many retries (#{MAX_RETRIES}.\nLast response was #{resp})] - else - call(method, times+1) - end - end - - def self.get_names(rdf) - doc = REXML::Document.new(rdf) - return {} if doc.elements["error"] - doc.root.elements.to_a("//rdf:Description/c:name").inject({}) do |hsh, ele| - type = ele.parent.elements["rdf:type"].attribute("rdf:resource").value.match(%r{type/em/e/(.*)})[1] rescue nil - hsh[type] = hsh[type] ? hsh[type].concat([ele.text]) : [ele.text] if type - hsh - end - end - - private - def params_xml - content_type = @content_type && AVAILABLE_CONTENT_TYPES.keys.include?(@content_type) ? AVAILABLE_CONTENT_TYPES[@content_type] : AVAILABLE_CONTENT_TYPES[DEFAULT_CONTENT_TYPE] - output_format = @output_format && AVAILABLE_OUTPUT_FORMATS.keys.include?(@output_format) ? AVAILABLE_OUTPUT_FORMATS[@output_format] : AVAILABLE_OUTPUT_FORMATS[DEFAULT_OUTPUT_FORMAT] - allow_distribution = @allow_distribution ? "true" : "false" - allow_search = @allow_search ? "true" : "false" - submitter = @submitter || DEFAULT_SUBMITTER - external_id = @external_id || Digest::SHA1.hexdigest(@content.inspect) - external_metadata = @external_metadata || "" - - xml = %[] - xml += %[] - xml += %[] - xml += %[#{external_metadata}] - xml += %[] - end -end - -class Calais - VERSION = '0.0.1' -end +require 'digest/sha1' +require 'net/http' +require 'yaml' +require 'cgi' + +require 'rubygems' +require 'hpricot' + +$KCODE = "UTF8" + +Dir.glob(File.join(File.dirname(__FILE__), 'calais/*.rb')).each { |f| require f } + +module Calais + POST_URL = "http://api.opencalais.com" + + AVAILABLE_OUTPUT_FORMATS = { + :rdf => "XML/RDF" + } + DEFAULT_OUTPUT_FORMAT = :rdf + + AVAILABLE_CONTENT_TYPES = { + :xml => "TEXT/XML", + :html => "TEXT/HTML", + :text => "TEXT/TXT" + } + DEFAULT_CONTENT_TYPE = :xml + + DEFAULT_SUBMITTER = "calais.rb" + + AVAILABLE_METHODS = { + :enlighten => "/enlighten/calais.asmx/Enlighten" + } + + MAX_RETRIES = 5 + + class << self + def enlighten(*args, &block) Client.new(*args, &block).call(:enlighten) end + def process_document(*args, &block) + data, error = Calais.enlighten(*args, &block) + Client.process_data(data, error) + end + end +end + +module Calais + VERSION = '0.0.1' +end diff --git a/lib/calais/client.rb b/lib/calais/client.rb new file mode 100644 index 0000000..eb2acfd --- /dev/null +++ b/lib/calais/client.rb @@ -0,0 +1,62 @@ +module Calais + class Client + attr_accessor :license_id + attr_accessor :content + attr_accessor :content_type, :output_format + attr_accessor :allow_distribution, :allow_search, :submitter, :external_id + attr_accessor :external_metadata + + def initialize(options={}, &block) + @license_id = YAML.load(File.read(File.join(File.dirname(__FILE__), '..', '..', 'conf', 'calais.yml')))['key'] + options.each {|k,v| send("#{k}=", v)} + yield(self) if block_given? + end + + def call(method, times=1) + method = method.intern unless method.is_a?(Symbol) + raise ArgumentError.new("Unknown method: #{method}") unless AVAILABLE_METHODS.keys.include? method + + post_args = { + "licenseID" => @license_id, + "content" => @content, + "paramsXML" => params_xml + } + + url = URI.parse(POST_URL + AVAILABLE_METHODS[method]) + resp, data = Net::HTTP.post_form(url, post_args) + + handle_response(resp, data, method, times) + end + + def self.process_data(data, error=nil) + Calais::Response.new(data, error) + end + + private + def handle_response(resp, data, method, times) + if resp.is_a? Net::HTTPOK + [data, nil] + elsif times >= MAX_RETRIES + [data, "Too many retries: #{times}"] + else + call(method, times+1) + end + end + + def params_xml + content_type = @content_type && AVAILABLE_CONTENT_TYPES.keys.include?(@content_type) ? AVAILABLE_CONTENT_TYPES[@content_type] : AVAILABLE_CONTENT_TYPES[DEFAULT_CONTENT_TYPE] + output_format = @output_format && AVAILABLE_OUTPUT_FORMATS.keys.include?(@output_format) ? AVAILABLE_OUTPUT_FORMATS[@output_format] : AVAILABLE_OUTPUT_FORMATS[DEFAULT_OUTPUT_FORMAT] + allow_distribution = @allow_distribution ? "true" : "false" + allow_search = @allow_search ? "true" : "false" + submitter = @submitter || DEFAULT_SUBMITTER + external_id = @external_id || Digest::SHA1.hexdigest(@content.inspect) + external_metadata = @external_metadata || "" + + xml = %[] + xml += %[] + xml += %[] + xml += %[#{external_metadata}] + xml += %[] + end + end +end \ No newline at end of file diff --git a/lib/calais/response.rb b/lib/calais/response.rb new file mode 100644 index 0000000..9829bf4 --- /dev/null +++ b/lib/calais/response.rb @@ -0,0 +1,73 @@ +module Calais + class Response + attr_reader :rdf, :names, :relationships, :error + + def initialize(raw, error=nil) + @error = error + @names = [] + @relationships = [] + + parse_rdf(raw) + parse_names + parse_relationships + end + + private + def parse_rdf(raw) + @rdf = CGI::unescapeHTML Hpricot.XML(raw).at("/string").inner_html + @hpricot = Hpricot.XML(@rdf) + @error = Hpricot.XML(response).at("/Error/Exception").inner_html rescue @error + end + + def parse_names + @names = @hpricot.root.search("rdf:Description//c:name//..").map do |ele| + Calais::Response::Name.new( + :name => ele.at("c:name").inner_html, + :hash => ele.attributes["rdf:about"].split('/').last, + :type => ele.at("rdf:type").attributes["rdf:resource"].split('/').last + ) + end unless @error + end + + def parse_relationships + doc = @hpricot.dup + doc.search("rdf:Description//c:docId//..").remove + doc.search("rdf:Description//c:document//..").remove + doc.search("rdf:Description//c:name//..").remove + + @relationships = doc.root.search("rdf:Description").map do |ele| + relationship = ele.at("rdf:type") + actor = relationship.next_sibling + metadata = actor.next_sibling.attributes["rdf:resource"] ? nil : actor.next_sibling.inner_html.strip + target = metadata ? actor.next_sibling.next_sibling : actor.next_sibling + + Calais::Response::Relationship.new( + :type => relationship.attributes["rdf:resource"].split('/').last, + :actor => Name.find_in_names(actor.attributes["rdf:resource"].split('/').last, @names), + :target => Name.find_in_names(target.attributes["rdf:resource"].split('/').last, @names), + :metadata => metadata + ) + end + end + + class Name + attr_accessor :name, :type, :hash + + def initialize(args={}) + args.each {|k,v| send("#{k}=", v)} + end + + def self.find_in_names(hash, names) + names.select {|name| name.hash == hash }.first + end + end + + class Relationship + attr_accessor :type, :actor, :target, :metadata + + def initialize(args={}) + args.each {|k,v| send("#{k}=", v)} + end + end + end +end \ No newline at end of file diff --git a/spec/calais_spec.rb b/spec/calais_spec.rb index 13dbfc3..a038342 100644 --- a/spec/calais_spec.rb +++ b/spec/calais_spec.rb @@ -6,11 +6,11 @@ describe Calais do end end -describe Calais, ".new" do +describe Calais::Client, ".new" do it "accepts arguments as a hash" do client = nil - lambda { client = Calais.new(:content => SAMPLE_DOCUMENT) }.should_not raise_error(ArgumentError) + lambda { client = Calais::Client.new(:content => SAMPLE_DOCUMENT) }.should_not raise_error(ArgumentError) client.license_id.should == LICENSE_KEY client.content.should == SAMPLE_DOCUMENT @@ -20,7 +20,7 @@ describe Calais, ".new" do client = nil lambda { - client = Calais.new do |c| + client = Calais::Client.new do |c| c.content = SAMPLE_DOCUMENT end }.should_not raise_error(ArgumentError) @@ -29,37 +29,28 @@ describe Calais, ".new" do client.content.should == SAMPLE_DOCUMENT end - it "should not accept unkonwn attributes" do - lambda { Calais.new(:monkey => "monkey") }.should raise_error(NoMethodError) + it "should not accept unknown attributes" do + lambda { Calais::Client.new(:monkey => "monkey") }.should raise_error(NoMethodError) end end -describe Calais, ".enlighten" do - before(:all) do - @marked = Calais.enlighten(:content => SAMPLE_DOCUMENT, :content_type => :xml) - end - - it "returns a string" do - @marked.should_not be_nil - @marked.should be_a_kind_of(String) +describe Calais, ".process_document" do + it "returns a Calais::Response" do + response = Calais.process_document(:content => SAMPLE_DOCUMENT, :content_type => :xml) + response.should_not be_nil + response.should be_a_kind_of(Calais::Response) end -end -describe Calais, ".names" do - before(:all) do - @names = Calais.names(:content => SAMPLE_DOCUMENT, :content_type => :xml) - end - - it "returns a hash of key/array pairs" do - @names.should_not be_nil - @names.should be_a_kind_of(Hash) - @names.each_value {|v| v.should be_a_kind_of(Array)} + it "returns a Calais::Response (with relationships)" do + response = Calais.process_document(:content => File.read(File.join(File.dirname(__FILE__), 'fixtures', 'bicycles_austrailia.xml')), :content_type => :xml) + response.should_not be_nil + response.should be_a_kind_of(Calais::Response) end end -describe Calais, ".call" do +describe Calais::Client, ".call" do before(:all) do - @client = Calais.new(:content => SAMPLE_DOCUMENT) + @client = Calais::Client.new(:content => SAMPLE_DOCUMENT) end it "accepts known methods" do @@ -71,9 +62,9 @@ describe Calais, ".call" do end end -describe Calais, ".params_xml" do +describe Calais::Client, ".params_xml" do it "returns an xml encoded string" do - client = Calais.new(:content => SAMPLE_DOCUMENT, :content_type => :xml) + client = Calais::Client.new(:content => SAMPLE_DOCUMENT, :content_type => :xml) client.send("params_xml").should_not be_nil client.send("params_xml").should == %[] end -- 2.11.4.GIT