1 require 'html/tokenizer'
3 require 'html/selector'
6 # A top-level HTML document. You give it a body of text, and it will parse that
7 # text into a tree of nodes.
8 class Document #:nodoc:
10 # The root of the parsed document.
13 # Create a new Document from the given text.
14 def initialize(text, strict=false, xml=false)
15 tokenizer = Tokenizer.new(text)
17 node_stack = [ @root ]
18 while token = tokenizer.next
19 node = Node.parse(node_stack.last, tokenizer.line, tokenizer.position, token)
21 node_stack.last.children << node unless node.tag? && node.closing == :close
23 if node_stack.length > 1 && node.closing == :close
24 if node_stack.last.name == node.name
25 if node_stack.last.children.empty?
26 node_stack.last.children << Text.new(node_stack.last, node.line, node.position, "")
30 open_start = node_stack.last.position - 20
31 open_start = 0 if open_start < 0
32 close_start = node.position - 20
33 close_start = 0 if close_start < 0
35 ignoring attempt to close #{node_stack.last.name} with #{node.name}
36 opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
37 closed at byte #{node.position}, line #{node.line}
38 attributes at open: #{node_stack.last.attributes.inspect}
39 text around open: #{text[open_start,40].inspect}
40 text around close: #{text[close_start,40].inspect}
42 strict ? raise(msg) : warn(msg)
44 elsif !node.childless?(xml) && node.closing != :close
51 # Search the tree for (and return) the first node that matches the given
52 # conditions. The conditions are interpreted differently for different node
53 # types, see HTML::Text#find and HTML::Tag#find.
55 @root.find(conditions)
58 # Search the tree for (and return) all nodes that match the given
59 # conditions. The conditions are interpreted differently for different node
60 # types, see HTML::Text#find and HTML::Tag#find.
61 def find_all(conditions)
62 @root.find_all(conditions)