2 # Copyright (c) 2006 Assaf Arkin (http://labnotes.org)
3 # Under MIT and/or CC By license.
8 # Selects HTML elements using CSS 2 selectors.
10 # The +Selector+ class uses CSS selector expressions to match and select
14 # selector = HTML::Selector.new "form.login[action=/login]"
15 # creates a new selector that matches any +form+ element with the class
16 # +login+ and an attribute +action+ with the value <tt>/login</tt>.
18 # === Matching Elements
20 # Use the #match method to determine if an element matches the selector.
22 # For simple selectors, the method returns an array with that element,
23 # or +nil+ if the element does not match. For complex selectors (see below)
24 # the method returns an array with all matched elements, or +nil+ if no
28 # if selector.match(element)
29 # puts "Element is a login form"
32 # === Selecting Elements
34 # Use the #select method to select all matching elements starting with
35 # one element and going through all children in depth-first order.
37 # This method returns an array of all matching elements, an empty array
38 # if no match is found
41 # selector = HTML::Selector.new "input[type=text]"
42 # matches = selector.select(element)
43 # matches.each do |match|
44 # puts "Found text field with name #{match.attributes['name']}"
49 # Selectors can match elements using any of the following criteria:
50 # * <tt>name</tt> -- Match an element based on its name (tag name).
51 # For example, <tt>p</tt> to match a paragraph. You can use <tt>*</tt>
52 # to match any element.
53 # * <tt>#</tt><tt>id</tt> -- Match an element based on its identifier (the
54 # <tt>id</tt> attribute). For example, <tt>#</tt><tt>page</tt>.
55 # * <tt>.class</tt> -- Match an element based on its class name, all
56 # class names if more than one specified.
57 # * <tt>[attr]</tt> -- Match an element that has the specified attribute.
58 # * <tt>[attr=value]</tt> -- Match an element that has the specified
59 # attribute and value. (More operators are supported; see below.)
60 # * <tt>:pseudo-class</tt> -- Match an element based on a pseudo class,
61 # such as <tt>:nth-child</tt> and <tt>:empty</tt>.
62 # * <tt>:not(expr)</tt> -- Match an element that does not match the
63 # negation expression.
65 # When using a combination of the above, the element name comes first
66 # followed by identifier, class names, attributes, pseudo classes and
67 # negation in any order. Do not separate these parts with spaces!
68 # Space separation is used for descendant selectors.
71 # selector = HTML::Selector.new "form.login[action=/login]"
72 # The matched element must be of type +form+ and have the class +login+.
73 # It may have other classes, but the class +login+ is required to match.
74 # It must also have an attribute called +action+ with the value
77 # This selector will match the following element:
78 # <form class="login form" method="post" action="/login">
79 # but will not match the element:
80 # <form method="post" action="/logout">
82 # === Attribute Values
84 # Several operators are supported for matching attributes:
85 # * <tt>name</tt> -- The element must have an attribute with that name.
86 # * <tt>name=value</tt> -- The element must have an attribute with that
88 # * <tt>name^=value</tt> -- The attribute value must start with the
90 # * <tt>name$=value</tt> -- The attribute value must end with the
92 # * <tt>name*=value</tt> -- The attribute value must contain the
94 # * <tt>name~=word</tt> -- The attribute value must contain the specified
95 # word (space separated).
96 # * <tt>name|=word</tt> -- The attribute value must start with specified
99 # For example, the following two selectors match the same element:
102 # and so do the following two selectors:
106 # === Alternatives, siblings, children
108 # Complex selectors use a combination of expressions to match elements:
109 # * <tt>expr1 expr2</tt> -- Match any element against the second expression
110 # if it has some parent element that matches the first expression.
111 # * <tt>expr1 > expr2</tt> -- Match any element against the second expression
112 # if it is the child of an element that matches the first expression.
113 # * <tt>expr1 + expr2</tt> -- Match any element against the second expression
114 # if it immediately follows an element that matches the first expression.
115 # * <tt>expr1 ~ expr2</tt> -- Match any element against the second expression
116 # that comes after an element that matches the first expression.
117 # * <tt>expr1, expr2</tt> -- Match any element against the first expression,
118 # or against the second expression.
120 # Since children and sibling selectors may match more than one element given
121 # the first element, the #match method may return more than one match.
125 # Pseudo classes were introduced in CSS 3. They are most often used to select
126 # elements in a given position:
127 # * <tt>:root</tt> -- Match the element only if it is the root element
128 # (no parent element).
129 # * <tt>:empty</tt> -- Match the element only if it has no child elements,
130 # and no text content.
131 # * <tt>:only-child</tt> -- Match the element if it is the only child (element)
132 # of its parent element.
133 # * <tt>:only-of-type</tt> -- Match the element if it is the only child (element)
134 # of its parent element and its type.
135 # * <tt>:first-child</tt> -- Match the element if it is the first child (element)
136 # of its parent element.
137 # * <tt>:first-of-type</tt> -- Match the element if it is the first child (element)
138 # of its parent element of its type.
139 # * <tt>:last-child</tt> -- Match the element if it is the last child (element)
140 # of its parent element.
141 # * <tt>:last-of-type</tt> -- Match the element if it is the last child (element)
142 # of its parent element of its type.
143 # * <tt>:nth-child(b)</tt> -- Match the element if it is the b-th child (element)
144 # of its parent element. The value <tt>b</tt> specifies its index, starting with 1.
145 # * <tt>:nth-child(an+b)</tt> -- Match the element if it is the b-th child (element)
146 # in each group of <tt>a</tt> child elements of its parent element.
147 # * <tt>:nth-child(-an+b)</tt> -- Match the element if it is the first child (element)
148 # in each group of <tt>a</tt> child elements, up to the first <tt>b</tt> child
149 # elements of its parent element.
150 # * <tt>:nth-child(odd)</tt> -- Match element in the odd position (i.e. first, third).
151 # Same as <tt>:nth-child(2n+1)</tt>.
152 # * <tt>:nth-child(even)</tt> -- Match element in the even position (i.e. second,
153 # fourth). Same as <tt>:nth-child(2n+2)</tt>.
154 # * <tt>:nth-of-type(..)</tt> -- As above, but only counts elements of its type.
155 # * <tt>:nth-last-child(..)</tt> -- As above, but counts from the last child.
156 # * <tt>:nth-last-of-type(..)</tt> -- As above, but counts from the last child and
157 # only elements of its type.
158 # * <tt>:not(selector)</tt> -- Match the element only if the element does not
159 # match the simple selector.
161 # As you can see, the <tt>:nth-child</tt> pseudo class and its variants can get quite
162 # tricky and the CSS specification doesn't do a much better job explaining it.
163 # But after reading the examples and trying a few combinations, it's easy to
167 # table tr:nth-child(odd)
168 # Selects every second row in the table starting with the first one.
171 # Selects the fourth paragraph in the +div+, but not if the +div+ contains
172 # other elements, since those are also counted.
174 # div p:nth-of-type(4)
175 # Selects the fourth paragraph in the +div+, counting only paragraphs, and
176 # ignoring all other elements.
178 # div p:nth-of-type(-n+4)
179 # Selects the first four paragraphs, ignoring all others.
181 # And you can always select an element that matches one set of rules but
182 # not another using <tt>:not</tt>. For example:
184 # Matches all paragraphs that do not have the class <tt>.post</tt>.
186 # === Substitution Values
188 # You can use substitution with identifiers, class names and element values.
189 # A substitution takes the form of a question mark (<tt>?</tt>) and uses the
190 # next value in the argument list following the CSS expression.
192 # The substitution value may be a string or a regular expression. All other
193 # values are converted to strings.
196 # selector = HTML::Selector.new "#?", /^\d+$/
197 # matches any element whose identifier consists of one or more digits.
199 # See http://www.w3.org/TR/css3-selectors/
203 # An invalid selector.
# StandardError subclass used to signal an invalid selector. Hidden from the
# generated documentation by the :nodoc: directive.
204 class InvalidSelectorError < StandardError #:nodoc:
211 # Selector.for_class(cls) => selector
213 # Creates a new selector for the given class name.
# Builds the selector from the class pattern ".?" with +cls+ as the
# substitution value for the "?" placeholder.
# NOTE(review): the expression and the value are passed together as ONE Array,
# while #initialize calls +strip+ on its first argument -- confirm that this
# call shape actually works (self.new(".?", cls) may have been intended).
215 self.new([".?", cls])
220 # Selector.for_id(id) => selector
222 # Creates a new selector for the given id.
231 # Selector.new(string, [values ...]) => selector
233 # Creates a new selector from a CSS 2 selector expression.
235 # The first argument is the selector expression. All other arguments
236 # are used for value substitution.
238 # Raises ArgumentError if the selector expression is empty or cannot be parsed, and InvalidSelectorError for an invalid attribute operation/value.
239 def initialize(selector, *values)
# Parses +selector+ from left to right: first a simple selector, then at most
# one combinator (",", "+", "~", ">" or whitespace) whose right-hand side is
# parsed recursively into a second Selector.
240 raise ArgumentError, "CSS expression cannot be empty" if selector.empty?
# A single Array argument is unwrapped and used as the list of substitution
# values, so values may be passed either splatted or as one array.
242 values = values[0] if values.size == 1 && values[0].is_a?(Array)
244 # We need a copy to determine if we failed to parse, and also
245 # preserve the original pass by-ref statement.
246 statement = selector.strip.dup
248 # Create a simple selector, along with negation.
# simple_selector consumes what it parsed from +statement+ and returns a hash;
# each key/value pair is stored in the instance variable named after the key.
249 simple_selector(statement, values).each { |name, value| instance_variable_set("@#{name}", value) }
254 # Alternative selector.
# sub! returns nil when nothing was replaced, so each branch below is taken
# only if its combinator was found (and removed) at the start of +statement+.
255 if statement.sub!(/^\s*,\s*/, "")
256 second = Selector.new(statement, values)
257 @alternates << second
258 # If there are alternate selectors, we group them in the top selector.
# The nested selector's own alternates are moved up into this selector and
# its list is reset to empty, keeping the group flat.
259 if alternates = second.instance_variable_get(:@alternates)
260 second.instance_variable_set(:@alternates, [])
261 @alternates.concat alternates
263 @source << " , " << second.to_s
264 # Sibling selector: create a dependency into second selector that will
265 # match element immediately following this one.
266 elsif statement.sub!(/^\s*\+\s*/, "")
267 second = next_selector(statement, values)
# @depends is a lambda called with (element, first); +first+ asks for only
# the first match.
268 @depends = lambda do |element, first|
269 if element = next_element(element)
270 second.match(element, first)
273 @source << " + " << second.to_s
274 # Adjacent selector: create a dependency into second selector that will
275 # match all elements following this one.
276 elsif statement.sub!(/^\s*~\s*/, "")
277 second = next_selector(statement, values)
278 @depends = lambda do |element, first|
# Walk the following elements one at a time via next_element.
280 while element = next_element(element)
281 if subset = second.match(element, first)
# When only the first match is wanted, keep just the first element of the
# subset; otherwise collect the whole subset.
282 if first && !subset.empty?
283 matches << subset.first
286 matches.concat subset
# An empty result is reported as nil rather than as an empty array.
290 matches.empty? ? nil : matches
292 @source << " ~ " << second.to_s
293 # Child selector: create a dependency into second selector that will
294 # match a child element of this one.
295 elsif statement.sub!(/^\s*>\s*/, "")
296 second = next_selector(statement, values)
297 @depends = lambda do |element, first|
299 element.children.each do |child|
# Only tag nodes are offered to the second selector.
300 if child.tag? && subset = second.match(child, first)
301 if first && !subset.empty?
302 matches << subset.first
305 matches.concat subset
309 matches.empty? ? nil : matches
311 @source << " > " << second.to_s
312 # Descendant selector: create a dependency into second selector that
313 # will match all descendant elements of this one. Note,
# Taken when whitespace followed by more text remains and the remaining
# statement differs from the original +selector+ argument.
314 elsif statement =~ /^\s+\S+/ && statement != selector
315 second = next_selector(statement, values)
316 @depends = lambda do |element, first|
# Explicit stack instead of recursion; children are pushed reversed so that
# pop yields them in their original order.
318 stack = element.children.reverse
319 while node = stack.pop
320 next unless node.tag?
321 if subset = second.match(node, first)
322 if first && !subset.empty?
323 matches << subset.first
326 matches.concat subset
# This elsif pairs with the match test above: a node's children are pushed
# only when the node itself produced no match.
328 elsif children = node.children
329 stack.concat children.reverse
332 matches.empty? ? nil : matches
334 @source << " " << second.to_s
336 # The last selector is where we check that we parsed
# Anything other than whitespace left in +statement+ means part of the
# expression was not understood; this is reported as an ArgumentError.
338 unless statement.empty? || statement.strip.empty?
339 raise ArgumentError, "Invalid selector: #{statement}"
346 # match(element, first?) => array or nil
348 # Matches an element against the selector.
350 # For a simple selector this method returns an array with the
351 # element if the element matches, nil otherwise.
353 # For a complex selector (sibling and descendant) this method
354 # returns an array with all matching elements, nil if no match is
357 # Use +first_only=true+ if you are only interested in the first element.
360 # if selector.match(element)
361 # puts "Element is a login form"
363 def match(element, first_only = false)
# Checks run in this order: tag name, attributes, pseudo classes, negation,
# then the dependent selector (@depends), and finally the alternates.
364 # Match if the selector has no tag name, or its tag name equals the element's name.
365 if matched = (!@tag_name || @tag_name == element.name)
366 # No match if one of the attribute matches failed
# Each entry is a [name, pattern] pair: attr[0] is the attribute name and
# attr[1] the pattern its value is tested against.
367 for attr in @attributes
368 if element.attributes[attr[0]] !~ attr[1]
375 # Pseudo class matches (nth-child, empty, etc).
# Each pseudo is a callable taking the element and returning true or false.
377 for pseudo in @pseudo
378 unless pseudo.call(element)
385 # Negation. Same rules as above, but we fail if a match is made.
# Each negation is a hash with :tag_name, :attributes and :pseudo keys.
386 if matched && @negation
387 for negation in @negation
388 if negation[:tag_name] == element.name
391 for attr in negation[:attributes]
392 if element.attributes[attr[0]] =~ attr[1]
399 for pseudo in negation[:pseudo]
400 if pseudo.call(element)
410 # If element matched but depends on another element (child,
411 # sibling, etc), apply the dependent matches instead.
412 if matched && @depends
# The dependency returns the elements matched by the rest of the expression.
413 matches = @depends.call(element, first_only)
# Without a dependency the result is the element itself, or nil on no match.
415 matches = matched ? [element] : nil
418 # If this selector is part of the group, try all the alternative
419 # selectors (unless first_only).
# Alternates are consulted when all matches are wanted, or when nothing has
# matched so far.
420 if !first_only || !matches
421 @alternates.each do |alternate|
# With first_only, stop as soon as there is a result.
422 break if matches && first_only
423 if subset = alternate.match(element, first_only)
425 matches.concat subset
438 # select(root) => array
440 # Selects and returns an array with all matching elements, beginning
441 # with one node and traversing through all children depth-first.
442 # Returns an empty array if no match is found.
444 # The root node may be any element in the document, or the document
448 # selector = HTML::Selector.new "input[type=text]"
449 # matches = selector.select(element)
450 # matches.each do |match|
451 # puts "Found text field with name #{match.attributes['name']}"
# Stack-based traversal. The lines that initialise +stack+ and +matches+ are
# not visible in this view.
456 while node = stack.pop
# Only tag nodes are tested; all matches are requested (first_only = false).
457 if node.tag? && subset = match(node, false)
458 subset.each do |match|
# Add each element once only, comparing by object identity (equal?).
459 matches << match unless matches.any? { |item| item.equal?(match) }
# Children are pushed only when the node did not yield a match; they are
# reversed so that pop visits them in their original order.
461 elsif children = node.children
462 stack.concat children.reverse
469 # Similar to #select but returns the first matching element. Returns +nil+
470 # if no element matches the selector.
471 def select_first(root)
# Stack-based traversal; the line that initialises +stack+ is not visible in
# this view.
473 while node = stack.pop
# Ask #match for the first match only (first_only = true).
474 if node.tag? && subset = match(node, true)
# Return as soon as a non-empty result is found.
475 return subset.first if !subset.empty?
# Children are pushed only when the node did not yield a match; they are
# reversed so that pop visits them in their original order.
476 elsif children = node.children
477 stack.concat children.reverse
489 # Return the next element after this one. Skips sibling text nodes.
491 # With the +name+ argument, returns the next element with that name,
492 # skipping other sibling elements.
493 def next_element(element, name = nil)
# Scans the children of the element's parent for the first tag node that
# comes after +element+, optionally restricted to tags called +name+.
494 if siblings = element.parent.children
496 siblings.each do |node|
# The element is located by object identity. Presumably +found+ is set here;
# that line is not visible in this view.
497 if node.equal?(element)
# After the element has been seen, only tag nodes are considered.
499 elsif found && node.tag?
500 return node if (name.nil? || node.name == name)
511 # Creates a simple selector given the statement and array of
512 # substitution values.
514 # Returns a hash with the values +tag_name+, +attributes+,
515 # +pseudo+ (classes) and +negation+.
517 # Called the first time with +can_negate+ true to allow
518 # negation. Called a second time with false since negation
520 def simple_selector(statement, values, can_negate = true)
# Each recognised part is removed from the front of +statement+ with sub!,
# so the caller is left with whatever was not parsed. "?" placeholders take
# their value from +values+ via shift, in order of appearance.
526 # Element name. (Note that in negation, this can come at
527 # any order, but for simplicity we allow if only first).
528 statement.sub!(/^(\*|[[:alpha:]][\w\-]*)/) do |match|
# "*" matches any element, so tag_name is only set for a concrete name
# (lower-cased).
530 tag_name = match.downcase unless match == "*"
535 # Get identifier, class, attribute name, pseudo or negation.
537 # Element identifier.
538 next if statement.sub!(/^#(\?|[\w\-]+)/) do |match|
# A non-Regexp id becomes an anchored pattern over its escaped string form.
544 id = Regexp.new("^#{Regexp.escape(id.to_s)}$") unless id.is_a?(Regexp)
545 attributes << ["id", id]
550 next if statement.sub!(/^\.([\w\-]+)/) do |match|
552 @source << ".#{class_name}"
# NOTE(review): inside a double-quoted Ruby string "\s" is the escape for a
# single space character, so this pattern tests for a literal space rather
# than the regexp whitespace class -- confirm this is intended.
553 class_name = Regexp.new("(^|\s)#{Regexp.escape(class_name)}($|\s)") unless class_name.is_a?(Regexp)
554 attributes << ["class", class_name]
# Attribute: [name], or [name<op>value] with an optional ~ | ^ $ * before "=".
# NOTE(review): the double-quoted alternative is written "[^*]" (one quoted
# character other than "*"), whereas the :content pattern below uses
# "[^"]*" -- this looks like a typo. Other quoted values can still be
# captured by the final [^\]]* alternative.
559 next if statement.sub!(/^\[\s*([[:alpha:]][\w\-]*)\s*((?:[~|^$*])?=)?\s*('[^']*'|"[^*]"|[^\]]*)\s*\]/) do |match|
560 name, equality, value = $1, $2, $3
564 # Handle single and double quotes.
566 if (value[0] == ?" || value[0] == ?') && value[0] == value[-1]
570 @source << "[#{name}#{equality}'#{value}']"
# The attribute name is lower-cased and stripped; the value is turned into a
# pattern by attribute_match according to the operator.
571 attributes << [name.downcase.strip, attribute_match(equality, value)]
576 next if statement.sub!(/^:root/) do |match|
577 pseudo << lambda do |element|
# Treated as root when there is no parent, or the parent is not a tag.
578 element.parent.nil? || !element.parent.tag?
584 # Nth-child including last and of-type.
585 next if statement.sub!(/^:nth-(last-)?(child|of-type)\((odd|even|(\d+|\?)|(-?\d*|\?)?n([+\-]\d+|\?)?)\)/) do |match|
586 reverse = $1 == "last-"
587 of_type = $2 == "of-type"
588 @source << ":nth-#{$1}#{$2}("
# a = 2, b = 1; presumably the "odd" case, whose +when+ line is not visible.
591 pseudo << nth_child(2, 1, of_type, reverse)
# a = 2, b = 2; presumably the "even" case, whose +when+ line is not visible.
594 pseudo << nth_child(2, 2, of_type, reverse)
596 when /^(\d+|\?)$/ # b only
597 b = ($1 == "?" ? values.shift : $1).to_i
# With a = 0 only the single index b is selected.
598 pseudo << nth_child(0, b, of_type, reverse)
# General an+b form: an empty multiplier means 1 and a bare "-" means -1.
600 when /^(-?\d*|\?)?n([+\-]\d+|\?)?$/
601 a = ($1 == "?" ? values.shift :
602 $1 == "" ? 1 : $1 == "-" ? -1 : $1).to_i
603 b = ($2 == "?" ? values.shift : $2).to_i
604 pseudo << nth_child(a, b, of_type, reverse)
605 @source << (b >= 0 ? "#{a}n+#{b})" : "#{a}n#{b})")
607 raise ArgumentError, "Invalid nth-child #{match}"
611 # First/last child (of type).
612 next if statement.sub!(/^:(first|last)-(child|of-type)/) do |match|
613 reverse = $1 == "last"
614 of_type = $2 == "of-type"
# first/last is expressed as nth_child with a = 0, b = 1, counted from the
# end when +reverse+ is true.
615 pseudo << nth_child(0, 1, of_type, reverse)
616 @source << ":#{$1}-#{$2}"
619 # Only child (of type).
620 next if statement.sub!(/^:only-(child|of-type)/) do |match|
621 of_type = $1 == "of-type"
622 pseudo << only_child(of_type)
623 @source << ":only-#{$1}"
627 # Empty: no child elements or meaningful content (whitespaces
629 next if statement.sub!(/^:empty/) do |match|
630 pseudo << lambda do |element|
632 for child in element.children
# A tag child, or a child whose content is not blank after strip, is tested
# for here.
633 if child.tag? || !child.content.strip.empty?
643 # Content: match the text content of the element, stripping
644 # leading and trailing spaces.
645 next if statement.sub!(/^:content\(\s*(\?|'[^']*'|"[^"]*"|[^)]*)\s*\)/) do |match|
648 content = values.shift
# Remove a matching pair of surrounding quotes.
649 elsif (content[0] == ?" || content[0] == ?') && content[0] == content[-1]
650 content = content[1..-2]
652 @source << ":content('#{content}')"
# A non-Regexp value becomes an anchored pattern over its escaped string form.
653 content = Regexp.new("^#{Regexp.escape(content.to_s)}$") unless content.is_a?(Regexp)
654 pseudo << lambda do |element|
656 for child in element.children
658 text << child.content
# The collected text is stripped before being tested against the pattern.
661 text.strip =~ content
666 # Negation. Create another simple selector to handle it.
667 if statement.sub!(/^:not\(\s*/, "")
# NOTE(review): the wording of this message looks garbled; it is raised when
# :not( appears inside another negation (can_negate is false).
668 raise ArgumentError, "Double negatives are not missing feature" unless can_negate
# The inner selector is parsed with can_negate = false.
670 negation << simple_selector(statement, values, false)
671 raise ArgumentError, "Negation not closed" unless statement.sub!(/^\s*\)/, "")
676 # No match: moving on.
680 # Return hash. The keys are mapped to instance variables.
681 {:tag_name=>tag_name, :attributes=>attributes, :pseudo=>pseudo, :negation=>negation}
685 # Create a regular expression to match an attribute value based
686 # on the equality operator (=, ^=, |=, etc).
687 def attribute_match(equality, value)
# A Regexp value is used as given; any other value is converted to a string
# and escaped so that it is matched literally.
# NOTE(review): when +value+ is a Regexp, the interpolations below embed its
# to_s form inside a new pattern -- confirm this is the intended behaviour.
688 regexp = value.is_a?(Regexp) ? value : Regexp.escape(value.to_s)
# The +case+/+when+ lines that select a branch by +equality+ are not visible
# in this view; the comments below describe each branch.
691 # Match the attribute value in full
692 Regexp.new("^#{regexp}$")
694 # Match a space-separated word within the attribute value
# NOTE(review): inside a double-quoted Ruby string "\s" is the escape for a
# single space character, not the regexp whitespace class -- confirm intended.
695 Regexp.new("(^|\s)#{regexp}($|\s)")
697 # Match the beginning of the attribute value
698 Regexp.new("^#{regexp}")
700 # Match the end of the attribute value
701 Regexp.new("#{regexp}$")
703 # Match substring of the attribute value
704 regexp.is_a?(Regexp) ? regexp : Regexp.new(regexp)
706 # Match the first space-separated item of the attribute value
707 Regexp.new("^#{regexp}($|\s)")
# Reaching this point with a non-empty value is an error.
709 raise InvalidSelectorError, "Invalid operation/value" unless value.empty?
710 # Match all attributes values (existence check)
716 # Returns a lambda that can match an element against the nth-child
717 # pseudo class, given the following arguments:
718 # * +a+ -- Value of a part.
719 # * +b+ -- Value of b part.
720 # * +of_type+ -- True to test only elements of this type (of-type).
721 # * +reverse+ -- True to count in reverse order (last-).
722 def nth_child(a, b, of_type, reverse)
# The two guards below return a lambda that always answers false, for (a, b)
# combinations that can never select anything.
723 # a = 0 means select at index b, if b = 0 nothing selected
724 return lambda { |element| false } if a == 0 && b == 0
725 # a < 0 and b < 0 will never match against an index
726 return lambda { |element| false } if a < 0 && b < 0
# Normalise b: a negative b is first mapped to a + b + 1, then any non-zero
# b is shifted down by one to become zero-based.
727 b = a + b + 1 if b < 0 # b < 0 just picks last element from each group
728 b -= 1 unless b == 0 # b == 0 is same as b == 1, otherwise zero based
# The remaining lines presumably form the body of the returned lambda; its
# opening line is not visible in this view.
730 # Element must be inside parent element.
731 return false unless element.parent && element.parent.tag?
733 # Get siblings, reverse if counting from last.
734 siblings = element.parent.children
735 siblings = siblings.reverse if reverse
736 # Match element name if of-type, otherwise ignore name.
737 name = of_type ? element.name : nil
739 for child in siblings
740 # Skip text nodes/comments.
741 if child.tag? && (name == nil || child.name == name)
743 # Shortcut when a == 0 no need to go past count
# The element is located by object identity (equal?).
745 found = child.equal?(element)
749 # Only look for first b elements
751 if child.equal?(element)
752 found = (index % a) == 0
756 # Otherwise, break if child found and count == an+b
757 if child.equal?(element)
758 found = (index % a) == b
770 # Creates a only child lambda. Pass +of-type+ to only look at
771 # elements of its type.
772 def only_child(of_type)
# The remaining lines presumably form the body of the returned lambda; its
# opening line is not visible in this view.
774 # Element must be inside parent element.
775 return false unless element.parent && element.parent.tag?
# With of_type, only siblings that share the element's name are considered.
776 name = of_type ? element.name : nil
778 for child in element.parent.children
779 # Skip text nodes/comments.
780 if child.tag? && (name == nil || child.name == name)
# Tests for a qualifying sibling that is not the element itself (compared by
# object identity).
781 unless child.equal?(element)
792 # Called to create a dependent selector (sibling, descendant, etc).
793 # Passes the remainder of the statement that will be reduced to zero
794 # eventually, and array of substitution values.
796 # This method is called from four places, so it helps to put it here
797 # for reuse. The only logic deals with the need to detect comma
798 # separators (alternate) and apply them to the selector group of the
800 def next_selector(statement, values)
# Parse the remainder of the statement as a new Selector.
801 second = Selector.new(statement, values)
802 # If there are alternate selectors, we group them in the top selector.
# The nested selector's alternates are moved into this selector's @alternates
# and its own list is reset to empty.
803 if alternates = second.instance_variable_get(:@alternates)
804 second.instance_variable_set(:@alternates, [])
805 @alternates.concat alternates
813 # See HTML::Selector.new
814 def self.selector(statement, *values)
# Forwards the expression and the substitution values unchanged to
# Selector.new.
815 Selector.new(statement, *values)
821 def select(selector, *values)
# Builds an HTML::Selector from the expression and runs it with this object
# as the root. The substitution values are passed as one Array.
822 selector = HTML::Selector.new(selector, values)
823 selector.select(self)