2 # vim:set ts=2 sw=2 expandtab:
4 # xmlformat - configurable XML file formatter/pretty-printer
6 # Copyright (c) 2004, 2005 Kitebird, LLC. All rights reserved.
7 # Some portions are based on the REX shallow XML parser, which
8 # is Copyright (c) 1998, Robert D. Cameron. These include the
9 # regular expression parsing variables and the shallow_parse()
11 # This software is licensed as described in the file LICENSE,
12 # which you should have received as part of this distribution.
14 # Syntax: xmlformat [config-file] xml-file
16 # Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that
23 # The input document first is parsed into a list of strings. Each string
24 # represents one of the following:
26 # - processing instruction (the XML declaration is treated as a PI)
29 # - DOCTYPE declaration
30 # - element tag (either <abc>, </abc>, or <abc/>), *including attributes*
32 # Entities are left untouched. They appear in their original form as part
33 # of the text node in which they occur.
35 # The list of strings then is converted to a hierarchical structure.
36 # The document top level is represented by a reference to a list.
37 # Each list element is a reference to a node -- a hash that has "type"
38 # and "content" key/value pairs. The "type" key indicates the node
39 # type and has one of the following values:
42 # "pi" - processing instruction node
43 # "comment" - comment node
44 # "CDATA" - CDATA section node
45 # "DOCTYPE" - DOCTYPE node
46 # "elt" - element node
48 # (For purposes of this program, it's really only necessary to have "text",
49 # "elt", and "other". The types other than "text" and "elt" currently are
50 # all treated the same way.)
52 # For all but element nodes, the "content" value is the text of the node.
54 # For element nodes, the "content" hash is a reference to a list of
55 # nodes for the element's children. In addition, an element node has
56 # three additional key/value pairs:
57 # - The "name" value is the tag name within the opening tag, minus angle
58 # brackets or attributes.
59 # - The "open_tag" value is the full opening tag, which may also be the
61 # - The "close_tag" value depends on the opening tag. If the open tag is
62 # "<abc>", the close tag is "</abc>". If the open tag is "<abc/>", the
63 # close tag is the empty string.
65 # If the tree structure is converted back into a string with
66 # tree_stringify(), the result can be compared to the input file
67 # as a regression test. The string should be identical to the original
73 $Getopt::Long
::ignorecase
= 0; # options are case sensitive
74 $Getopt::Long
::bundling
= 1; # allow short options to be bundled
76 my $PROG_NAME = "xmlformat";
77 my $PROG_VERSION = "1.04";
78 my $PROG_LANG = "Perl";
80 # ----------------------------------------------------------------------
86 # ----------------------------------------------------------------------
88 # Regular expressions for parsing document components. Based on REX.
90 # SPE = shallow parsing expression
91 # SE = scanning expression
92 # CE = completion expression
93 # RSB = right square brackets
97 my $UntilHyphen = "[^-]*-";
98 my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
99 my $CommentCE = "$Until2Hyphens>?";
100 my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+";
101 my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
102 my $S = "[ \\n\\t\\r]+";
103 my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
104 my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
105 my $Name = "(?:$NameStrt)(?:$NameChar)*";
106 my $QuoteSE = "\"[^\"]*\"|'[^']*'";
107 my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
108 my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
109 my $S1 = "[\\n\\r\\t ]";
110 my $UntilQMs = "[^?]*\\?+";
111 my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
113 "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
114 my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?";
116 "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
117 my $PI_CE = "$Name(?:$PI_Tail)?";
118 my $EndTagCE = "$Name(?:$S)?>?";
119 my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
120 my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
122 "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
123 my $XML_SPE = "$TextSE|$MarkupSPE";
125 # ----------------------------------------------------------------------
127 # Allowable options and their possible values:
128 # - The keys of this hash are the allowable option names
129 # - The value for each key is list of allowable option values
130 # - If the value is undef, the option value must be numeric
131 # If any new formatting option is added to this program, it
132 # must be specified here, *and* a default value for it should
133 # be listed in the *DOCUMENT and *DEFAULT pseudo-element
137 "format" => [ "block", "inline", "verbatim" ],
138 "normalize" => [ "yes", "no" ],
139 "subindent" => undef,
140 "wrap-length" => undef,
141 "entry-break" => undef,
142 "exit-break" => undef,
143 "element-break" => undef
146 # Object creation: set up the default formatting configuration
147 # and variables for maintaining input and output document.
155 # Formatting options for each element.
157 $self->{elt_opts
} = { };
159 # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can
160 # be overridden in the configuration file, but the options must also be
161 # built in to make sure they exist if not specified in the configuration
162 # file. Each of the structures must have a value for every option.
164 # Options for top-level document children.
165 # - Do not change entry-break: 0 ensures no extra newlines before
166 # first element of output.
167 # - Do not change exit-break: 1 ensures a newline after final element
168 # of output document.
169 # - It's probably best not to change any of the others, except perhaps
170 # if you want to increase the element-break.
172 $self->{elt_opts
}->{"*DOCUMENT"} = {
177 "entry-break" => 0, # do not change
178 "exit-break" => 1, # do not change
182 # Default options. These are used for any elements in the document
183 # that are not specified explicitly in the configuration file.
185 $self->{elt_opts
}->{"*DEFAULT"} = {
195 # Run the *DOCUMENT and *DEFAULT options through the option-checker
196 # to verify that the built-in values are legal.
200 foreach my $elt_name (keys (%{$self->{elt_opts
}})) # ... for each element
202 # Check each option for element
203 while (my ($opt_name, $opt_val) = each (%{$self->{elt_opts
}->{$elt_name}}))
207 ($opt_val, $err_msg) = check_option
($opt_name, $opt_val);
208 if (!defined ($err_msg))
210 $self->{elt_opts
}->{$elt_name}->{$opt_name} = $opt_val;
214 warn "LOGIC ERROR: $elt_name default option is invalid\n";
221 # Make sure that the every option is represented in the
222 # *DOCUMENT and *DEFAULT structures.
224 foreach my $opt_name (keys (%opt_list))
226 foreach my $elt_name (keys (%{$self->{elt_opts
}}))
228 if (!exists ($self->{elt_opts
}->{$elt_name}->{$opt_name}))
230 warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n";
236 die "Cannot continue; internal default formatting options must be fixed\n"
239 bless $self, $type; # bless object and return it
242 # Initialize the variables that are used per-document
248 # Elements that are used in the document but not named explicitly
249 # in the configuration file.
251 $self->{unconf_elts
} = { };
253 # List of tokens for current document.
255 $self->{tokens
} = [ ];
257 # List of line numbers for each token
259 $self->{line_num
} = [ ];
261 # Document node tree (constructed from the token list).
265 # Variables for formatting operations:
266 # out_doc = resulting output document (constructed from document tree)
267 # pending = array of pending tokens being held until flushed
269 $self->{out_doc
} = "";
270 $self->{pending
} = [ ];
272 # Inline elements within block elements are processed using the
273 # text normalization (and possible line-wrapping) values of their
274 # enclosing block. Blocks and inlines may be nested, so we maintain
275 # a stack that allows the normalize/wrap-length values of the current
276 # block to be determined.
278 $self->{block_name_stack
} = [ ]; # for debugging
279 $self->{block_opts_stack
} = [ ];
281 # A similar stack for maintaining each block's current break type.
283 $self->{block_break_type_stack
} = [ ];
286 # Accessors for token list and resulting output document
292 return $self->{tokens
};
299 return $self->{out_doc
};
303 # Methods for adding strings to output document or
304 # to the pending output array
308 my ($self, $str) = @_;
310 $self->{out_doc
} .= $str;
315 my ($self, $str) = @_;
317 push (@
{$self->{pending
}}, $str);
321 # Block stack mainenance methods
323 # Push options onto or pop options off from the stack. When doing
324 # this, also push or pop an element onto the break-level stack.
328 my ($self, $name, $opts) = @_;
330 push (@
{$self->{block_name_stack
}}, $name);
331 push (@
{$self->{block_opts_stack
}}, $opts);
332 push (@
{$self->{block_break_type_stack
}}, "entry-break");
339 pop (@
{$self->{block_name_stack
}});
340 pop (@
{$self->{block_opts_stack
}});
341 pop (@
{$self->{block_break_type_stack
}});
344 # Return the current block's normalization status or wrap length
350 my $size = @
{$self->{block_opts_stack
}};
351 my $opts = $self->{block_opts_stack
}->[$size-1];
352 return $opts->{normalize
} eq "yes";
355 sub block_wrap_length
359 my $size = @
{$self->{block_opts_stack
}};
360 my $opts = $self->{block_opts_stack
}->[$size-1];
361 return $opts->{"wrap-length"};
364 # Set the current block's break type, or return the number of newlines
365 # for the block's break type
367 sub set_block_break_type
369 my ($self, $type) = @_;
371 my $size = @
{$self->{block_break_type_stack
}};
372 $self->{block_break_type_stack
}->[$size-1] = $type;
375 sub block_break_value
379 my $size = @
{$self->{block_opts_stack
}};
380 my $opts = $self->{block_opts_stack
}->[$size-1];
381 $size = @
{$self->{block_break_type_stack
}};
382 my $type = $self->{block_break_type_stack
}->[$size-1];
383 return $opts->{$type};
387 # ----------------------------------------------------------------------
389 # Read configuration information. For each element, construct a hash
390 # containing a hash key and value for each option name and value.
391 # After reading the file, fill in missing option values for
392 # incomplete option structures using the *DEFAULT options.
397 my $conf_file = shift;
400 my $in_continuation = 0;
403 open (FH
, $conf_file) or die "Cannot read config file $conf_file: $!\n";
408 next if /^\s*($|#)/; # skip blank lines, comments
409 if ($in_continuation)
411 $_ = $saved_line . " " . $_;
413 $in_continuation = 0;
417 # Line doesn't begin with whitespace, so it lists element names.
418 # Names are separated by whitespace or commas, possibly followed
419 # by a continuation character or a comment.
422 s/\\$//; # remove continuation character
424 $in_continuation = 1;
427 s/\s*#.*$//; # remove any trailing comment
428 @elt_names = split (/[\s,]+/, $_);
429 # make sure each name has an entry in the elt_opts structure
430 foreach my $elt_name (@elt_names)
432 $self->{elt_opts
}->{$elt_name} = { }
433 unless exists ($self->{elt_opts
}->{$elt_name});
438 # Line begins with whitespace, so it contains an option
439 # to apply to the current element list, possibly followed by
440 # a comment. First check that there is a current list.
441 # Then parse the option name/value.
443 die "$conf_file:$.: Option setting found before any "
444 . "elements were named.\n"
447 my ($opt_name, $opt_val) = /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/;
448 die "$conf_file:$.: Malformed line: $_\n" unless defined ($opt_val);
450 # Check option. If illegal, die with message. Otherwise,
451 # add option to each element in current element list
453 ($opt_val, $err_msg) = check_option
($opt_name, $opt_val);
454 die "$conf_file:$.: $err_msg\n" if defined ($err_msg);
455 foreach my $elt_name (@elt_names)
457 $self->{elt_opts
}->{$elt_name}->{$opt_name} = $opt_val;
463 # For any element that has missing option values, fill in the values
464 # using the options for the *DEFAULT pseudo-element. This speeds up
465 # element option lookups later. It also makes it unnecessary to test
466 # each option to see if it's defined: All element option structures
467 # will have every option defined.
469 my $def_opts = $self->{elt_opts
}->{"*DEFAULT"};
471 foreach my $elt_name (keys (%{$self->{elt_opts
}}))
473 next if $elt_name eq "*DEFAULT";
474 foreach my $opt_name (keys (%{$def_opts}))
476 next if exists ($self->{elt_opts
}->{$elt_name}->{$opt_name}); # already set
477 $self->{elt_opts
}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name};
483 # Check option name to make sure it's legal. Check the value to make sure
484 # that it's legal for the name. Return a two-element array:
485 # (value, undef) if the option name and value are legal.
486 # (undef, message) if an error was found; message contains error message.
487 # For legal values, the returned value should be assigned to the option,
488 # because it may get type-converted here.
492 my ($opt_name, $opt_val) = @_;
494 # - Check option name to make sure it's a legal option
495 # - Then check the value. If there is a list of values
496 # the value must be one of them. Otherwise, the value
497 # must be an integer.
499 return (undef, "Unknown option name: $opt_name")
500 unless exists ($opt_list{$opt_name});
501 my $allowable_val = $opt_list{$opt_name};
502 if (defined ($allowable_val))
504 return (undef, "Unknown '$opt_name' value: $opt_val")
505 unless grep (/^$opt_val$/, @
{$allowable_val});
507 else # other options should be numeric
509 # "$opt_val" converts $opt_val to string for pattern match
510 return (undef, "'$opt_name' value ($opt_val) should be an integer")
511 unless "$opt_val" =~ /^\d+$/;
513 return ($opt_val, undef);
517 # Return hash of option values for a given element. If no options are found:
518 # - Add the element name to the list of unconfigured options.
519 # - Assign the default options to the element. (This way the test for the
520 # option fails only once.)
525 my $elt_name = shift;
527 my $opts = $self->{elt_opts
}->{$elt_name};
528 if (!defined ($opts))
530 $self->{unconf_elts
}->{$elt_name} = 1;
531 $opts = $self->{elt_opts
}->{$elt_name} = $self->{elt_opts
}->{"*DEFAULT"};
537 # Display contents of configuration options to be used to process document.
538 # For each element named in the elt_opts structure, display its format
539 # type, and those options that apply to the type.
544 # Format types and the additional options that apply to each type
558 foreach my $elt_name (sort (keys (%{$self->{elt_opts
}})))
561 my %opts = %{$self->{elt_opts
}->{$elt_name}};
562 my $format = $opts{format
};
563 # Write out format type, then options that apply to the format type
564 print " format = $format\n";
565 foreach my $opt_name (@
{$format_opts->{$format}})
567 print " $opt_name = $opts{$opt_name}\n";
574 # Display the list of elements that are used in the document but not
575 # configured in the configuration file.
577 # Then re-unconfigure the elements so that they won't be considered
578 # as configured for the next document, if there is one.
580 sub display_unconfigured_elements
584 my @elts = keys (%{$self->{unconf_elts
}});
587 print "The document contains no unconfigured elements.\n";
591 print "The following document elements were assigned no formatting options:\n";
592 foreach my $line ($self->line_wrap ([ join (" ", sort (@elts)) ], 0, 0, 65))
598 foreach my $elt_name (@elts)
600 delete ($self->{elt_opts
}->{$elt_name});
604 # ----------------------------------------------------------------------
606 # Main document processing routine.
607 # - Argument is a string representing an input document
608 # - Return value is the reformatted document, or undef. An undef return
609 # signifies either that an error occurred, or that some option was
610 # given that suppresses document output. In either case, don't write
611 # any output for the document. Any error messages will already have
612 # been printed when this returns.
617 my ($doc, $verbose, $check_parser, $canonize_only, $show_unconf_elts) = @_;
620 $self->init_doc_vars ();
622 # Perform lexical parse to split document into list of tokens
623 warn "Parsing document...\n" if $verbose;
624 $self->shallow_parse ($doc);
628 warn "Checking parser...\n" if $verbose;
629 # concatentation of tokens should be identical to original document
630 if ($doc eq join ("", @
{$self->tokens ()}))
632 print "Parser is okay\n";
636 print "PARSER ERROR: document token concatenation differs from document\n";
641 # Assign input line number to each token
642 $self->assign_line_numbers ();
644 # Look for and report any error tokens returned by parser
645 warn "Checking document for errors...\n" if $verbose;
646 if ($self->report_errors () > 0)
648 warn "Cannot continue processing document.\n";
652 # Convert the token list to a tree structure
653 warn "Converting document tokens to tree...\n" if $verbose;
654 if ($self->tokens_to_tree () > 0)
656 warn "Cannot continue processing document.\n";
660 # Check: Stringify the tree to convert it back to a single string,
661 # then compare to original document string (should be identical)
662 # (This is an integrity check on the validity of the to-tree and stringify
663 # operations; if one or both do not work properly, a mismatch should occur.)
664 #$str = $self->tree_stringify ();
666 #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $str;
668 # Canonize tree to remove extraneous whitespace
669 warn "Canonizing document tree...\n" if $verbose;
670 $self->tree_canonize ();
674 print $self->tree_stringify () . "\n";
678 # One side-effect of canonizing the tree is that the formatting
679 # options are looked up for each element in the document. That
680 # causes the list of elements that have no explicit configuration
681 # to be built. Display the list and return if user requested it.
683 if ($show_unconf_elts)
685 $self->display_unconfigured_elements ();
689 # Format the tree to produce formatted XML as a single string
690 warn "Formatting document tree...\n" if $verbose;
691 $self->tree_format ();
693 # If the document is not empty, add a newline and emit a warning if
694 # reformatting failed to add a trailing newline. This shouldn't
695 # happen if the *DOCUMENT options are set up with exit-break = 1,
696 # which is the reason for the warning rather than just silently
697 # adding the newline.
699 $str = $self->out_doc ();
700 if ($str ne "" && $str !~ /\n$/)
702 warn "LOGIC ERROR: trailing newline had to be added\n";
709 # ----------------------------------------------------------------------
711 # Parse XML document into array of tokens and store array
715 my ($self, $xml_document) = @_;
717 $self->{tokens
} = [ $xml_document =~ /$XML_SPE/g ];
720 # ----------------------------------------------------------------------
722 # Extract a tag name from a tag and return it.
724 # Dies if the tag cannot be found, because this is supposed to be
725 # called only with a legal tag.
731 die "Cannot find tag name in tag: $tag\n" unless $tag =~ /^<\/?
($Name)/;
735 # ----------------------------------------------------------------------
737 # Assign an input line number to each token. The number indicates
738 # the line number on which the token begins.
740 sub assign_line_numbers
745 $self->{line_num
} = [ ];
746 for (my $i = 0; $i < @
{$self->{tokens
}}; $i++)
748 my $token = $self->{tokens
}->[$i];
749 push (@
{$self->{line_num
}}, $line_num);
750 # count newlines and increment line counter (tr returns no. of matches)
751 $line_num += ($token =~ tr/\n/\n/);
755 # ----------------------------------------------------------------------
757 # Check token list for errors and report any that are found. Error
758 # tokens are those that begin with "<" but do not end with ">".
760 # Returns the error count.
762 # Does not modify the original token list.
769 for (my $i = 0; $i < @
{$self->{tokens
}}; $i++)
771 my $token = $self->{tokens
}->[$i];
772 if ($token =~ /^</ && $token !~ />$/)
774 my $line_num = $self->{line_num
}->[$i];
775 warn "Malformed token at line $line_num, token " . ($i+1) . ": $token\n";
779 warn "Number of errors found: $err_count\n" if $err_count > 0;
783 # ----------------------------------------------------------------------
785 # Helper routine to print tag stack for tokens_to_tree
789 my ($label, @stack) = @_;
792 warn " $label: none\n";
797 for (my $i = 0; $i < @stack; $i++)
799 warn " ", ($i+1), ": ", $stack[$i], "\n";
804 # Convert the list of XML document tokens to a tree representation.
805 # The implementation uses a loop and a stack rather than recursion.
807 # Does not modify the original token list.
809 # Returns an error count.
815 my @tag_stack = (); # stack for element tags
816 my @children_stack = (); # stack for lists of children
817 my $children = [ ]; # current list of children
820 for (my $i = 0; $i < @
{$self->{tokens
}}; $i++)
822 my $token = $self->{tokens
}->[$i];
823 my $line_num = $self->{line_num
}->[$i];
824 my $tok_err = "Error near line $line_num, token " . ($i+1) . " ($token)";
825 if ($token !~ /^</) # text
827 push (@
{$children}, text_node
($token));
829 elsif ($token =~ /^<!--/) # comment
831 push (@
{$children}, comment_node
($token));
833 elsif ($token =~ /^<\?/) # processing instruction
835 push (@
{$children}, pi_node
($token));
837 elsif ($token =~ /^<!DOCTYPE/) # DOCTYPE
839 push (@
{$children}, doctype_node
($token));
841 elsif ($token =~ /^<!\[/) # CDATA
843 push (@
{$children}, cdata_node
($token));
845 elsif ($token =~ /^<\//) # element close tag
849 warn "$tok_err: Close tag w/o preceding open tag; malformed document?\n";
853 if (!@children_stack)
855 warn "$tok_err: Empty children stack; malformed document?\n";
859 my $tag = pop (@tag_stack);
860 my $open_tag_name = extract_tag_name
($tag);
861 my $close_tag_name = extract_tag_name
($token);
862 if ($open_tag_name ne $close_tag_name)
864 warn "$tok_err: Tag mismatch; malformed document?\n";
865 warn " open tag: $tag\n";
866 warn " close tag: $token\n";
867 print_tag_stack
("enclosing tags", @tag_stack);
871 my $elt = element_node
($tag, $token, $children);
872 $children = pop (@children_stack);
873 push (@
{$children}, $elt);
875 else # element open tag
877 # If we reach here, we're seeing the open tag for an element:
878 # - If the tag is also the close tag (e.g., <abc/>), close the
879 # element immediately, giving it an empty child list.
880 # - Otherwise, push tag and child list on stacks, begin new child
881 # list for element body.
882 if ($token =~ /\/>$/) # tag is of form <abc/>
884 push (@
{$children}, element_node
($token, "", [ ]));
886 else # tag is of form <abc>
888 push (@tag_stack, $token);
889 push (@children_stack, $children);
895 # At this point, the stacks should be empty if the document is
900 warn "Error at EOF: Unclosed tags; malformed document?\n";
901 print_tag_stack
("unclosed tags", @tag_stack);
906 warn "Error at EOF: Unprocessed child elements; malformed document?\n";
907 # TODO: print out info about them
911 $self->{tree
} = $children;
916 # Node-generating helper methods for tokens_to_tree
918 # Generic node generator
920 sub node
{ return { "type" => $_[0], "content" => $_[1] }; }
922 # Generators for specific non-element nodes
924 sub text_node
{ return node
("text", $_[0]); }
925 sub comment_node
{ return node
("comment", $_[0]); }
926 sub pi_node
{ return node
("pi", $_[0]); }
927 sub doctype_node
{ return node
("DOCTYPE", $_[0]); }
928 sub cdata_node
{ return node
("CDATA", $_[0]); }
930 # For an element node, create a standard node with the type and content
931 # key/value pairs. Then add pairs for the "name", "open_tag", and
932 # "close_tag" hash keys.
936 my ($open_tag, $close_tag, $children) = @_;
938 my $elt = node
("elt", $children);
939 # name is the open tag with angle brackets and attibutes stripped
940 $elt->{name
} = extract_tag_name
($open_tag);
941 $elt->{open_tag
} = $open_tag;
942 $elt->{close_tag
} = $close_tag;
946 # ----------------------------------------------------------------------
948 # Convert the given XML document tree (or subtree) to string form by
949 # concatentating all of its components. Argument is a reference
950 # to a list of nodes at a given level of the tree.
952 # Does not modify the node list.
957 my $children = shift || $self->{tree
}; # use entire tree if no arg;
960 for (my $i = 0; $i < @
{$children}; $i++)
962 my $child = $children->[$i];
964 # - Elements have list of child nodes as content (process recursively)
965 # - All other node types have text content
967 if ($child->{type
} eq "elt")
969 $str .= $child->{open_tag
}
970 . $self->tree_stringify ($child->{content
})
971 . $child->{close_tag
};
975 $str .= $child->{content
};
981 # ----------------------------------------------------------------------
984 # Put tree in "canonical" form by eliminating extraneous whitespace
985 # from element text content.
987 # $children is a list of child nodes
989 # This function modifies the node list.
991 # Canonizing occurs as follows:
992 # - Comment, PI, DOCTYPE, and CDATA nodes remain untouched
993 # - Verbatim elements and their descendants remain untouched
994 # - Within non-normalized block elements:
995 # - Delete all-whitespace text node children
996 # - Leave other text node children untouched
997 # - Within normalized block elements:
998 # - Convert runs of whitespace (including line-endings) to single spaces
999 # - Trim leading whitespace of first text node
1000 # - Trim trailing whitespace of last text node
1001 # - Trim whitespace that is adjacent to a verbatim or non-normalized
1002 # sub-element. (For example, if a <programlisting> is followed by
1003 # more text, delete any whitespace at beginning of that text.)
1004 # - Within inline elements:
1005 # - Normalize the same way as the enclosing block element, with the
1006 # exception that a space at the beginning or end is not removed.
1007 # (Otherwise, <para>three<literal> blind </literal>mice</para>
1008 # would become <para>three<literal>blind</literal>mice</para>.)
1014 $self->{tree
} = $self->tree_canonize2 ($self->{tree
}, "*DOCUMENT");
1021 my $children = shift;
1022 my $par_name = shift;
1024 # Formatting options for parent
1025 my $par_opts = $self->get_opts ($par_name);
1027 # If parent is a block element, remember its formatting options on
1028 # the block stack so they can be used to control canonization of
1029 # inline child elements.
1031 $self->begin_block ($par_name, $par_opts) if $par_opts->{format
} eq "block";
1033 # Iterate through list of child nodes to preserve, modify, or
1034 # discard whitespace. Return resulting list of children.
1036 # Canonize element and text nodes. Leave everything else (comments,
1037 # processing instructions, etc.) untouched.
1039 my @new_children = ();
1041 while (@
{$children})
1043 my $child = shift (@
{$children});
1045 if ($child->{type
} eq "elt")
1047 # Leave verbatim elements untouched. For other element nodes,
1048 # canonize child list using options appropriate to element.
1050 if ($self->get_opts ($child->{name
})->{format
} ne "verbatim")
1052 $child->{content
} = $self->tree_canonize2 ($child->{content
},
1056 elsif ($child->{type
} eq "text")
1058 # Delete all-whitespace node or strip whitespace as appropriate.
1060 # Paranoia check: We should never get here for verbatim elements,
1061 # because normalization is irrelevant for them.
1063 die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n"
1064 if $par_opts->{format
} eq "verbatim";
1066 if (!$self->block_normalize ())
1068 # Enclosing block is not normalized:
1069 # - Delete child all-whitespace text nodes.
1070 # - Leave other text nodes untouched.
1072 next if $child->{content
} =~ /^\s*$/;
1076 # Enclosing block is normalized, so normalize this text node:
1077 # - Convert runs of whitespace characters (including
1078 # line-endings characters) to single spaces.
1079 # - Trim leading whitespace if this node is the first child
1080 # of a block element or it follows a non-normalized node.
1081 # - Trim leading whitespace if this node is the last child
1082 # of a block element or it precedes a non-normalized node.
1084 # These are nil if there is no prev or next child
1085 my $prev_child = $new_children[$#new_children];
1086 my $next_child = $children->[0];
1088 $child->{content
} =~ s/\s+/ /g;
1089 $child->{content
} =~ s/^ //
1090 if (!defined ($prev_child) && $par_opts->{format
} eq "block")
1091 || $self->non_normalized_node ($prev_child);
1092 $child->{content
} =~ s/ $//
1093 if (!defined ($next_child) && $par_opts->{format
} eq "block")
1094 || $self->non_normalized_node ($next_child);
1096 # If resulting text is empty, discard the node.
1097 next if $child->{content
} =~ /^$/;
1100 push (@new_children, $child);
1103 # Pop block stack if parent was a block element
1104 $self->end_block () if $par_opts->{format
} eq "block";
1106 return \
@new_children;
1110 # Helper function for tree_canonize().
1112 # Determine whether a node is normalized. This is used to check
1113 # the node that is adjacent to a given text node (either previous
1115 # - No is node is nil
1116 # - No if the node is a verbatim element
1117 # - If the node is a block element, yes or no according to its
1119 # - No if the node is an inline element. Inlines are normalized
1120 # if the parent block is normalized, but this method is not called
1121 # except while examinine normalized blocks. So its inline children
1122 # are also normalized.
1123 # - No if node is a comment, PI, DOCTYPE, or CDATA section. These are
1124 # treated like verbatim elements.
1126 sub non_normalized_node
1132 my $type = $node->{type
};
1135 my $node_opts = $self->get_opts ($node->{name
});
1136 if ($node_opts->{format
} eq "verbatim")
1140 if ($node_opts->{format
} eq "block")
1142 return $node_opts->{normalize
} eq "no";
1144 if ($node_opts->{format
} eq "inline")
1148 die "LOGIC ERROR: non_normalized_node: unhandled node format.\n";
1150 if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE"
1151 || $type eq "CDATA")
1155 if ($type eq "text")
1157 die "LOGIC ERROR: non_normalized_node: got called for text node.\n";
1159 die "LOGIC ERROR: non_normalized_node: unhandled node type.\n";
1162 # ----------------------------------------------------------------------
1164 # Format (pretty-print) the document tree
1166 # Does not modify the node list.
1168 # The class maintains two variables for storing output:
1169 # - out_doc stores content that has been seen and "flushed".
1170 # - pending stores an array of strings (content of text nodes and inline
1171 # element tags). These are held until they need to be flushed, at
1172 # which point they are concatenated and possibly wrapped/indented.
1173 # Flushing occurs when a break needs to be written, which happens
1174 # when something other than a text node or inline element is seen.
1176 # If parent name and children are not given, format the entire document.
1177 # Assume prevailing indent = 0 if not given.
1182 my $par_name = shift || "*DOCUMENT"; # format entire document if no arg
1183 my $children = shift || $self->{tree
}; # use entire tree if no arg
1184 my $indent = shift || 0;
1186 # Formatting options for parent element
1187 my $par_opts = $self->get_opts ($par_name);
1189 # If parent is a block element:
1190 # - Remember its formatting options on the block stack so they can
1191 # be used to control formatting of inline child elements.
1192 # - Set initial break type to entry-break.
1193 # - Shift prevailing indent right before generating child content.
1195 if ($par_opts->{format
} eq "block")
1197 $self->begin_block ($par_name, $par_opts);
1198 $self->set_block_break_type ("entry-break");
1199 $indent += $par_opts->{"subindent"};
1202 # Variables for keeping track of whether the previous child
1203 # was a text node. Used for controlling break behavior in
1204 # non-normalized block elements: No line breaks are added around
1205 # text in such elements, nor is indenting added.
1207 my $prev_child_is_text = 0;
1208 my $cur_child_is_text = 0;
1210 foreach my $child (@
{$children})
1212 $prev_child_is_text = $cur_child_is_text;
1214 # Text nodes: just add text to pending output
1216 if ($child->{type
} eq "text")
1218 $cur_child_is_text = 1;
1219 $self->add_to_pending ($child->{content
});
1223 $cur_child_is_text = 0;
1225 # Element nodes: handle depending on format type
1227 if ($child->{type
} eq "elt")
1229 my $child_opts = $self->get_opts ($child->{name
});
1231 # Verbatim elements:
1232 # - Print literally without change (use _stringify).
1233 # - Do not line-wrap or add any indent.
1235 if ($child_opts->{format
} eq "verbatim")
1237 $self->flush_pending ($indent);
1238 $self->emit_break (0)
1239 unless $prev_child_is_text && !$self->block_normalize ();
1240 $self->set_block_break_type ("element-break");
1241 $self->add_to_doc ($child->{open_tag
}
1242 . $self->tree_stringify ($child->{content
})
1243 . $child->{close_tag
});
1248 # - Do not break or indent.
1249 # - Do not line-wrap content; just add content to pending output
1250 # and let it be wrapped as part of parent's content.
1252 if ($child_opts->{format
} eq "inline")
1254 $self->add_to_pending ($child->{open_tag
});
1255 $self->tree_format ($child->{name
}, $child->{content
}, $indent);
1256 $self->add_to_pending ($child->{close_tag
});
1260 # If we get here, node is a block element.
1262 # - Break and flush any pending output
1263 # - Break and indent (no indent if break count is zero)
1264 # - Process element itself:
1265 # - Put out opening tag
1266 # - Put out element content
1267 # - Put out any indent needed before closing tag. None needed if:
1268 # - Element's exit-break is 0 (closing tag is not on new line,
1269 # so don't indent it)
1270 # - There is no separate closing tag (it was in <abc/> format)
1271 # - Element has no children (tags will be written as
1272 # <abc></abc>, so don't indent closing tag)
1273 # - Element has children, but the block is not normalized and
1274 # the last child is a text node
1275 # - Put out closing tag
1277 $self->flush_pending ($indent);
1278 $self->emit_break ($indent)
1279 unless $prev_child_is_text && !$self->block_normalize ();
1280 $self->set_block_break_type ("element-break");
1281 $self->add_to_doc ($child->{open_tag
});
1282 $self->tree_format ($child->{name
}, $child->{content
}, $indent);
1283 $self->add_to_doc (" " x
$indent)
1284 unless $child_opts->{"exit-break"} <= 0
1285 || $child->{close_tag
} eq ""
1286 || !@
{$child->{content
}}
1287 || (@
{$child->{content
}}
1288 && $child->{content
}->[$#{$child->{content}}]->{type} eq "text"
1289 && $child_opts->{normalize
} eq "no");
1290 $self->add_to_doc ($child->{close_tag
});
1294 # Comments, PIs, etc. (everything other than text and elements),
1295 # treat similarly to verbatim block:
1296 # - Flush any pending output
1298 # - Add node content to collected output
1300 $self->flush_pending ($indent);
1301 $self->emit_break (0)
1302 unless $prev_child_is_text && !$self->block_normalize ();
1303 $self->set_block_break_type ("element-break");
1304 $self->add_to_doc ($child->{content
});
1307 $prev_child_is_text = $cur_child_is_text;
1309 # Done processing current element's children now.
1311 # If current element is a block element:
1312 # - If there were any children, flush any pending output and put
1313 # out the exit break.
1314 # - Pop the block stack
1316 if ($par_opts->{format
} eq "block")
1320 $self->flush_pending ($indent);
1321 $self->set_block_break_type ("exit-break");
1322 $self->emit_break (0)
1323 unless $prev_child_is_text && !$self->block_normalize ();
1325 $self->end_block ();
1330 # Emit a break - the appropriate number of newlines according to the
1331 # enclosing block's current break type.
1333 # In addition, emit the number of spaces indicated by indent. (indent
1334 # > 0 when breaking just before emitting an element tag that should
1335 # be indented within its parent element.)
1337 # Exception: Emit no indent if break count is zero. That indicates
1338 # any following output will be written on the same output line, not
1339 # indented on a new line.
1341 # Initially, when processing a node's child list, the break type is
1342 # set to entry-break. Each subsequent break is an element-break.
1343 # (After child list has been processed, an exit-break is produced as well.)
1347 my ($self, $indent) = @_;
1349 # number of newlines to emit
1350 my $break_value = $self->block_break_value ();
1352 $self->add_to_doc ("\n" x
$break_value);
1353 # add indent if there *was* a break
1354 $self->add_to_doc (" " x
$indent) if $indent > 0 && $break_value > 0;
1358 # Flush pending output to output document collected thus far:
1359 # - Wrap pending contents as necessary, with indent before *each* line.
1360 # - Add pending text to output document (thus "flushing" it)
1361 # - Clear pending array.
1365 my ($self, $indent) = @_;
1367 # Do nothing if nothing to flush
1368 return if !@
{$self->{pending
}};
1370 # If current block is not normalized:
1371 # - Text nodes cannot be modified (no wrapping or indent). Flush
1372 # text as is without adding a break or indent.
1373 # If current block is normalized:
1375 # - If line wrap is disabled:
1376 # - Add indent if there is a break. (If there isn't a break, text
1377 # should immediately follow preceding tag, so don't add indent.)
1378 # - Add text without wrapping
1379 # - If line wrap is enabled:
1380 # - First line indent is 0 if there is no break. (Text immediately
1381 # follows preceding tag.) Otherwise first line indent is same as
1382 # prevailing indent.
1383 # - Any subsequent lines get the prevailing indent.
1385 # After flushing text, advance break type to element-break.
1389 if (!$self->block_normalize ())
1391 $s .= join ("", @
{$self->{pending
}});
1395 $self->emit_break (0);
1396 my $wrap_len = $self->block_wrap_length ();
1397 my $break_value = $self->block_break_value ();
1400 $s .= " " x
$indent if $break_value > 0;
1401 $s .= join ("", @
{$self->{pending
}});
1405 my $first_indent = ($break_value > 0 ?
$indent : 0);
1406 # Wrap lines, then join by newlines (don't add one at end)
1407 my @lines = $self->line_wrap ($self->{pending
},
1411 $s .= join ("\n", @lines);
1415 $self->add_to_doc ($s);
1416 $self->{pending
} = [ ];
1417 $self->set_block_break_type ("element-break");
1421 # Perform line-wrapping of string array to lines no longer than given
1422 # length (including indent).
1423 # Any word longer than line length appears by itself on line.
1424 # Return array of lines (not newline-terminated).
1426 # $strs - reference to array of text items to be joined and line-wrapped.
1428 # - A tag (such as <emphasis role="bold">). This should be treated as
1429 # an atomic unit, which is important for preserving inline tags intact.
1430 # - A possibly multi-word string (such as "This is a string"). In this
1431 # latter case, line-wrapping preserves internal whitespace in the
1432 # string, with the exception that if whitespace would be placed at
1433 # the end of a line, it is discarded.
1435 # $first_indent - indent for first line
1436 # $rest_indent - indent for any remaining lines
1437 # $max_len - maximum length of output lines (including indent)
1441 my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_;
1443 # First, tokenize the strings
1446 foreach my $str (@
{$strs})
1450 # String is a tag; treat as atomic unit and don't split
1451 push (@words, $str);
1455 # String of white and non-white tokens.
1456 # Tokenize into white and non-white tokens.
1457 push (@words, ($str =~ /\S+|\s+/g));
1461 # Now merge tokens that are not separated by whitespace tokens. For
1462 # example, "<i>", "word", "</i>" gets merged to "<i>word</i>". But
1463 # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.
1466 foreach my $word (@words)
1468 # If there is a previous word that does not end with whitespace,
1469 # and the currrent word does not begin with whitespace, concatenate
1470 # current word to previous word. Otherwise append current word to
1471 # end of list of words.
1472 if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/)
1474 $words2[$#words2] .= $word;
1478 push (@words2, $word);
1485 # set the indent for the first line
1486 my $indent = $first_indent;
1487 # saved-up whitespace to put before next non-white word
1490 foreach my $word (@words2) # ... while words remain to wrap
1492 # If word is whitespace, save it. It gets added before next
1493 # word if no line-break occurs.
1499 my $wlen = length ($word);
1502 # New output line; it gets at least one word (discard any
1504 $line = " " x
$indent . $word;
1505 $llen = $indent + $wlen;
1506 $indent = $rest_indent;
1510 if ($llen + length ($white) + $wlen > $max_len)
1512 # Word (plus saved whitespace) won't fit on current line.
1513 # Begin new line (discard any saved whitespace).
1514 push (@lines, $line);
1515 $line = " " x
$indent . $word;
1516 $llen = $indent + $wlen;
1517 $indent = $rest_indent;
1521 # add word to current line with saved whitespace between
1522 $line .= $white . $word;
1523 $llen += length ($white) + $wlen;
1527 # push remaining line, if any
1528 push (@lines, $line) if $line ne "";
1535 # ----------------------------------------------------------------------
1537 # Begin main program
1543 Usage: $PROG_NAME [options] xml-file
1547 Print this message and exit.
1548 --backup suffix -b suffix
1549 Back up the input document, adding suffix to the input
1550 filename to create the backup filename.
1552 Proceed only as far as the document canonization stage,
1553 printing the result.
1555 Parse the document into tokens and verify that their
1556 concatenation is identical to the original input document.
1557 This option suppresses further document processing.
1558 --config-file file_name, -f file_name
1559 Specify the configuration filename. If no file is named,
1560 xmlformat uses the file named by the environment variable
1561 XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
1562 exists. Otherwise, xmlformat uses built-in formatting
1565 Format the document in place, replacing the contents of
1566 the input file with the reformatted document. (It's a
1567 good idea to use --backup along with this option.)
1569 Show configuration options after reading configuration
1570 file. This option suppresses document processing.
1571 --show-unconfigured-elements
1572 Show elements that are used in the document but for
1573 which no options were specified in the configuration
1574 file. This option suppresses document output.
1576 Be verbose about processing stages.
1578 Show version information and exit.
1581 # Variables for command line options; most are undefined initially.
1589 my $show_unconf_elts;
1594 # =i means an integer argument is required after the option
1595 # =s means a string argument is required after the option
1596 # :s means a string argument is optional after the option
1597 "help|h" => \
$help, # print help message
1598 "backup|b=s" => \
$backup_suffix, # make backup using suffix
1599 "canonized-output" => \
$canonize_only, # print canonized document
1600 "check-parser" => \
$check_parser, # verify parser integrity
1601 "config-file|f=s" => \
$conf_file, # config file
1602 "in-place|i" => \
$in_place, # format in place
1603 "show-config" => \
$show_conf, # show configuration file
1605 "show-unconfigured-elements" => \
$show_unconf_elts, # show unconfigured elements
1606 "verbose|v" => \
$verbose, # be verbose
1607 "version|V" => \
$show_version, # show version info
1608 ) or do { print "$usage\n"; exit (1); };
1610 if (defined ($help))
1616 if (defined ($show_version))
1618 print "$PROG_NAME $PROG_VERSION ($PROG_LANG version)\n";
1622 # --in-place option requires a named file
1624 warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
1625 if defined ($in_place) && @ARGV == 0;
1627 # --backup/-b is meaningless without --in-place
1629 if (defined ($backup_suffix))
1631 if (!defined ($in_place))
1633 die "--backup/-b option meaningless without --in-place/-i option\n";
1637 # Save input filenames
1638 my @in_file = @ARGV;
1640 my $xf = XMLFormat
->new ();
1642 # If a configuration file was named explicitly, use it. An error occurs
1643 # if the file does not exist.
1645 # If no configuration file was named, fall back to:
1646 # - The file named by the environment variable XMLFORMAT_CONF, if it exists
1647 # - ./xmlformat.conf, if it exists
1649 # If no configuration file can be found at all, the built-in default options
1650 # are used. (These are set up in new().)
1652 my $env_conf_file = $ENV{XMLFORMAT_CONF
};
1653 my $def_conf_file = "./xmlformat.conf";
1655 # If no config file was named, but XMLFORMAT_CONF is set, use its value
1656 # as the config file name.
1657 if (!defined ($conf_file))
1659 $conf_file = $env_conf_file if defined ($env_conf_file);
1661 # If config file still isn't defined, use the default file if it exists.
1662 if (!defined ($conf_file))
1664 if (-r
$def_conf_file && ! -d
$def_conf_file)
1666 $conf_file = $def_conf_file;
1669 if (defined ($conf_file))
1671 warn "Reading configuration file...\n" if $verbose;
1672 die "Configuration file '$conf_file' is not readable.\n" if ! -r
$conf_file;
1673 die "Configuration file '$conf_file' is a directory.\n" if -d
$conf_file;
1674 $xf->read_config ($conf_file)
1677 if ($show_conf) # show configuration and exit
1679 $xf->display_config ();
1683 my ($in_doc, $out_doc);
1685 # Process arguments.
1686 # - If no files named, read string, write to stdout.
1687 # - If files named, read and process each one. Write output to stdout
1688 # unless --in-place option was given. Make backup of original file
1689 # if --backup option was given.
1693 warn "Reading document...\n" if $verbose;
1696 $in_doc = <>; # slurp input document as single string
1699 $out_doc = $xf->process_doc ($in_doc,
1700 $verbose, $check_parser, $canonize_only, $show_unconf_elts);
1701 if (defined ($out_doc))
1703 warn "Writing output document...\n" if $verbose;
1709 foreach my $file (@ARGV)
1711 warn "Reading document $file...\n" if $verbose;
1713 or die "Cannot read $file: $!\n";
1716 $in_doc = <IN
>; # slurp input document as single string
1719 $out_doc = $xf->process_doc ($in_doc,
1720 $verbose, $check_parser, $canonize_only, $show_unconf_elts);
1721 next unless defined ($out_doc);
1722 if (defined ($in_place))
1724 if (defined ($backup_suffix))
1726 warn "Making backup of $file to $file$backup_suffix...\n" if $verbose;
1727 rename ($file, $file . $backup_suffix)
1728 or die "Could not rename $file to $file$backup_suffix: $!\n";
1730 warn "Writing output document to $file...\n" if $verbose;
1731 open (OUT
, ">$file") or die "Cannot write to $file: $!\n";
1737 warn "Writing output document...\n" if $verbose;
1743 warn "Done!\n" if $verbose;