Zoom session when the mouse pointer is moved up and down during a playhead drag.
[ardour2.git] / manual / xmlformat / xmlformat.pl
blob877f5f110f348258834daea1e58c6bd6077a66de
1 #! /usr/bin/perl -w
2 # vim:set ts=2 sw=2 expandtab:
4 # xmlformat - configurable XML file formatter/pretty-printer
6 # Copyright (c) 2004, 2005 Kitebird, LLC. All rights reserved.
7 # Some portions are based on the REX shallow XML parser, which
8 # is Copyright (c) 1998, Robert D. Cameron. These include the
9 # regular expression parsing variables and the shallow_parse()
10 # method.
11 # This software is licensed as described in the file LICENSE,
12 # which you should have received as part of this distribution.
14 # Syntax: xmlformat [config-file] xml-file
16 # Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that
17 # order.
19 # Paul DuBois
20 # paul@kitebird.com
21 # 2003-12-14
23 # The input document first is parsed into a list of strings. Each string
24 # represents one of the following:
25 # - text node
26 # - processing instruction (the XML declaration is treated as a PI)
27 # - comment
28 # - CDATA section
29 # - DOCTYPE declaration
30 # - element tag (either <abc>, </abc>, or <abc/>), *including attributes*
32 # Entities are left untouched. They appear in their original form as part
33 # of the text node in which they occur.
35 # The list of strings then is converted to a hierarchical structure.
36 # The document top level is represented by a reference to a list.
37 # Each list element is a reference to a node -- a hash that has "type"
38 # and "content" key/value pairs. The "type" key indicates the node
39 # type and has one of the following values:
41 # "text" - text node
42 # "pi" - processing instruction node
43 # "comment" - comment node
44 # "CDATA" - CDATA section node
45 # "DOCTYPE" - DOCTYPE node
46 # "elt" - element node
48 # (For purposes of this program, it's really only necessary to have "text",
49 # "elt", and "other". The types other than "text" and "elt" currently are
50 # all treated the same way.)
52 # For all but element nodes, the "content" value is the text of the node.
54 # For element nodes, the "content" hash is a reference to a list of
55 # nodes for the element's children. In addition, an element node has
56 # three additional key/value pairs:
57 # - The "name" value is the tag name within the opening tag, minus angle
58 # brackets or attributes.
59 # - The "open_tag" value is the full opening tag, which may also be the
60 # closing tag.
61 # - The "close_tag" value depends on the opening tag. If the open tag is
62 # "<abc>", the close tag is "</abc>". If the open tag is "<abc/>", the
63 # close tag is the empty string.
65 # If the tree structure is converted back into a string with
66 # tree_stringify(), the result can be compared to the input file
67 # as a regression test. The string should be identical to the original
68 # input document.
70 use strict;
72 use Getopt::Long;
73 $Getopt::Long::ignorecase = 0; # options are case sensitive
74 $Getopt::Long::bundling = 1; # allow short options to be bundled
76 my $PROG_NAME = "xmlformat";
77 my $PROG_VERSION = "1.04";
78 my $PROG_LANG = "Perl";
80 # ----------------------------------------------------------------------
82 package XMLFormat;
84 use strict;
86 # ----------------------------------------------------------------------
88 # Regular expressions for parsing document components. Based on REX.
90 # SPE = shallow parsing expression
91 # SE = scanning expression
92 # CE = completion expression
93 # RSB = right square brackets
94 # QM = question mark
96 my $TextSE = "[^<]+";
97 my $UntilHyphen = "[^-]*-";
98 my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
99 my $CommentCE = "$Until2Hyphens>?";
100 my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+";
101 my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
102 my $S = "[ \\n\\t\\r]+";
103 my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
104 my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
105 my $Name = "(?:$NameStrt)(?:$NameChar)*";
106 my $QuoteSE = "\"[^\"]*\"|'[^']*'";
107 my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
108 my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
109 my $S1 = "[\\n\\r\\t ]";
110 my $UntilQMs = "[^?]*\\?+";
111 my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
112 my $DT_ItemSE =
113 "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
114 my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?";
115 my $DeclCE =
116 "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
117 my $PI_CE = "$Name(?:$PI_Tail)?";
118 my $EndTagCE = "$Name(?:$S)?>?";
119 my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
120 my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
121 my $MarkupSPE =
122 "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
123 my $XML_SPE = "$TextSE|$MarkupSPE";
125 # ----------------------------------------------------------------------
127 # Allowable options and their possible values:
128 # - The keys of this hash are the allowable option names
129 # - The value for each key is list of allowable option values
130 # - If the value is undef, the option value must be numeric
131 # If any new formatting option is added to this program, it
132 # must be specified here, *and* a default value for it should
133 # be listed in the *DOCUMENT and *DEFAULT pseudo-element
134 # option hashes.
136 my %opt_list = (
137 "format" => [ "block", "inline", "verbatim" ],
138 "normalize" => [ "yes", "no" ],
139 "subindent" => undef,
140 "wrap-length" => undef,
141 "entry-break" => undef,
142 "exit-break" => undef,
143 "element-break" => undef
146 # Object creation: set up the default formatting configuration
147 # and variables for maintaining input and output document.
149 sub new
151 my $type = shift;
153 my $self = {};
155 # Formatting options for each element.
157 $self->{elt_opts} = { };
159 # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can
160 # be overridden in the configuration file, but the options must also be
161 # built in to make sure they exist if not specified in the configuration
162 # file. Each of the structures must have a value for every option.
164 # Options for top-level document children.
165 # - Do not change entry-break: 0 ensures no extra newlines before
166 # first element of output.
167 # - Do not change exit-break: 1 ensures a newline after final element
168 # of output document.
169 # - It's probably best not to change any of the others, except perhaps
170 # if you want to increase the element-break.
172 $self->{elt_opts}->{"*DOCUMENT"} = {
173 "format" => "block",
174 "normalize" => "no",
175 "subindent" => 0,
176 "wrap-length" => 0,
177 "entry-break" => 0, # do not change
178 "exit-break" => 1, # do not change
179 "element-break" => 1
182 # Default options. These are used for any elements in the document
183 # that are not specified explicitly in the configuration file.
185 $self->{elt_opts}->{"*DEFAULT"} = {
186 "format" => "block",
187 "normalize" => "no",
188 "subindent" => 1,
189 "wrap-length" => 0,
190 "entry-break" => 1,
191 "exit-break" => 1,
192 "element-break" => 1
195 # Run the *DOCUMENT and *DEFAULT options through the option-checker
196 # to verify that the built-in values are legal.
198 my $err_count = 0;
200 foreach my $elt_name (keys (%{$self->{elt_opts}})) # ... for each element
202 # Check each option for element
203 while (my ($opt_name, $opt_val) = each (%{$self->{elt_opts}->{$elt_name}}))
205 my $err_msg;
207 ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
208 if (!defined ($err_msg))
210 $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
212 else
214 warn "LOGIC ERROR: $elt_name default option is invalid\n";
215 warn "$err_msg\n";
216 ++$err_count;
221 # Make sure that the every option is represented in the
222 # *DOCUMENT and *DEFAULT structures.
224 foreach my $opt_name (keys (%opt_list))
226 foreach my $elt_name (keys (%{$self->{elt_opts}}))
228 if (!exists ($self->{elt_opts}->{$elt_name}->{$opt_name}))
230 warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n";
231 ++$err_count;
236 die "Cannot continue; internal default formatting options must be fixed\n"
237 if $err_count > 0;
239 bless $self, $type; # bless object and return it
242 # Initialize the variables that are used per-document
244 sub init_doc_vars
246 my $self = shift;
248 # Elements that are used in the document but not named explicitly
249 # in the configuration file.
251 $self->{unconf_elts} = { };
253 # List of tokens for current document.
255 $self->{tokens} = [ ];
257 # List of line numbers for each token
259 $self->{line_num} = [ ];
261 # Document node tree (constructed from the token list).
263 $self->{tree} = [ ];
265 # Variables for formatting operations:
266 # out_doc = resulting output document (constructed from document tree)
267 # pending = array of pending tokens being held until flushed
269 $self->{out_doc} = "";
270 $self->{pending} = [ ];
272 # Inline elements within block elements are processed using the
273 # text normalization (and possible line-wrapping) values of their
274 # enclosing block. Blocks and inlines may be nested, so we maintain
275 # a stack that allows the normalize/wrap-length values of the current
276 # block to be determined.
278 $self->{block_name_stack} = [ ]; # for debugging
279 $self->{block_opts_stack} = [ ];
281 # A similar stack for maintaining each block's current break type.
283 $self->{block_break_type_stack} = [ ];
286 # Accessors for token list and resulting output document
288 sub tokens
290 my $self = shift;
292 return $self->{tokens};
295 sub out_doc
297 my $self = shift;
299 return $self->{out_doc};
303 # Methods for adding strings to output document or
304 # to the pending output array
306 sub add_to_doc
308 my ($self, $str) = @_;
310 $self->{out_doc} .= $str;
313 sub add_to_pending
315 my ($self, $str) = @_;
317 push (@{$self->{pending}}, $str);
321 # Block stack mainenance methods
323 # Push options onto or pop options off from the stack. When doing
324 # this, also push or pop an element onto the break-level stack.
326 sub begin_block
328 my ($self, $name, $opts) = @_;
330 push (@{$self->{block_name_stack}}, $name);
331 push (@{$self->{block_opts_stack}}, $opts);
332 push (@{$self->{block_break_type_stack}}, "entry-break");
335 sub end_block
337 my $self = shift;
339 pop (@{$self->{block_name_stack}});
340 pop (@{$self->{block_opts_stack}});
341 pop (@{$self->{block_break_type_stack}});
344 # Return the current block's normalization status or wrap length
346 sub block_normalize
348 my $self = shift;
350 my $size = @{$self->{block_opts_stack}};
351 my $opts = $self->{block_opts_stack}->[$size-1];
352 return $opts->{normalize} eq "yes";
355 sub block_wrap_length
357 my $self = shift;
359 my $size = @{$self->{block_opts_stack}};
360 my $opts = $self->{block_opts_stack}->[$size-1];
361 return $opts->{"wrap-length"};
364 # Set the current block's break type, or return the number of newlines
365 # for the block's break type
367 sub set_block_break_type
369 my ($self, $type) = @_;
371 my $size = @{$self->{block_break_type_stack}};
372 $self->{block_break_type_stack}->[$size-1] = $type;
375 sub block_break_value
377 my $self = shift;
379 my $size = @{$self->{block_opts_stack}};
380 my $opts = $self->{block_opts_stack}->[$size-1];
381 $size = @{$self->{block_break_type_stack}};
382 my $type = $self->{block_break_type_stack}->[$size-1];
383 return $opts->{$type};
387 # ----------------------------------------------------------------------
389 # Read configuration information. For each element, construct a hash
390 # containing a hash key and value for each option name and value.
391 # After reading the file, fill in missing option values for
392 # incomplete option structures using the *DEFAULT options.
394 sub read_config
396 my $self = shift;
397 my $conf_file = shift;
398 my @elt_names = ();
399 my $err_msg;
400 my $in_continuation = 0;
401 my $saved_line = "";
403 open (FH, $conf_file) or die "Cannot read config file $conf_file: $!\n";
404 while (<FH>)
406 chomp;
408 next if /^\s*($|#)/; # skip blank lines, comments
409 if ($in_continuation)
411 $_ = $saved_line . " " . $_;
412 $saved_line = "";
413 $in_continuation = 0;
415 if (!/^\s/)
417 # Line doesn't begin with whitespace, so it lists element names.
418 # Names are separated by whitespace or commas, possibly followed
419 # by a continuation character or a comment.
420 if (/\\$/)
422 s/\\$//; # remove continuation character
423 $saved_line = $_;
424 $in_continuation = 1;
425 next;
427 s/\s*#.*$//; # remove any trailing comment
428 @elt_names = split (/[\s,]+/, $_);
429 # make sure each name has an entry in the elt_opts structure
430 foreach my $elt_name (@elt_names)
432 $self->{elt_opts}->{$elt_name} = { }
433 unless exists ($self->{elt_opts}->{$elt_name});
436 else
438 # Line begins with whitespace, so it contains an option
439 # to apply to the current element list, possibly followed by
440 # a comment. First check that there is a current list.
441 # Then parse the option name/value.
443 die "$conf_file:$.: Option setting found before any "
444 . "elements were named.\n"
445 if !@elt_names;
446 s/\s*#.*$//;
447 my ($opt_name, $opt_val) = /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/;
448 die "$conf_file:$.: Malformed line: $_\n" unless defined ($opt_val);
450 # Check option. If illegal, die with message. Otherwise,
451 # add option to each element in current element list
453 ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
454 die "$conf_file:$.: $err_msg\n" if defined ($err_msg);
455 foreach my $elt_name (@elt_names)
457 $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
461 close (FH);
463 # For any element that has missing option values, fill in the values
464 # using the options for the *DEFAULT pseudo-element. This speeds up
465 # element option lookups later. It also makes it unnecessary to test
466 # each option to see if it's defined: All element option structures
467 # will have every option defined.
469 my $def_opts = $self->{elt_opts}->{"*DEFAULT"};
471 foreach my $elt_name (keys (%{$self->{elt_opts}}))
473 next if $elt_name eq "*DEFAULT";
474 foreach my $opt_name (keys (%{$def_opts}))
476 next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name}); # already set
477 $self->{elt_opts}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name};
483 # Check option name to make sure it's legal. Check the value to make sure
484 # that it's legal for the name. Return a two-element array:
485 # (value, undef) if the option name and value are legal.
486 # (undef, message) if an error was found; message contains error message.
487 # For legal values, the returned value should be assigned to the option,
488 # because it may get type-converted here.
490 sub check_option
492 my ($opt_name, $opt_val) = @_;
494 # - Check option name to make sure it's a legal option
495 # - Then check the value. If there is a list of values
496 # the value must be one of them. Otherwise, the value
497 # must be an integer.
499 return (undef, "Unknown option name: $opt_name")
500 unless exists ($opt_list{$opt_name});
501 my $allowable_val = $opt_list{$opt_name};
502 if (defined ($allowable_val))
504 return (undef, "Unknown '$opt_name' value: $opt_val")
505 unless grep (/^$opt_val$/, @{$allowable_val});
507 else # other options should be numeric
509 # "$opt_val" converts $opt_val to string for pattern match
510 return (undef, "'$opt_name' value ($opt_val) should be an integer")
511 unless "$opt_val" =~ /^\d+$/;
513 return ($opt_val, undef);
517 # Return hash of option values for a given element. If no options are found:
518 # - Add the element name to the list of unconfigured options.
519 # - Assign the default options to the element. (This way the test for the
520 # option fails only once.)
522 sub get_opts
524 my $self = shift;
525 my $elt_name = shift;
527 my $opts = $self->{elt_opts}->{$elt_name};
528 if (!defined ($opts))
530 $self->{unconf_elts}->{$elt_name} = 1;
531 $opts = $self->{elt_opts}->{$elt_name} = $self->{elt_opts}->{"*DEFAULT"};
533 return $opts;
537 # Display contents of configuration options to be used to process document.
538 # For each element named in the elt_opts structure, display its format
539 # type, and those options that apply to the type.
541 sub display_config
543 my $self = shift;
544 # Format types and the additional options that apply to each type
545 my $format_opts = {
546 "block" => [
547 "entry-break",
548 "element-break",
549 "exit-break",
550 "subindent",
551 "normalize",
552 "wrap-length"
554 "inline" => [ ],
555 "verbatim" => [ ]
558 foreach my $elt_name (sort (keys (%{$self->{elt_opts}})))
560 print "$elt_name\n";
561 my %opts = %{$self->{elt_opts}->{$elt_name}};
562 my $format = $opts{format};
563 # Write out format type, then options that apply to the format type
564 print " format = $format\n";
565 foreach my $opt_name (@{$format_opts->{$format}})
567 print " $opt_name = $opts{$opt_name}\n";
569 print "\n";
574 # Display the list of elements that are used in the document but not
575 # configured in the configuration file.
577 # Then re-unconfigure the elements so that they won't be considered
578 # as configured for the next document, if there is one.
580 sub display_unconfigured_elements
582 my $self = shift;
584 my @elts = keys (%{$self->{unconf_elts}});
585 if (@elts == 0)
587 print "The document contains no unconfigured elements.\n";
589 else
591 print "The following document elements were assigned no formatting options:\n";
592 foreach my $line ($self->line_wrap ([ join (" ", sort (@elts)) ], 0, 0, 65))
594 print "$line\n";
598 foreach my $elt_name (@elts)
600 delete ($self->{elt_opts}->{$elt_name});
604 # ----------------------------------------------------------------------
606 # Main document processing routine.
607 # - Argument is a string representing an input document
608 # - Return value is the reformatted document, or undef. An undef return
609 # signifies either that an error occurred, or that some option was
610 # given that suppresses document output. In either case, don't write
611 # any output for the document. Any error messages will already have
612 # been printed when this returns.
614 sub process_doc
616 my $self = shift;
617 my ($doc, $verbose, $check_parser, $canonize_only, $show_unconf_elts) = @_;
618 my $str;
620 $self->init_doc_vars ();
622 # Perform lexical parse to split document into list of tokens
623 warn "Parsing document...\n" if $verbose;
624 $self->shallow_parse ($doc);
626 if ($check_parser)
628 warn "Checking parser...\n" if $verbose;
629 # concatentation of tokens should be identical to original document
630 if ($doc eq join ("", @{$self->tokens ()}))
632 print "Parser is okay\n";
634 else
636 print "PARSER ERROR: document token concatenation differs from document\n";
638 return undef;
641 # Assign input line number to each token
642 $self->assign_line_numbers ();
644 # Look for and report any error tokens returned by parser
645 warn "Checking document for errors...\n" if $verbose;
646 if ($self->report_errors () > 0)
648 warn "Cannot continue processing document.\n";
649 return undef;
652 # Convert the token list to a tree structure
653 warn "Converting document tokens to tree...\n" if $verbose;
654 if ($self->tokens_to_tree () > 0)
656 warn "Cannot continue processing document.\n";
657 return undef;
660 # Check: Stringify the tree to convert it back to a single string,
661 # then compare to original document string (should be identical)
662 # (This is an integrity check on the validity of the to-tree and stringify
663 # operations; if one or both do not work properly, a mismatch should occur.)
664 #$str = $self->tree_stringify ();
665 #print $str;
666 #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $str;
668 # Canonize tree to remove extraneous whitespace
669 warn "Canonizing document tree...\n" if $verbose;
670 $self->tree_canonize ();
672 if ($canonize_only)
674 print $self->tree_stringify () . "\n";
675 return undef;
678 # One side-effect of canonizing the tree is that the formatting
679 # options are looked up for each element in the document. That
680 # causes the list of elements that have no explicit configuration
681 # to be built. Display the list and return if user requested it.
683 if ($show_unconf_elts)
685 $self->display_unconfigured_elements ();
686 return undef;
689 # Format the tree to produce formatted XML as a single string
690 warn "Formatting document tree...\n" if $verbose;
691 $self->tree_format ();
693 # If the document is not empty, add a newline and emit a warning if
694 # reformatting failed to add a trailing newline. This shouldn't
695 # happen if the *DOCUMENT options are set up with exit-break = 1,
696 # which is the reason for the warning rather than just silently
697 # adding the newline.
699 $str = $self->out_doc ();
700 if ($str ne "" && $str !~ /\n$/)
702 warn "LOGIC ERROR: trailing newline had to be added\n";
703 $str .= "\n";
706 return $str;
709 # ----------------------------------------------------------------------
711 # Parse XML document into array of tokens and store array
713 sub shallow_parse
715 my ($self, $xml_document) = @_;
717 $self->{tokens} = [ $xml_document =~ /$XML_SPE/g ];
720 # ----------------------------------------------------------------------
722 # Extract a tag name from a tag and return it.
724 # Dies if the tag cannot be found, because this is supposed to be
725 # called only with a legal tag.
727 sub extract_tag_name
729 my $tag = shift;
731 die "Cannot find tag name in tag: $tag\n" unless $tag =~ /^<\/?($Name)/;
732 return $1;
735 # ----------------------------------------------------------------------
737 # Assign an input line number to each token. The number indicates
738 # the line number on which the token begins.
740 sub assign_line_numbers
742 my $self = shift;
743 my $line_num = 1;
745 $self->{line_num} = [ ];
746 for (my $i = 0; $i < @{$self->{tokens}}; $i++)
748 my $token = $self->{tokens}->[$i];
749 push (@{$self->{line_num}}, $line_num);
750 # count newlines and increment line counter (tr returns no. of matches)
751 $line_num += ($token =~ tr/\n/\n/);
755 # ----------------------------------------------------------------------
757 # Check token list for errors and report any that are found. Error
758 # tokens are those that begin with "<" but do not end with ">".
760 # Returns the error count.
762 # Does not modify the original token list.
764 sub report_errors
766 my $self = shift;
767 my $err_count = 0;
769 for (my $i = 0; $i < @{$self->{tokens}}; $i++)
771 my $token = $self->{tokens}->[$i];
772 if ($token =~ /^</ && $token !~ />$/)
774 my $line_num = $self->{line_num}->[$i];
775 warn "Malformed token at line $line_num, token " . ($i+1) . ": $token\n";
776 ++$err_count;
779 warn "Number of errors found: $err_count\n" if $err_count > 0;
780 return $err_count;
783 # ----------------------------------------------------------------------
785 # Helper routine to print tag stack for tokens_to_tree
787 sub print_tag_stack
789 my ($label, @stack) = @_;
790 if (@stack < 1)
792 warn " $label: none\n";
794 else
796 warn " $label:\n";
797 for (my $i = 0; $i < @stack; $i++)
799 warn " ", ($i+1), ": ", $stack[$i], "\n";
804 # Convert the list of XML document tokens to a tree representation.
805 # The implementation uses a loop and a stack rather than recursion.
807 # Does not modify the original token list.
809 # Returns an error count.
811 sub tokens_to_tree
813 my $self = shift;
815 my @tag_stack = (); # stack for element tags
816 my @children_stack = (); # stack for lists of children
817 my $children = [ ]; # current list of children
818 my $err_count = 0;
820 for (my $i = 0; $i < @{$self->{tokens}}; $i++)
822 my $token = $self->{tokens}->[$i];
823 my $line_num = $self->{line_num}->[$i];
824 my $tok_err = "Error near line $line_num, token " . ($i+1) . " ($token)";
825 if ($token !~ /^</) # text
827 push (@{$children}, text_node ($token));
829 elsif ($token =~ /^<!--/) # comment
831 push (@{$children}, comment_node ($token));
833 elsif ($token =~ /^<\?/) # processing instruction
835 push (@{$children}, pi_node ($token));
837 elsif ($token =~ /^<!DOCTYPE/) # DOCTYPE
839 push (@{$children}, doctype_node ($token));
841 elsif ($token =~ /^<!\[/) # CDATA
843 push (@{$children}, cdata_node ($token));
845 elsif ($token =~ /^<\//) # element close tag
847 if (!@tag_stack)
849 warn "$tok_err: Close tag w/o preceding open tag; malformed document?\n";
850 ++$err_count;
851 next;
853 if (!@children_stack)
855 warn "$tok_err: Empty children stack; malformed document?\n";
856 ++$err_count;
857 next;
859 my $tag = pop (@tag_stack);
860 my $open_tag_name = extract_tag_name ($tag);
861 my $close_tag_name = extract_tag_name ($token);
862 if ($open_tag_name ne $close_tag_name)
864 warn "$tok_err: Tag mismatch; malformed document?\n";
865 warn " open tag: $tag\n";
866 warn " close tag: $token\n";
867 print_tag_stack ("enclosing tags", @tag_stack);
868 ++$err_count;
869 next;
871 my $elt = element_node ($tag, $token, $children);
872 $children = pop (@children_stack);
873 push (@{$children}, $elt);
875 else # element open tag
877 # If we reach here, we're seeing the open tag for an element:
878 # - If the tag is also the close tag (e.g., <abc/>), close the
879 # element immediately, giving it an empty child list.
880 # - Otherwise, push tag and child list on stacks, begin new child
881 # list for element body.
882 if ($token =~ /\/>$/) # tag is of form <abc/>
884 push (@{$children}, element_node ($token, "", [ ]));
886 else # tag is of form <abc>
888 push (@tag_stack, $token);
889 push (@children_stack, $children);
890 $children = [ ];
895 # At this point, the stacks should be empty if the document is
896 # well-formed.
898 if (@tag_stack)
900 warn "Error at EOF: Unclosed tags; malformed document?\n";
901 print_tag_stack ("unclosed tags", @tag_stack);
902 ++$err_count;
904 if (@children_stack)
906 warn "Error at EOF: Unprocessed child elements; malformed document?\n";
907 # TODO: print out info about them
908 ++$err_count;
911 $self->{tree} = $children;
912 return $err_count;
916 # Node-generating helper methods for tokens_to_tree
918 # Generic node generator
920 sub node { return { "type" => $_[0], "content" => $_[1] }; }
922 # Generators for specific non-element nodes
924 sub text_node { return node ("text", $_[0]); }
925 sub comment_node { return node ("comment", $_[0]); }
926 sub pi_node { return node ("pi", $_[0]); }
927 sub doctype_node { return node ("DOCTYPE", $_[0]); }
928 sub cdata_node { return node ("CDATA", $_[0]); }
930 # For an element node, create a standard node with the type and content
931 # key/value pairs. Then add pairs for the "name", "open_tag", and
932 # "close_tag" hash keys.
934 sub element_node
936 my ($open_tag, $close_tag, $children) = @_;
938 my $elt = node ("elt", $children);
939 # name is the open tag with angle brackets and attibutes stripped
940 $elt->{name} = extract_tag_name ($open_tag);
941 $elt->{open_tag} = $open_tag;
942 $elt->{close_tag} = $close_tag;
943 return $elt;
946 # ----------------------------------------------------------------------
948 # Convert the given XML document tree (or subtree) to string form by
949 # concatentating all of its components. Argument is a reference
950 # to a list of nodes at a given level of the tree.
952 # Does not modify the node list.
954 sub tree_stringify
956 my $self = shift;
957 my $children = shift || $self->{tree}; # use entire tree if no arg;
958 my $str = "";
960 for (my $i = 0; $i < @{$children}; $i++)
962 my $child = $children->[$i];
964 # - Elements have list of child nodes as content (process recursively)
965 # - All other node types have text content
967 if ($child->{type} eq "elt")
969 $str .= $child->{open_tag}
970 . $self->tree_stringify ($child->{content})
971 . $child->{close_tag};
973 else
975 $str .= $child->{content};
978 return $str;
981 # ----------------------------------------------------------------------
984 # Put tree in "canonical" form by eliminating extraneous whitespace
985 # from element text content.
987 # $children is a list of child nodes
989 # This function modifies the node list.
991 # Canonizing occurs as follows:
992 # - Comment, PI, DOCTYPE, and CDATA nodes remain untouched
993 # - Verbatim elements and their descendants remain untouched
994 # - Within non-normalized block elements:
995 # - Delete all-whitespace text node children
996 # - Leave other text node children untouched
997 # - Within normalized block elements:
998 # - Convert runs of whitespace (including line-endings) to single spaces
999 # - Trim leading whitespace of first text node
1000 # - Trim trailing whitespace of last text node
1001 # - Trim whitespace that is adjacent to a verbatim or non-normalized
1002 # sub-element. (For example, if a <programlisting> is followed by
1003 # more text, delete any whitespace at beginning of that text.)
1004 # - Within inline elements:
1005 # - Normalize the same way as the enclosing block element, with the
1006 # exception that a space at the beginning or end is not removed.
1007 # (Otherwise, <para>three<literal> blind </literal>mice</para>
1008 # would become <para>three<literal>blind</literal>mice</para>.)
1010 sub tree_canonize
1012 my $self = shift;
1014 $self->{tree} = $self->tree_canonize2 ($self->{tree}, "*DOCUMENT");
1018 sub tree_canonize2
1020 my $self = shift;
1021 my $children = shift;
1022 my $par_name = shift;
1024 # Formatting options for parent
1025 my $par_opts = $self->get_opts ($par_name);
1027 # If parent is a block element, remember its formatting options on
1028 # the block stack so they can be used to control canonization of
1029 # inline child elements.
1031 $self->begin_block ($par_name, $par_opts) if $par_opts->{format} eq "block";
1033 # Iterate through list of child nodes to preserve, modify, or
1034 # discard whitespace. Return resulting list of children.
1036 # Canonize element and text nodes. Leave everything else (comments,
1037 # processing instructions, etc.) untouched.
1039 my @new_children = ();
1041 while (@{$children})
1043 my $child = shift (@{$children});
1045 if ($child->{type} eq "elt")
1047 # Leave verbatim elements untouched. For other element nodes,
1048 # canonize child list using options appropriate to element.
1050 if ($self->get_opts ($child->{name})->{format} ne "verbatim")
1052 $child->{content} = $self->tree_canonize2 ($child->{content},
1053 $child->{name});
1056 elsif ($child->{type} eq "text")
1058 # Delete all-whitespace node or strip whitespace as appropriate.
1060 # Paranoia check: We should never get here for verbatim elements,
1061 # because normalization is irrelevant for them.
1063 die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n"
1064 if $par_opts->{format} eq "verbatim";
1066 if (!$self->block_normalize ())
1068 # Enclosing block is not normalized:
1069 # - Delete child all-whitespace text nodes.
1070 # - Leave other text nodes untouched.
1072 next if $child->{content} =~ /^\s*$/;
1074 else
1076 # Enclosing block is normalized, so normalize this text node:
1077 # - Convert runs of whitespace characters (including
1078 # line-endings characters) to single spaces.
1079 # - Trim leading whitespace if this node is the first child
1080 # of a block element or it follows a non-normalized node.
1081 # - Trim leading whitespace if this node is the last child
1082 # of a block element or it precedes a non-normalized node.
1084 # These are nil if there is no prev or next child
1085 my $prev_child = $new_children[$#new_children];
1086 my $next_child = $children->[0];
1088 $child->{content} =~ s/\s+/ /g;
1089 $child->{content} =~ s/^ //
1090 if (!defined ($prev_child) && $par_opts->{format} eq "block")
1091 || $self->non_normalized_node ($prev_child);
1092 $child->{content} =~ s/ $//
1093 if (!defined ($next_child) && $par_opts->{format} eq "block")
1094 || $self->non_normalized_node ($next_child);
1096 # If resulting text is empty, discard the node.
1097 next if $child->{content} =~ /^$/;
1100 push (@new_children, $child);
1103 # Pop block stack if parent was a block element
1104 $self->end_block () if $par_opts->{format} eq "block";
1106 return \@new_children;
1110 # Helper function for tree_canonize().
1112 # Determine whether a node is normalized. This is used to check
1113 # the node that is adjacent to a given text node (either previous
1114 # or following).
1115 # - No is node is nil
1116 # - No if the node is a verbatim element
1117 # - If the node is a block element, yes or no according to its
1118 # normalize option
1119 # - No if the node is an inline element. Inlines are normalized
1120 # if the parent block is normalized, but this method is not called
1121 # except while examinine normalized blocks. So its inline children
1122 # are also normalized.
1123 # - No if node is a comment, PI, DOCTYPE, or CDATA section. These are
1124 # treated like verbatim elements.
1126 sub non_normalized_node
1128 my $self = shift;
1129 my $node = shift;
1131 return 0 if !$node;
1132 my $type = $node->{type};
1133 if ($type eq "elt")
1135 my $node_opts = $self->get_opts ($node->{name});
1136 if ($node_opts->{format} eq "verbatim")
1138 return 1;
1140 if ($node_opts->{format} eq "block")
1142 return $node_opts->{normalize} eq "no";
1144 if ($node_opts->{format} eq "inline")
1146 return 0;
1148 die "LOGIC ERROR: non_normalized_node: unhandled node format.\n";
1150 if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE"
1151 || $type eq "CDATA")
1153 return 1;
1155 if ($type eq "text")
1157 die "LOGIC ERROR: non_normalized_node: got called for text node.\n";
1159 die "LOGIC ERROR: non_normalized_node: unhandled node type.\n";
1162 # ----------------------------------------------------------------------
1164 # Format (pretty-print) the document tree
1166 # Does not modify the node list.
1168 # The class maintains two variables for storing output:
1169 # - out_doc stores content that has been seen and "flushed".
1170 # - pending stores an array of strings (content of text nodes and inline
1171 # element tags). These are held until they need to be flushed, at
1172 # which point they are concatenated and possibly wrapped/indented.
1173 # Flushing occurs when a break needs to be written, which happens
1174 # when something other than a text node or inline element is seen.
1176 # If parent name and children are not given, format the entire document.
1177 # Assume prevailing indent = 0 if not given.
1179 sub tree_format
1181 my $self = shift;
1182 my $par_name = shift || "*DOCUMENT"; # format entire document if no arg
1183 my $children = shift || $self->{tree}; # use entire tree if no arg
1184 my $indent = shift || 0;
1186 # Formatting options for parent element
1187 my $par_opts = $self->get_opts ($par_name);
1189 # If parent is a block element:
1190 # - Remember its formatting options on the block stack so they can
1191 # be used to control formatting of inline child elements.
1192 # - Set initial break type to entry-break.
1193 # - Shift prevailing indent right before generating child content.
1195 if ($par_opts->{format} eq "block")
1197 $self->begin_block ($par_name, $par_opts);
1198 $self->set_block_break_type ("entry-break");
1199 $indent += $par_opts->{"subindent"};
1202 # Variables for keeping track of whether the previous child
1203 # was a text node. Used for controlling break behavior in
1204 # non-normalized block elements: No line breaks are added around
1205 # text in such elements, nor is indenting added.
1207 my $prev_child_is_text = 0;
1208 my $cur_child_is_text = 0;
1210 foreach my $child (@{$children})
1212 $prev_child_is_text = $cur_child_is_text;
1214 # Text nodes: just add text to pending output
1216 if ($child->{type} eq "text")
1218 $cur_child_is_text = 1;
1219 $self->add_to_pending ($child->{content});
1220 next;
1223 $cur_child_is_text = 0;
1225 # Element nodes: handle depending on format type
1227 if ($child->{type} eq "elt")
1229 my $child_opts = $self->get_opts ($child->{name});
1231 # Verbatim elements:
1232 # - Print literally without change (use _stringify).
1233 # - Do not line-wrap or add any indent.
1235 if ($child_opts->{format} eq "verbatim")
1237 $self->flush_pending ($indent);
1238 $self->emit_break (0)
1239 unless $prev_child_is_text && !$self->block_normalize ();
1240 $self->set_block_break_type ("element-break");
1241 $self->add_to_doc ($child->{open_tag}
1242 . $self->tree_stringify ($child->{content})
1243 . $child->{close_tag});
1244 next;
1247 # Inline elements:
1248 # - Do not break or indent.
1249 # - Do not line-wrap content; just add content to pending output
1250 # and let it be wrapped as part of parent's content.
1252 if ($child_opts->{format} eq "inline")
1254 $self->add_to_pending ($child->{open_tag});
1255 $self->tree_format ($child->{name}, $child->{content}, $indent);
1256 $self->add_to_pending ($child->{close_tag});
1257 next;
1260 # If we get here, node is a block element.
1262 # - Break and flush any pending output
1263 # - Break and indent (no indent if break count is zero)
1264 # - Process element itself:
1265 # - Put out opening tag
1266 # - Put out element content
1267 # - Put out any indent needed before closing tag. None needed if:
1268 # - Element's exit-break is 0 (closing tag is not on new line,
1269 # so don't indent it)
1270 # - There is no separate closing tag (it was in <abc/> format)
1271 # - Element has no children (tags will be written as
1272 # <abc></abc>, so don't indent closing tag)
1273 # - Element has children, but the block is not normalized and
1274 # the last child is a text node
1275 # - Put out closing tag
1277 $self->flush_pending ($indent);
1278 $self->emit_break ($indent)
1279 unless $prev_child_is_text && !$self->block_normalize ();
1280 $self->set_block_break_type ("element-break");
1281 $self->add_to_doc ($child->{open_tag});
1282 $self->tree_format ($child->{name}, $child->{content}, $indent);
1283 $self->add_to_doc (" " x $indent)
1284 unless $child_opts->{"exit-break"} <= 0
1285 || $child->{close_tag} eq ""
1286 || !@{$child->{content}}
1287 || (@{$child->{content}}
1288 && $child->{content}->[$#{$child->{content}}]->{type} eq "text"
1289 && $child_opts->{normalize} eq "no");
1290 $self->add_to_doc ($child->{close_tag});
1291 next;
1294 # Comments, PIs, etc. (everything other than text and elements),
1295 # treat similarly to verbatim block:
1296 # - Flush any pending output
1297 # - Put out a break
1298 # - Add node content to collected output
1300 $self->flush_pending ($indent);
1301 $self->emit_break (0)
1302 unless $prev_child_is_text && !$self->block_normalize ();
1303 $self->set_block_break_type ("element-break");
1304 $self->add_to_doc ($child->{content});
1307 $prev_child_is_text = $cur_child_is_text;
1309 # Done processing current element's children now.
1311 # If current element is a block element:
1312 # - If there were any children, flush any pending output and put
1313 # out the exit break.
1314 # - Pop the block stack
1316 if ($par_opts->{format} eq "block")
1318 if (@{$children})
1320 $self->flush_pending ($indent);
1321 $self->set_block_break_type ("exit-break");
1322 $self->emit_break (0)
1323 unless $prev_child_is_text && !$self->block_normalize ();
1325 $self->end_block ();
1330 # Emit a break - the appropriate number of newlines according to the
1331 # enclosing block's current break type.
1333 # In addition, emit the number of spaces indicated by indent. (indent
1334 # > 0 when breaking just before emitting an element tag that should
1335 # be indented within its parent element.)
1337 # Exception: Emit no indent if break count is zero. That indicates
1338 # any following output will be written on the same output line, not
1339 # indented on a new line.
1341 # Initially, when processing a node's child list, the break type is
1342 # set to entry-break. Each subsequent break is an element-break.
1343 # (After child list has been processed, an exit-break is produced as well.)
1345 sub emit_break
1347 my ($self, $indent) = @_;
1349 # number of newlines to emit
1350 my $break_value = $self->block_break_value ();
1352 $self->add_to_doc ("\n" x $break_value);
1353 # add indent if there *was* a break
1354 $self->add_to_doc (" " x $indent) if $indent > 0 && $break_value > 0;
1358 # Flush pending output to output document collected thus far:
1359 # - Wrap pending contents as necessary, with indent before *each* line.
1360 # - Add pending text to output document (thus "flushing" it)
1361 # - Clear pending array.
1363 sub flush_pending
1365 my ($self, $indent) = @_;
1367 # Do nothing if nothing to flush
1368 return if !@{$self->{pending}};
1370 # If current block is not normalized:
1371 # - Text nodes cannot be modified (no wrapping or indent). Flush
1372 # text as is without adding a break or indent.
1373 # If current block is normalized:
1374 # - Add a break.
1375 # - If line wrap is disabled:
1376 # - Add indent if there is a break. (If there isn't a break, text
1377 # should immediately follow preceding tag, so don't add indent.)
1378 # - Add text without wrapping
1379 # - If line wrap is enabled:
1380 # - First line indent is 0 if there is no break. (Text immediately
1381 # follows preceding tag.) Otherwise first line indent is same as
1382 # prevailing indent.
1383 # - Any subsequent lines get the prevailing indent.
1385 # After flushing text, advance break type to element-break.
1387 my $s = "";
1389 if (!$self->block_normalize ())
1391 $s .= join ("", @{$self->{pending}});
1393 else
1395 $self->emit_break (0);
1396 my $wrap_len = $self->block_wrap_length ();
1397 my $break_value = $self->block_break_value ();
1398 if ($wrap_len <= 0)
1400 $s .= " " x $indent if $break_value > 0;
1401 $s .= join ("", @{$self->{pending}});
1403 else
1405 my $first_indent = ($break_value > 0 ? $indent : 0);
1406 # Wrap lines, then join by newlines (don't add one at end)
1407 my @lines = $self->line_wrap ($self->{pending},
1408 $first_indent,
1409 $indent,
1410 $wrap_len);
1411 $s .= join ("\n", @lines);
1415 $self->add_to_doc ($s);
1416 $self->{pending} = [ ];
1417 $self->set_block_break_type ("element-break");
1421 # Perform line-wrapping of string array to lines no longer than given
1422 # length (including indent).
1423 # Any word longer than line length appears by itself on line.
1424 # Return array of lines (not newline-terminated).
1426 # $strs - reference to array of text items to be joined and line-wrapped.
1427 # Each item may be:
1428 # - A tag (such as <emphasis role="bold">). This should be treated as
1429 # an atomic unit, which is important for preserving inline tags intact.
1430 # - A possibly multi-word string (such as "This is a string"). In this
1431 # latter case, line-wrapping preserves internal whitespace in the
1432 # string, with the exception that if whitespace would be placed at
1433 # the end of a line, it is discarded.
1435 # $first_indent - indent for first line
1436 # $rest_indent - indent for any remaining lines
1437 # $max_len - maximum length of output lines (including indent)
1439 sub line_wrap
1441 my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_;
1443 # First, tokenize the strings
1445 my @words = ();
1446 foreach my $str (@{$strs})
1448 if ($str =~ /^</)
1450 # String is a tag; treat as atomic unit and don't split
1451 push (@words, $str);
1453 else
1455 # String of white and non-white tokens.
1456 # Tokenize into white and non-white tokens.
1457 push (@words, ($str =~ /\S+|\s+/g));
1461 # Now merge tokens that are not separated by whitespace tokens. For
1462 # example, "<i>", "word", "</i>" gets merged to "<i>word</i>". But
1463 # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.
1465 my @words2 = ();
1466 foreach my $word (@words)
1468 # If there is a previous word that does not end with whitespace,
1469 # and the currrent word does not begin with whitespace, concatenate
1470 # current word to previous word. Otherwise append current word to
1471 # end of list of words.
1472 if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/)
1474 $words2[$#words2] .= $word;
1476 else
1478 push (@words2, $word);
1482 my @lines = ();
1483 my $line = "";
1484 my $llen = 0;
1485 # set the indent for the first line
1486 my $indent = $first_indent;
1487 # saved-up whitespace to put before next non-white word
1488 my $white = "";
1490 foreach my $word (@words2) # ... while words remain to wrap
1492 # If word is whitespace, save it. It gets added before next
1493 # word if no line-break occurs.
1494 if ($word =~ /^\s/)
1496 $white .= $word;
1497 next;
1499 my $wlen = length ($word);
1500 if ($llen == 0)
1502 # New output line; it gets at least one word (discard any
1503 # saved whitespace)
1504 $line = " " x $indent . $word;
1505 $llen = $indent + $wlen;
1506 $indent = $rest_indent;
1507 $white = "";
1508 next;
1510 if ($llen + length ($white) + $wlen > $max_len)
1512 # Word (plus saved whitespace) won't fit on current line.
1513 # Begin new line (discard any saved whitespace).
1514 push (@lines, $line);
1515 $line = " " x $indent . $word;
1516 $llen = $indent + $wlen;
1517 $indent = $rest_indent;
1518 $white = "";
1519 next;
1521 # add word to current line with saved whitespace between
1522 $line .= $white . $word;
1523 $llen += length ($white) + $wlen;
1524 $white = "";
1527 # push remaining line, if any
1528 push (@lines, $line) if $line ne "";
1530 return @lines;
1535 # ----------------------------------------------------------------------
1537 # Begin main program
1539 package main;
1542 my $usage = <<EOF;
1543 Usage: $PROG_NAME [options] xml-file
1545 Options:
1546 --help, -h
1547 Print this message and exit.
1548 --backup suffix -b suffix
1549 Back up the input document, adding suffix to the input
1550 filename to create the backup filename.
1551 --canonized-output
1552 Proceed only as far as the document canonization stage,
1553 printing the result.
1554 --check-parser
1555 Parse the document into tokens and verify that their
1556 concatenation is identical to the original input document.
1557 This option suppresses further document processing.
1558 --config-file file_name, -f file_name
1559 Specify the configuration filename. If no file is named,
1560 xmlformat uses the file named by the environment variable
1561 XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
1562 exists. Otherwise, xmlformat uses built-in formatting
1563 options.
1564 --in-place, -i
1565 Format the document in place, replacing the contents of
1566 the input file with the reformatted document. (It's a
1567 good idea to use --backup along with this option.)
1568 --show-config
1569 Show configuration options after reading configuration
1570 file. This option suppresses document processing.
1571 --show-unconfigured-elements
1572 Show elements that are used in the document but for
1573 which no options were specified in the configuration
1574 file. This option suppresses document output.
1575 --verbose, -v
1576 Be verbose about processing stages.
1577 --version, -V
1578 Show version information and exit.
1581 # Variables for command line options; most are undefined initially.
1582 my $help;
1583 my $backup_suffix;
1584 my $conf_file;
1585 my $canonize_only;
1586 my $check_parser;
1587 my $in_place;
1588 my $show_conf;
1589 my $show_unconf_elts;
1590 my $show_version;
1591 my $verbose;
1593 GetOptions (
1594 # =i means an integer argument is required after the option
1595 # =s means a string argument is required after the option
1596 # :s means a string argument is optional after the option
1597 "help|h" => \$help, # print help message
1598 "backup|b=s" => \$backup_suffix, # make backup using suffix
1599 "canonized-output" => \$canonize_only, # print canonized document
1600 "check-parser" => \$check_parser, # verify parser integrity
1601 "config-file|f=s" => \$conf_file, # config file
1602 "in-place|i" => \$in_place, # format in place
1603 "show-config" => \$show_conf, # show configuration file
1604 # need better name
1605 "show-unconfigured-elements" => \$show_unconf_elts, # show unconfigured elements
1606 "verbose|v" => \$verbose, # be verbose
1607 "version|V" => \$show_version, # show version info
1608 ) or do { print "$usage\n"; exit (1); };
1610 if (defined ($help))
1612 print "$usage\n";
1613 exit (0);
1616 if (defined ($show_version))
1618 print "$PROG_NAME $PROG_VERSION ($PROG_LANG version)\n";
1619 exit (0);
1622 # --in-place option requires a named file
1624 warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
1625 if defined ($in_place) && @ARGV == 0;
1627 # --backup/-b is meaningless without --in-place
1629 if (defined ($backup_suffix))
1631 if (!defined ($in_place))
1633 die "--backup/-b option meaningless without --in-place/-i option\n";
1637 # Save input filenames
1638 my @in_file = @ARGV;
1640 my $xf = XMLFormat->new ();
1642 # If a configuration file was named explicitly, use it. An error occurs
1643 # if the file does not exist.
1645 # If no configuration file was named, fall back to:
1646 # - The file named by the environment variable XMLFORMAT_CONF, if it exists
1647 # - ./xmlformat.conf, if it exists
1649 # If no configuration file can be found at all, the built-in default options
1650 # are used. (These are set up in new().)
1652 my $env_conf_file = $ENV{XMLFORMAT_CONF};
1653 my $def_conf_file = "./xmlformat.conf";
1655 # If no config file was named, but XMLFORMAT_CONF is set, use its value
1656 # as the config file name.
1657 if (!defined ($conf_file))
1659 $conf_file = $env_conf_file if defined ($env_conf_file);
1661 # If config file still isn't defined, use the default file if it exists.
1662 if (!defined ($conf_file))
1664 if (-r $def_conf_file && ! -d $def_conf_file)
1666 $conf_file = $def_conf_file;
1669 if (defined ($conf_file))
1671 warn "Reading configuration file...\n" if $verbose;
1672 die "Configuration file '$conf_file' is not readable.\n" if ! -r $conf_file;
1673 die "Configuration file '$conf_file' is a directory.\n" if -d $conf_file;
1674 $xf->read_config ($conf_file)
1677 if ($show_conf) # show configuration and exit
1679 $xf->display_config ();
1680 exit(0);
1683 my ($in_doc, $out_doc);
1685 # Process arguments.
1686 # - If no files named, read string, write to stdout.
1687 # - If files named, read and process each one. Write output to stdout
1688 # unless --in-place option was given. Make backup of original file
1689 # if --backup option was given.
1691 if (@ARGV == 0)
1693 warn "Reading document...\n" if $verbose;
1695 local $/ = undef;
1696 $in_doc = <>; # slurp input document as single string
1699 $out_doc = $xf->process_doc ($in_doc,
1700 $verbose, $check_parser, $canonize_only, $show_unconf_elts);
1701 if (defined ($out_doc))
1703 warn "Writing output document...\n" if $verbose;
1704 print $out_doc;
1707 else
1709 foreach my $file (@ARGV)
1711 warn "Reading document $file...\n" if $verbose;
1712 open (IN, $file)
1713 or die "Cannot read $file: $!\n";
1715 local $/ = undef;
1716 $in_doc = <IN>; # slurp input document as single string
1718 close (IN);
1719 $out_doc = $xf->process_doc ($in_doc,
1720 $verbose, $check_parser, $canonize_only, $show_unconf_elts);
1721 next unless defined ($out_doc);
1722 if (defined ($in_place))
1724 if (defined ($backup_suffix))
1726 warn "Making backup of $file to $file$backup_suffix...\n" if $verbose;
1727 rename ($file, $file . $backup_suffix)
1728 or die "Could not rename $file to $file$backup_suffix: $!\n";
1730 warn "Writing output document to $file...\n" if $verbose;
1731 open (OUT, ">$file") or die "Cannot write to $file: $!\n";
1732 print OUT $out_doc;
1733 close (OUT);
1735 else
1737 warn "Writing output document...\n" if $verbose;
1738 print $out_doc;
1743 warn "Done!\n" if $verbose;
1745 exit (0);