manual/xmlformat/xmlformat.pl

   1 #! /usr/bin/perl -w
   2 # vim:set ts=2 sw=2 expandtab:
   3
   4 # xmlformat - configurable XML file formatter/pretty-printer
   5
   6 # Copyright (c) 2004, 2005 Kitebird, LLC.  All rights reserved.
   7 # Some portions are based on the REX shallow XML parser, which
   8 # is Copyright (c) 1998, Robert D. Cameron. These include the
   9 # regular expression parsing variables and the shallow_parse()
  10 # method.
  11 # This software is licensed as described in the file LICENSE,
  12 # which you should have received as part of this distribution.
  13
  14 # Syntax: xmlformat [config-file] xml-file
  15
  16 # Default config file is $ENV{XMLFORMAT_CONF} or ./xmlformat.conf, in that
  17 # order.
  18
  19 # Paul DuBois
  20 # paul@kitebird.com
  21 # 2003-12-14
  22
  23 # The input document first is parsed into a list of strings.  Each string
  24 # represents one of the following:
  25 # - text node
  26 # - processing instruction (the XML declaration is treated as a PI)
  27 # - comment
  28 # - CDATA section
  29 # - DOCTYPE declaration
  30 # - element tag (either <abc>, </abc>, or <abc/>), *including attributes*
  31
  32 # Entities are left untouched. They appear in their original form as part
  33 # of the text node in which they occur.
  34
  35 # The list of strings then is converted to a hierarchical structure.
  36 # The document top level is represented by a reference to a list.
  37 # Each list element is a reference to a node -- a hash that has "type"
  38 # and "content" key/value pairs. The "type" key indicates the node
  39 # type and has one of the following values:
  40
  41 # "text"    - text node
  42 # "pi"      - processing instruction node
  43 # "comment" - comment node
  44 # "CDATA"   - CDATA section node
  45 # "DOCTYPE" - DOCTYPE node
  46 # "elt"     - element node
  47
  48 # (For purposes of this program, it's really only necessary to have "text",
  49 # "elt", and "other".  The types other than "text" and "elt" currently are
  50 # all treated the same way.)
  51
  52 # For all but element nodes, the "content" value is the text of the node.
  53
  54 # For element nodes, the "content" hash is a reference to a list of
  55 # nodes for the element's children. In addition, an element node has
  56 # three additional key/value pairs:
  57 # - The "name" value is the tag name within the opening tag, minus angle
  58 #   brackets or attributes.
  59 # - The "open_tag" value is the full opening tag, which may also be the
  60 #   closing tag.
  61 # - The "close_tag" value depends on the opening tag.  If the open tag is
  62 #   "<abc>", the close tag is "</abc>". If the open tag is "<abc/>", the
  63 #   close tag is the empty string.
  64
  65 # If the tree structure is converted back into a string with
  66 # tree_stringify(), the result can be compared to the input file
  67 # as a regression test. The string should be identical to the original
  68 # input document.
  69
  70 use strict;
  71
  72 use Getopt::Long;
  73 $Getopt::Long::ignorecase = 0; # options are case sensitive
  74 $Getopt::Long::bundling = 1;   # allow short options to be bundled
  75
  76 my $PROG_NAME = "xmlformat";
  77 my $PROG_VERSION = "1.04";
  78 my $PROG_LANG = "Perl";
  79
  80 # ----------------------------------------------------------------------
  81
  82 package XMLFormat;
  83
  84 use strict;
  85
  86 # ----------------------------------------------------------------------
  87
  88 # Regular expressions for parsing document components. Based on REX.
  89
  90 # SPE = shallow parsing expression
  91 # SE = scanning expression
  92 # CE = completion expression
  93 # RSB = right square brackets
  94 # QM = question mark
  95
  96 my $TextSE = "[^<]+";
  97 my $UntilHyphen = "[^-]*-";
  98 my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
  99 my $CommentCE = "$Until2Hyphens>?";
 100 my $UntilRSBs = "[^\\]]*\\](?:[^\\]]+\\])*\\]+";
 101 my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
 102 my $S = "[ \\n\\t\\r]+";
 103 my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
 104 my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
 105 my $Name = "(?:$NameStrt)(?:$NameChar)*";
 106 my $QuoteSE = "\"[^\"]*\"|'[^']*'";
 107 my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
 108 my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
 109 my $S1 = "[\\n\\r\\t ]";
 110 my $UntilQMs = "[^?]*\\?+";
 111 my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
 112 my $DT_ItemSE =
 113 "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
 114 my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*\\](?:$S)?)?>?";
 115 my $DeclCE =
 116 "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
 117 my $PI_CE = "$Name(?:$PI_Tail)?";
 118 my $EndTagCE = "$Name(?:$S)?>?";
 119 my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
 120 my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
 121 my $MarkupSPE =
 122 "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
 123 my $XML_SPE = "$TextSE|$MarkupSPE";
 124
 125 # ----------------------------------------------------------------------
 126
 127 # Allowable options and their possible values:
 128 # - The keys of this hash are the allowable option names
 129 # - The value for each key is list of allowable option values
 130 # - If the value is undef, the option value must be numeric
 131 # If any new formatting option is added to this program, it
 132 # must be specified here, *and* a default value for it should
 133 # be listed in the *DOCUMENT and *DEFAULT pseudo-element
 134 # option hashes.
 135
 136 my %opt_list = (
 137   "format"    => [ "block", "inline", "verbatim" ],
 138   "normalize"   => [ "yes", "no" ],
 139   "subindent"   => undef,
 140   "wrap-length" => undef,
 141   "entry-break" => undef,
 142   "exit-break"  => undef,
 143   "element-break" => undef
 144 );
 145
 146 # Object creation: set up the default formatting configuration
 147 # and variables for maintaining input and output document.
 148
 149 sub new
 150 {
 151 my $type = shift;
 152
 153   my $self = {};
 154
 155   # Formatting options for each element.
 156
 157   $self->{elt_opts} = { };
 158
 159   # The formatting options for the *DOCUMENT and *DEFAULT pseudo-elements can
 160   # be overridden in the configuration file, but the options must also be
 161   # built in to make sure they exist if not specified in the configuration
 162   # file.  Each of the structures must have a value for every option.
 163
 164   # Options for top-level document children.
 165   # - Do not change entry-break: 0 ensures no extra newlines before
 166   #   first element of output.
 167   # - Do not change exit-break: 1 ensures a newline after final element
 168   #   of output document.
 169   # - It's probably best not to change any of the others, except perhaps
 170   #   if you want to increase the element-break.
 171
 172   $self->{elt_opts}->{"*DOCUMENT"} = {
 173     "format"    => "block",
 174     "normalize"   => "no",
 175     "subindent"   => 0,
 176     "wrap-length" => 0,
 177     "entry-break" => 0, # do not change
 178     "exit-break"  => 1, # do not change
 179     "element-break" => 1
 180   };
 181
 182   # Default options. These are used for any elements in the document
 183   # that are not specified explicitly in the configuration file.
 184
 185   $self->{elt_opts}->{"*DEFAULT"} = {
 186     "format"    => "block",
 187     "normalize"   => "no",
 188     "subindent"   => 1,
 189     "wrap-length" => 0,
 190     "entry-break" => 1,
 191     "exit-break"  => 1,
 192     "element-break" => 1
 193   };
 194
 195   # Run the *DOCUMENT and *DEFAULT options through the option-checker
 196   # to verify that the built-in values are legal.
 197
 198   my $err_count = 0;
 199
 200   foreach my $elt_name (keys (%{$self->{elt_opts}}))  # ... for each element
 201   {
 202     # Check each option for element
 203     while (my ($opt_name, $opt_val) = each (%{$self->{elt_opts}->{$elt_name}}))
 204     {
 205       my $err_msg;
 206
 207       ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
 208       if (!defined ($err_msg))
 209       {
 210         $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
 211       }
 212       else
 213       {
 214         warn "LOGIC ERROR: $elt_name default option is invalid\n";
 215         warn "$err_msg\n";
 216         ++$err_count;
 217       }
 218     }
 219   }
 220
 221   # Make sure that the every option is represented in the
 222   # *DOCUMENT and *DEFAULT structures.
 223
 224   foreach my $opt_name (keys (%opt_list))
 225   {
 226     foreach my $elt_name (keys (%{$self->{elt_opts}}))
 227     {
 228       if (!exists ($self->{elt_opts}->{$elt_name}->{$opt_name}))
 229       {
 230         warn "LOGIC ERROR: $elt_name has no default '$opt_name' option\n";
 231         ++$err_count;
 232       }
 233     }
 234   }
 235
 236   die "Cannot continue; internal default formatting options must be fixed\n"
 237     if $err_count > 0;
 238
 239   bless $self, $type;    # bless object and return it
 240 }
 241
 242 # Initialize the variables that are used per-document
 243
 244 sub init_doc_vars
 245 {
 246 my $self = shift;
 247
 248   # Elements that are used in the document but not named explicitly
 249   # in the configuration file.
 250
 251   $self->{unconf_elts} = { };
 252
 253   # List of tokens for current document.
 254
 255   $self->{tokens} = [ ];
 256
 257   # List of line numbers for each token
 258
 259   $self->{line_num} = [ ];
 260
 261   # Document node tree (constructed from the token list).
 262
 263   $self->{tree} = [ ];
 264
 265   # Variables for formatting operations:
 266   # out_doc = resulting output document (constructed from document tree)
 267   # pending = array of pending tokens being held until flushed
 268
 269   $self->{out_doc} = "";
 270   $self->{pending} = [ ];
 271
 272   # Inline elements within block elements are processed using the
 273   # text normalization (and possible line-wrapping) values of their
 274   # enclosing block. Blocks and inlines may be nested, so we maintain
 275   # a stack that allows the normalize/wrap-length values of the current
 276   # block to be determined.
 277
 278   $self->{block_name_stack} = [ ];  # for debugging
 279   $self->{block_opts_stack} = [ ];
 280
 281   # A similar stack for maintaining each block's current break type.
 282
 283   $self->{block_break_type_stack} = [ ];
 284 }
 285
 286 # Accessors for token list and resulting output document
 287
 288 sub tokens
 289 {
 290 my $self = shift;
 291
 292   return $self->{tokens};
 293 }
 294
 295 sub out_doc
 296 {
 297 my $self = shift;
 298
 299   return $self->{out_doc};
 300 }
 301
 302
 303 # Methods for adding strings to output document or
 304 # to the pending output array
 305
 306 sub add_to_doc
 307 {
 308 my ($self, $str) = @_;
 309
 310   $self->{out_doc} .= $str;
 311 }
 312
 313 sub add_to_pending
 314 {
 315 my ($self, $str) = @_;
 316
 317   push (@{$self->{pending}}, $str);
 318 }
 319
 320
 321 # Block stack mainenance methods
 322
 323 # Push options onto or pop options off from the stack.  When doing
 324 # this, also push or pop an element onto the break-level stack.
 325
 326 sub begin_block
 327 {
 328 my ($self, $name, $opts) = @_;
 329
 330   push (@{$self->{block_name_stack}}, $name);
 331   push (@{$self->{block_opts_stack}}, $opts);
 332   push (@{$self->{block_break_type_stack}}, "entry-break");
 333 }
 334
 335 sub end_block
 336 {
 337 my $self = shift;
 338
 339   pop (@{$self->{block_name_stack}});
 340   pop (@{$self->{block_opts_stack}});
 341   pop (@{$self->{block_break_type_stack}});
 342 }
 343
 344 # Return the current block's normalization status or wrap length
 345
 346 sub block_normalize
 347 {
 348 my $self = shift;
 349
 350   my $size = @{$self->{block_opts_stack}};
 351   my $opts = $self->{block_opts_stack}->[$size-1];
 352   return $opts->{normalize} eq "yes";
 353 }
 354
 355 sub block_wrap_length
 356 {
 357 my $self = shift;
 358
 359   my $size = @{$self->{block_opts_stack}};
 360   my $opts = $self->{block_opts_stack}->[$size-1];
 361   return $opts->{"wrap-length"};
 362 }
 363
 364 # Set the current block's break type, or return the number of newlines
 365 # for the block's break type
 366
 367 sub set_block_break_type
 368 {
 369 my ($self, $type) = @_;
 370
 371   my $size = @{$self->{block_break_type_stack}};
 372   $self->{block_break_type_stack}->[$size-1] = $type;
 373 }
 374
 375 sub block_break_value
 376 {
 377 my $self = shift;
 378
 379   my $size = @{$self->{block_opts_stack}};
 380   my $opts = $self->{block_opts_stack}->[$size-1];
 381   $size = @{$self->{block_break_type_stack}};
 382   my $type = $self->{block_break_type_stack}->[$size-1];
 383   return $opts->{$type};
 384 }
 385
 386
 387 # ----------------------------------------------------------------------
 388
 389 # Read configuration information.  For each element, construct a hash
 390 # containing a hash key and value for each option name and value.
 391 # After reading the file, fill in missing option values for
 392 # incomplete option structures using the *DEFAULT options.
 393
 394 sub read_config
 395 {
 396 my $self = shift;
 397 my $conf_file = shift;
 398 my @elt_names = ();
 399 my $err_msg;
 400 my $in_continuation = 0;
 401 my $saved_line = "";
 402
 403   open (FH, $conf_file) or die "Cannot read config file $conf_file: $!\n";
 404   while (<FH>)
 405   {
 406     chomp;
 407
 408     next if /^\s*($|#)/;  # skip blank lines, comments
 409     if ($in_continuation)
 410     {
 411       $_ = $saved_line . " " . $_;
 412       $saved_line = "";
 413       $in_continuation = 0;
 414     }
 415     if (!/^\s/)
 416     {
 417       # Line doesn't begin with whitespace, so it lists element names.
 418       # Names are separated by whitespace or commas, possibly followed
 419       # by a continuation character or a comment.
 420       if (/\\$/)
 421       {
 422         s/\\$//;                          # remove continuation character
 423         $saved_line = $_;
 424         $in_continuation = 1;
 425         next;
 426       }
 427       s/\s*#.*$//;                        # remove any trailing comment
 428       @elt_names = split (/[\s,]+/, $_);
 429       # make sure each name has an entry in the elt_opts structure
 430       foreach my $elt_name (@elt_names)
 431       {
 432         $self->{elt_opts}->{$elt_name} = { }
 433           unless exists ($self->{elt_opts}->{$elt_name});
 434       }
 435     }
 436     else
 437     {
 438       # Line begins with whitespace, so it contains an option
 439       # to apply to the current element list, possibly followed by
 440       # a comment.  First check that there is a current list.
 441       # Then parse the option name/value.
 442
 443       die "$conf_file:$.: Option setting found before any "
 444           . "elements were named.\n"
 445         if !@elt_names;
 446       s/\s*#.*$//;
 447       my ($opt_name, $opt_val) = /^\s+(\S+)(?:\s+|\s*=\s*)(\S+)$/;
 448       die "$conf_file:$.: Malformed line: $_\n" unless defined ($opt_val);
 449
 450       # Check option. If illegal, die with message. Otherwise,
 451       # add option to each element in current element list
 452
 453       ($opt_val, $err_msg) = check_option ($opt_name, $opt_val);
 454       die "$conf_file:$.: $err_msg\n" if defined ($err_msg);
 455       foreach my $elt_name (@elt_names)
 456       {
 457         $self->{elt_opts}->{$elt_name}->{$opt_name} = $opt_val;
 458       }
 459     }
 460   }
 461   close (FH);
 462
 463   # For any element that has missing option values, fill in the values
 464   # using the options for the *DEFAULT pseudo-element.  This speeds up
 465   # element option lookups later.  It also makes it unnecessary to test
 466   # each option to see if it's defined: All element option structures
 467   # will have every option defined.
 468
 469   my $def_opts = $self->{elt_opts}->{"*DEFAULT"};
 470
 471   foreach my $elt_name (keys (%{$self->{elt_opts}}))
 472   {
 473     next if $elt_name eq "*DEFAULT";
 474     foreach my $opt_name (keys (%{$def_opts}))
 475     {
 476       next if exists ($self->{elt_opts}->{$elt_name}->{$opt_name}); # already set
 477       $self->{elt_opts}->{$elt_name}->{$opt_name} = $def_opts->{$opt_name};
 478     }
 479   }
 480 }
 481
 482
 483 # Check option name to make sure it's legal. Check the value to make sure
 484 # that it's legal for the name.  Return a two-element array:
 485 # (value, undef) if the option name and value are legal.
 486 # (undef, message) if an error was found; message contains error message.
 487 # For legal values, the returned value should be assigned to the option,
 488 # because it may get type-converted here.
 489
 490 sub check_option
 491 {
 492 my ($opt_name, $opt_val) = @_;
 493
 494   # - Check option name to make sure it's a legal option
 495   # - Then check the value.  If there is a list of values
 496   #   the value must be one of them.  Otherwise, the value
 497   #   must be an integer.
 498
 499   return (undef, "Unknown option name: $opt_name")
 500     unless exists ($opt_list{$opt_name});
 501   my $allowable_val = $opt_list{$opt_name};
 502   if (defined ($allowable_val))
 503   {
 504     return (undef, "Unknown '$opt_name' value: $opt_val")
 505       unless grep (/^$opt_val$/, @{$allowable_val});
 506   }
 507   else  # other options should be numeric
 508   {
 509     # "$opt_val" converts $opt_val to string for pattern match
 510     return (undef, "'$opt_name' value ($opt_val) should be an integer")
 511       unless "$opt_val" =~ /^\d+$/;
 512   }
 513   return ($opt_val, undef);
 514 }
 515
 516
 517 # Return hash of option values for a given element.  If no options are found:
 518 # - Add the element name to the list of unconfigured options.
 519 # - Assign the default options to the element.  (This way the test for the
 520 #   option fails only once.)
 521
 522 sub get_opts
 523 {
 524 my $self = shift;
 525 my $elt_name = shift;
 526
 527   my $opts = $self->{elt_opts}->{$elt_name};
 528   if (!defined ($opts))
 529   {
 530     $self->{unconf_elts}->{$elt_name} = 1;
 531     $opts = $self->{elt_opts}->{$elt_name} = $self->{elt_opts}->{"*DEFAULT"};
 532   }
 533   return $opts;
 534 }
 535
 536
 537 # Display contents of configuration options to be used to process document.
 538 # For each element named in the elt_opts structure, display its format
 539 # type, and those options that apply to the type.
 540
 541 sub display_config
 542 {
 543 my $self = shift;
 544 # Format types and the additional options that apply to each type
 545 my $format_opts = {
 546   "block" => [
 547               "entry-break",
 548               "element-break",
 549               "exit-break",
 550               "subindent",
 551               "normalize",
 552               "wrap-length"
 553               ],
 554   "inline" => [ ],
 555   "verbatim" => [ ]
 556 };
 557
 558   foreach my $elt_name (sort (keys (%{$self->{elt_opts}})))
 559   {
 560     print "$elt_name\n";
 561     my %opts = %{$self->{elt_opts}->{$elt_name}};
 562     my $format = $opts{format};
 563     # Write out format type, then options that apply to the format type
 564     print "  format = $format\n";
 565     foreach my $opt_name (@{$format_opts->{$format}})
 566     {
 567       print "  $opt_name = $opts{$opt_name}\n";
 568     }
 569     print "\n";
 570   }
 571 }
 572
 573
 574 # Display the list of elements that are used in the document but not
 575 # configured in the configuration file.
 576
 577 # Then re-unconfigure the elements so that they won't be considered
 578 # as configured for the next document, if there is one.
 579
 580 sub display_unconfigured_elements
 581 {
 582 my $self = shift;
 583
 584   my @elts = keys (%{$self->{unconf_elts}});
 585   if (@elts == 0)
 586   {
 587     print "The document contains no unconfigured elements.\n";
 588   }
 589   else
 590   {
 591     print "The following document elements were assigned no formatting options:\n";
 592     foreach my $line ($self->line_wrap ([ join (" ", sort (@elts)) ], 0, 0, 65))
 593     {
 594       print "$line\n";
 595     }
 596   }
 597
 598   foreach my $elt_name (@elts)
 599   {
 600     delete ($self->{elt_opts}->{$elt_name});
 601   }
 602 }
 603
 604 # ----------------------------------------------------------------------
 605
 606 # Main document processing routine.
 607 # - Argument is a string representing an input document
 608 # - Return value is the reformatted document, or undef. An undef return
 609 #   signifies either that an error occurred, or that some option was
 610 #   given that suppresses document output. In either case, don't write
 611 #   any output for the document.  Any error messages will already have
 612 #   been printed when this returns.
 613
 614 sub process_doc
 615 {
 616 my $self = shift;
 617 my ($doc, $verbose, $check_parser, $canonize_only, $show_unconf_elts) = @_;
 618 my $str;
 619
 620   $self->init_doc_vars ();
 621
 622   # Perform lexical parse to split document into list of tokens
 623   warn "Parsing document...\n" if $verbose;
 624   $self->shallow_parse ($doc);
 625
 626   if ($check_parser)
 627   {
 628     warn "Checking parser...\n" if $verbose;
 629     # concatentation of tokens should be identical to original document
 630     if ($doc eq join ("", @{$self->tokens ()}))
 631     {
 632       print "Parser is okay\n";
 633     }
 634     else
 635     {
 636       print "PARSER ERROR: document token concatenation differs from document\n";
 637     }
 638     return undef;
 639   }
 640
 641   # Assign input line number to each token
 642   $self->assign_line_numbers ();
 643
 644   # Look for and report any error tokens returned by parser
 645   warn "Checking document for errors...\n" if $verbose;
 646   if ($self->report_errors () > 0)
 647   {
 648     warn "Cannot continue processing document.\n";
 649     return undef;
 650   }
 651
 652   # Convert the token list to a tree structure
 653   warn "Converting document tokens to tree...\n" if $verbose;
 654   if ($self->tokens_to_tree () > 0)
 655   {
 656     warn "Cannot continue processing document.\n";
 657     return undef;
 658   }
 659
 660   # Check: Stringify the tree to convert it back to a single string,
 661   # then compare to original document string (should be identical)
 662   # (This is an integrity check on the validity of the to-tree and stringify
 663   # operations; if one or both do not work properly, a mismatch should occur.)
 664   #$str = $self->tree_stringify ();
 665   #print $str;
 666   #warn "ERROR: mismatch between document and resulting string\n" if $doc ne $str;
 667
 668   # Canonize tree to remove extraneous whitespace
 669   warn "Canonizing document tree...\n" if $verbose;
 670   $self->tree_canonize ();
 671
 672   if ($canonize_only)
 673   {
 674     print $self->tree_stringify () . "\n";
 675     return undef;
 676   }
 677
 678   # One side-effect of canonizing the tree is that the formatting
 679   # options are looked up for each element in the document.  That
 680   # causes the list of elements that have no explicit configuration
 681   # to be built.  Display the list and return if user requested it.
 682
 683   if ($show_unconf_elts)
 684   {
 685     $self->display_unconfigured_elements ();
 686     return undef;
 687   }
 688
 689   # Format the tree to produce formatted XML as a single string
 690   warn "Formatting document tree...\n" if $verbose;
 691   $self->tree_format ();
 692
 693   # If the document is not empty, add a newline and emit a warning if
 694   # reformatting failed to add a trailing newline.  This shouldn't
 695   # happen if the *DOCUMENT options are set up with exit-break = 1,
 696   # which is the reason for the warning rather than just silently
 697   # adding the newline.
 698
 699   $str = $self->out_doc ();
 700   if ($str ne "" && $str !~ /\n$/)
 701   {
 702     warn "LOGIC ERROR: trailing newline had to be added\n";
 703     $str .= "\n";
 704   }
 705
 706   return $str;
 707 }
 708
 709 # ----------------------------------------------------------------------
 710
 711 # Parse XML document into array of tokens and store array
 712
 713 sub shallow_parse
 714 {
 715 my ($self, $xml_document) = @_;
 716
 717   $self->{tokens} = [ $xml_document =~ /$XML_SPE/g ];
 718 }
 719
 720 # ----------------------------------------------------------------------
 721
 722 # Extract a tag name from a tag and return it.
 723
 724 # Dies if the tag cannot be found, because this is supposed to be
 725 # called only with a legal tag.
 726
 727 sub extract_tag_name
 728 {
 729 my $tag = shift;
 730
 731   die "Cannot find tag name in tag: $tag\n" unless $tag =~ /^<\/?($Name)/;
 732   return $1;
 733 }
 734
 735 # ----------------------------------------------------------------------
 736
 737 # Assign an input line number to each token.  The number indicates
 738 # the line number on which the token begins.
 739
 740 sub assign_line_numbers
 741 {
 742 my $self = shift;
 743 my $line_num = 1;
 744
 745   $self->{line_num} = [ ];
 746   for (my $i = 0; $i < @{$self->{tokens}}; $i++)
 747   {
 748     my $token = $self->{tokens}->[$i];
 749     push (@{$self->{line_num}}, $line_num);
 750     # count newlines and increment line counter (tr returns no. of matches)
 751     $line_num += ($token =~ tr/\n/\n/);
 752   }
 753 }
 754
 755 # ----------------------------------------------------------------------
 756
 757 # Check token list for errors and report any that are found. Error
 758 # tokens are those that begin with "<" but do not end with ">".
 759
 760 # Returns the error count.
 761
 762 # Does not modify the original token list.
 763
 764 sub report_errors
 765 {
 766 my $self = shift;
 767 my $err_count = 0;
 768
 769   for (my $i = 0; $i < @{$self->{tokens}}; $i++)
 770   {
 771     my $token = $self->{tokens}->[$i];
 772     if ($token =~ /^</ && $token !~ />$/)
 773     {
 774       my $line_num = $self->{line_num}->[$i];
 775       warn "Malformed token at line $line_num, token " . ($i+1) . ": $token\n";
 776       ++$err_count;
 777     }
 778   }
 779   warn "Number of errors found: $err_count\n" if $err_count > 0;
 780   return $err_count;
 781 }
 782
 783 # ----------------------------------------------------------------------
 784
 785 # Helper routine to print tag stack for tokens_to_tree
 786
 787 sub print_tag_stack
 788 {
 789 my ($label, @stack) = @_;
 790   if (@stack < 1)
 791   {
 792     warn "  $label: none\n";
 793   }
 794   else
 795   {
 796     warn "  $label:\n";
 797     for (my $i = 0; $i < @stack; $i++)
 798     {
 799       warn "  ", ($i+1), ": ", $stack[$i], "\n";
 800     }
 801   }
 802 }
 803
 804 # Convert the list of XML document tokens to a tree representation.
 805 # The implementation uses a loop and a stack rather than recursion.
 806
 807 # Does not modify the original token list.
 808
 809 # Returns an error count.
 810
 811 sub tokens_to_tree
 812 {
 813 my $self = shift;
 814
 815   my @tag_stack = ();     # stack for element tags
 816   my @children_stack = ();  # stack for lists of children
 817   my $children = [ ];     # current list of children
 818   my $err_count = 0;
 819
 820   for (my $i = 0; $i < @{$self->{tokens}}; $i++)
 821   {
 822     my $token = $self->{tokens}->[$i];
 823     my $line_num = $self->{line_num}->[$i];
 824     my $tok_err = "Error near line $line_num, token " . ($i+1) . " ($token)";
 825     if ($token !~ /^</)           # text
 826     {
 827       push (@{$children}, text_node ($token));
 828     }
 829     elsif ($token =~ /^<!--/)       # comment
 830     {
 831       push (@{$children}, comment_node ($token));
 832     }
 833     elsif ($token =~ /^<\?/)        # processing instruction
 834     {
 835       push (@{$children}, pi_node ($token));
 836     }
 837     elsif ($token =~ /^<!DOCTYPE/)      # DOCTYPE
 838     {
 839       push (@{$children}, doctype_node ($token));
 840     }
 841     elsif ($token =~ /^<!\[/)       # CDATA
 842     {
 843       push (@{$children}, cdata_node ($token));
 844     }
 845     elsif ($token =~ /^<\//)        # element close tag
 846     {
 847       if (!@tag_stack)
 848       {
 849         warn "$tok_err: Close tag w/o preceding open tag; malformed document?\n";
 850         ++$err_count;
 851         next;
 852       }
 853       if (!@children_stack)
 854       {
 855         warn "$tok_err: Empty children stack; malformed document?\n";
 856         ++$err_count;
 857         next;
 858       }
 859       my $tag = pop (@tag_stack);
 860       my $open_tag_name = extract_tag_name ($tag);
 861       my $close_tag_name = extract_tag_name ($token);
 862       if ($open_tag_name ne $close_tag_name)
 863       {
 864         warn "$tok_err: Tag mismatch; malformed document?\n";
 865         warn "  open tag: $tag\n";
 866         warn "  close tag: $token\n";
 867         print_tag_stack ("enclosing tags", @tag_stack);
 868         ++$err_count;
 869         next;
 870       }
 871       my $elt = element_node ($tag, $token, $children);
 872       $children = pop (@children_stack);
 873       push (@{$children}, $elt);
 874     }
 875     else                  # element open tag
 876     {
 877       # If we reach here, we're seeing the open tag for an element:
 878       # - If the tag is also the close tag (e.g., <abc/>), close the
 879       #   element immediately, giving it an empty child list.
 880       # - Otherwise, push tag and child list on stacks, begin new child
 881       #   list for element body.
 882       if ($token =~ /\/>$/)     # tag is of form <abc/>
 883       {
 884         push (@{$children}, element_node ($token, "", [ ]));
 885       }
 886       else              # tag is of form <abc>
 887       {
 888         push (@tag_stack, $token);
 889         push (@children_stack, $children);
 890         $children = [ ];
 891       }
 892     }
 893   }
 894
 895   # At this point, the stacks should be empty if the document is
 896   # well-formed.
 897
 898   if (@tag_stack)
 899   {
 900     warn "Error at EOF: Unclosed tags; malformed document?\n";
 901     print_tag_stack ("unclosed tags", @tag_stack);
 902     ++$err_count;
 903   }
 904   if (@children_stack)
 905   {
 906     warn "Error at EOF: Unprocessed child elements; malformed document?\n";
 907 # TODO: print out info about them
 908     ++$err_count;
 909   }
 910
 911   $self->{tree} = $children;
 912   return $err_count;
 913 }
 914
 915
 916 # Node-generating helper methods for tokens_to_tree
 917
 918 # Generic node generator
 919
 920 sub node         { return { "type" => $_[0], "content" => $_[1] }; }
 921
 922 # Generators for specific non-element nodes
 923
 924 sub text_node    { return node ("text",    $_[0]); }
 925 sub comment_node { return node ("comment", $_[0]); }
 926 sub pi_node      { return node ("pi",      $_[0]); }
 927 sub doctype_node { return node ("DOCTYPE", $_[0]); }
 928 sub cdata_node   { return node ("CDATA",   $_[0]); }
 929
 930 # For an element node, create a standard node with the type and content
 931 # key/value pairs. Then add pairs for the "name", "open_tag", and
 932 # "close_tag" hash keys.
 933
 934 sub element_node
 935 {
 936 my ($open_tag, $close_tag, $children) = @_;
 937
 938   my $elt = node ("elt", $children);
 939   # name is the open tag with angle brackets and attibutes stripped
 940   $elt->{name} = extract_tag_name ($open_tag);
 941   $elt->{open_tag} = $open_tag;
 942   $elt->{close_tag} = $close_tag;
 943   return $elt;
 944 }
 945
 946 # ----------------------------------------------------------------------
 947
 948 # Convert the given XML document tree (or subtree) to string form by
 949 # concatentating all of its components.  Argument is a reference
 950 # to a list of nodes at a given level of the tree.
 951
 952 # Does not modify the node list.
 953
 954 sub tree_stringify
 955 {
 956 my $self = shift;
 957 my $children = shift || $self->{tree}; # use entire tree if no arg;
 958 my $str = "";
 959
 960   for (my $i = 0; $i < @{$children}; $i++)
 961   {
 962     my $child = $children->[$i];
 963
 964     # - Elements have list of child nodes as content (process recursively)
 965     # - All other node types have text content
 966
 967     if ($child->{type} eq "elt")
 968     {
 969       $str .= $child->{open_tag}
 970           . $self->tree_stringify ($child->{content})
 971           . $child->{close_tag};
 972     }
 973     else
 974     {
 975       $str .= $child->{content};
 976     }
 977   }
 978   return $str;
 979 }
 980
 981 # ----------------------------------------------------------------------
 982
 983
 984 # Put tree in "canonical" form by eliminating extraneous whitespace
 985 # from element text content.
 986
 987 # $children is a list of child nodes
 988
 989 # This function modifies the node list.
 990
 991 # Canonizing occurs as follows:
 992 # - Comment, PI, DOCTYPE, and CDATA nodes remain untouched
 993 # - Verbatim elements and their descendants remain untouched
 994 # - Within non-normalized block elements:
 995 #   - Delete all-whitespace text node children
 996 #   - Leave other text node children untouched
 997 # - Within normalized block elements:
 998 #   - Convert runs of whitespace (including line-endings) to single spaces
 999 #   - Trim leading whitespace of first text node
1000 #   - Trim trailing whitespace of last text node
1001 #   - Trim whitespace that is adjacent to a verbatim or non-normalized
1002 #     sub-element.  (For example, if a <programlisting> is followed by
1003 #     more text, delete any whitespace at beginning of that text.)
1004 # - Within inline elements:
1005 #   - Normalize the same way as the enclosing block element, with the
1006 #     exception that a space at the beginning or end is not removed.
1007 #     (Otherwise, <para>three<literal> blind </literal>mice</para>
1008 #     would become <para>three<literal>blind</literal>mice</para>.)
1009
1010 sub tree_canonize
1011 {
1012 my $self = shift;
1013
1014   $self->{tree} = $self->tree_canonize2 ($self->{tree}, "*DOCUMENT");
1015 }
1016
1017
1018 sub tree_canonize2
1019 {
1020 my $self = shift;
1021 my $children = shift;
1022 my $par_name = shift;
1023
1024   # Formatting options for parent
1025   my $par_opts = $self->get_opts ($par_name);
1026
1027   # If parent is a block element, remember its formatting options on
1028   # the block stack so they can be used to control canonization of
1029   # inline child elements.
1030
1031   $self->begin_block ($par_name, $par_opts) if $par_opts->{format} eq "block";
1032
1033   # Iterate through list of child nodes to preserve, modify, or
1034   # discard whitespace.  Return resulting list of children.
1035
1036   # Canonize element and text nodes. Leave everything else (comments,
1037   # processing instructions, etc.) untouched.
1038
1039   my @new_children = ();
1040
1041   while (@{$children})
1042   {
1043     my $child = shift (@{$children});
1044
1045     if ($child->{type} eq "elt")
1046     {
1047       # Leave verbatim elements untouched. For other element nodes,
1048       # canonize child list using options appropriate to element.
1049
1050       if ($self->get_opts ($child->{name})->{format} ne "verbatim")
1051       {
1052         $child->{content} = $self->tree_canonize2 ($child->{content},
1053                             $child->{name});
1054       }
1055     }
1056     elsif ($child->{type} eq "text")
1057     {
1058       # Delete all-whitespace node or strip whitespace as appropriate.
1059
1060       # Paranoia check: We should never get here for verbatim elements,
1061       # because normalization is irrelevant for them.
1062
1063       die "LOGIC ERROR: trying to canonize verbatim element $par_name!\n"
1064         if $par_opts->{format} eq "verbatim";
1065
1066       if (!$self->block_normalize ())
1067       {
1068         # Enclosing block is not normalized:
1069         # - Delete child all-whitespace text nodes.
1070         # - Leave other text nodes untouched.
1071
1072         next if $child->{content} =~ /^\s*$/;
1073       }
1074       else
1075       {
1076         # Enclosing block is normalized, so normalize this text node:
1077         # - Convert runs of whitespace characters (including
1078         #   line-endings characters) to single spaces.
1079         # - Trim leading whitespace if this node is the first child
1080         #   of a block element or it follows a non-normalized node.
1081         # - Trim leading whitespace if this node is the last child
1082         #   of a block element or it precedes a non-normalized node.
1083
1084         # These are nil if there is no prev or next child
1085         my $prev_child = $new_children[$#new_children];
1086         my $next_child = $children->[0];
1087
1088         $child->{content} =~ s/\s+/ /g;
1089         $child->{content} =~ s/^ //
1090           if (!defined ($prev_child) && $par_opts->{format} eq "block")
1091             || $self->non_normalized_node ($prev_child);
1092         $child->{content} =~ s/ $//
1093           if (!defined ($next_child) && $par_opts->{format} eq "block")
1094             || $self->non_normalized_node ($next_child);
1095
1096         # If resulting text is empty, discard the node.
1097         next if $child->{content} =~ /^$/;
1098       }
1099     }
1100     push (@new_children, $child);
1101   }
1102
1103   # Pop block stack if parent was a block element
1104   $self->end_block () if $par_opts->{format} eq "block";
1105
1106   return \@new_children;
1107 }
1108
1109
1110 # Helper function for tree_canonize().
1111
1112 # Determine whether a node is normalized.  This is used to check
1113 # the node that is adjacent to a given text node (either previous
1114 # or following).
1115 # - No is node is nil
1116 # - No if the node is a verbatim element
1117 # - If the node is a block element, yes or no according to its
1118 #   normalize option
1119 # - No if the node is an inline element.  Inlines are normalized
1120 #   if the parent block is normalized, but this method is not called
1121 #   except while examinine normalized blocks. So its inline children
1122 #   are also normalized.
1123 # - No if node is a comment, PI, DOCTYPE, or CDATA section. These are
1124 #   treated like verbatim elements.
1125
1126 sub non_normalized_node
1127 {
1128 my $self = shift;
1129 my $node = shift;
1130
1131   return 0 if !$node;
1132   my $type = $node->{type};
1133   if ($type eq "elt")
1134   {
1135     my $node_opts = $self->get_opts ($node->{name});
1136     if ($node_opts->{format} eq "verbatim")
1137     {
1138       return 1;
1139     }
1140     if ($node_opts->{format} eq "block")
1141     {
1142       return $node_opts->{normalize} eq "no";
1143     }
1144     if ($node_opts->{format} eq "inline")
1145     {
1146       return 0;
1147     }
1148     die "LOGIC ERROR: non_normalized_node: unhandled node format.\n";
1149   }
1150   if ($type eq "comment" || $type eq "pi" || $type eq "DOCTYPE"
1151             || $type eq "CDATA")
1152   {
1153     return 1;
1154   }
1155   if ($type eq "text")
1156   {
1157     die "LOGIC ERROR: non_normalized_node: got called for text node.\n";
1158   }
1159   die "LOGIC ERROR: non_normalized_node: unhandled node type.\n";
1160 }
1161
1162 # ----------------------------------------------------------------------
1163
1164 # Format (pretty-print) the document tree
1165
1166 # Does not modify the node list.
1167
1168 # The class maintains two variables for storing output:
1169 # - out_doc stores content that has been seen and "flushed".
1170 # - pending stores an array of strings (content of text nodes and inline
1171 #   element tags).  These are held until they need to be flushed, at
1172 #   which point they are concatenated and possibly wrapped/indented.
1173 #   Flushing occurs when a break needs to be written, which happens
1174 #   when something other than a text node or inline element is seen.
1175
1176 # If parent name and children are not given, format the entire document.
1177 # Assume prevailing indent = 0 if not given.
1178
1179 sub tree_format
1180 {
1181 my $self = shift;
1182 my $par_name = shift || "*DOCUMENT";    # format entire document if no arg
1183 my $children = shift || $self->{tree};  # use entire tree if no arg
1184 my $indent = shift || 0;
1185
1186   # Formatting options for parent element
1187   my $par_opts = $self->get_opts ($par_name);
1188
1189   # If parent is a block element:
1190   # - Remember its formatting options on the block stack so they can
1191   #   be used to control formatting of inline child elements.
1192   # - Set initial break type to entry-break.
1193   # - Shift prevailing indent right before generating child content.
1194
1195   if ($par_opts->{format} eq "block")
1196   {
1197     $self->begin_block ($par_name, $par_opts);
1198     $self->set_block_break_type ("entry-break");
1199     $indent += $par_opts->{"subindent"};
1200   }
1201
1202   # Variables for keeping track of whether the previous child
1203   # was a text node. Used for controlling break behavior in
1204   # non-normalized block elements: No line breaks are added around
1205   # text in such elements, nor is indenting added.
1206
1207   my $prev_child_is_text = 0;
1208   my $cur_child_is_text = 0;
1209
1210   foreach my $child (@{$children})
1211   {
1212     $prev_child_is_text = $cur_child_is_text;
1213
1214     # Text nodes: just add text to pending output
1215
1216     if ($child->{type} eq "text")
1217     {
1218       $cur_child_is_text = 1;
1219       $self->add_to_pending ($child->{content});
1220       next;
1221     }
1222
1223     $cur_child_is_text = 0;
1224
1225     # Element nodes: handle depending on format type
1226
1227     if ($child->{type} eq "elt")
1228     {
1229       my $child_opts = $self->get_opts ($child->{name});
1230
1231       # Verbatim elements:
1232       # - Print literally without change (use _stringify).
1233       # - Do not line-wrap or add any indent.
1234
1235       if ($child_opts->{format} eq "verbatim")
1236       {
1237         $self->flush_pending ($indent);
1238         $self->emit_break (0)
1239           unless $prev_child_is_text && !$self->block_normalize ();
1240         $self->set_block_break_type ("element-break");
1241         $self->add_to_doc ($child->{open_tag}
1242                           . $self->tree_stringify ($child->{content})
1243                           . $child->{close_tag});
1244         next;
1245       }
1246
1247       # Inline elements:
1248       # - Do not break or indent.
1249       # - Do not line-wrap content; just add content to pending output
1250       #   and let it be wrapped as part of parent's content.
1251
1252       if ($child_opts->{format} eq "inline")
1253       {
1254         $self->add_to_pending ($child->{open_tag});
1255         $self->tree_format ($child->{name}, $child->{content}, $indent);
1256         $self->add_to_pending ($child->{close_tag});
1257         next;
1258       }
1259
1260       # If we get here, node is a block element.
1261
1262       # - Break and flush any pending output
1263       # - Break and indent (no indent if break count is zero)
1264       # - Process element itself:
1265       #   - Put out opening tag
1266       #   - Put out element content
1267       #   - Put out any indent needed before closing tag. None needed if:
1268       #     - Element's exit-break is 0 (closing tag is not on new line,
1269       #       so don't indent it)
1270       #     - There is no separate closing tag (it was in <abc/> format)
1271       #     - Element has no children (tags will be written as
1272       #       <abc></abc>, so don't indent closing tag)
1273       #     - Element has children, but the block is not normalized and
1274       #       the last child is a text node
1275       #   - Put out closing tag
1276
1277       $self->flush_pending ($indent);
1278       $self->emit_break ($indent)
1279         unless $prev_child_is_text && !$self->block_normalize ();
1280       $self->set_block_break_type ("element-break");
1281       $self->add_to_doc ($child->{open_tag});
1282       $self->tree_format ($child->{name}, $child->{content}, $indent);
1283       $self->add_to_doc (" " x $indent)
1284         unless $child_opts->{"exit-break"} <= 0
1285         || $child->{close_tag} eq ""
1286         || !@{$child->{content}}
1287         || (@{$child->{content}}
1288               && $child->{content}->[$#{$child->{content}}]->{type} eq "text"
1289               && $child_opts->{normalize} eq "no");
1290       $self->add_to_doc ($child->{close_tag});
1291       next;
1292     }
1293
1294     # Comments, PIs, etc. (everything other than text and elements),
1295     # treat similarly to verbatim block:
1296     # - Flush any pending output
1297     # - Put out a break
1298     # - Add node content to collected output
1299
1300     $self->flush_pending ($indent);
1301     $self->emit_break (0)
1302       unless $prev_child_is_text && !$self->block_normalize ();
1303     $self->set_block_break_type ("element-break");
1304     $self->add_to_doc ($child->{content});
1305   }
1306
1307   $prev_child_is_text = $cur_child_is_text;
1308
1309   # Done processing current element's children now.
1310
1311   # If current element is a block element:
1312   # - If there were any children, flush any pending output and put
1313   #   out the exit break.
1314   # - Pop the block stack
1315
1316   if ($par_opts->{format} eq "block")
1317   {
1318     if (@{$children})
1319     {
1320       $self->flush_pending ($indent);
1321       $self->set_block_break_type ("exit-break");
1322       $self->emit_break (0)
1323         unless $prev_child_is_text && !$self->block_normalize ();
1324     }
1325     $self->end_block ();
1326   }
1327 }
1328
1329
1330 # Emit a break - the appropriate number of newlines according to the
1331 # enclosing block's current break type.
1332
1333 # In addition, emit the number of spaces indicated by indent.  (indent
1334 # > 0 when breaking just before emitting an element tag that should
1335 # be indented within its parent element.)
1336
1337 # Exception: Emit no indent if break count is zero. That indicates
1338 # any following output will be written on the same output line, not
1339 # indented on a new line.
1340
1341 # Initially, when processing a node's child list, the break type is
1342 # set to entry-break. Each subsequent break is an element-break.
1343 # (After child list has been processed, an exit-break is produced as well.)
1344
1345 sub emit_break
1346 {
1347 my ($self, $indent) = @_;
1348
1349   # number of newlines to emit
1350   my $break_value = $self->block_break_value ();
1351
1352   $self->add_to_doc ("\n" x $break_value);
1353   # add indent if there *was* a break
1354   $self->add_to_doc (" " x $indent) if $indent > 0 && $break_value > 0;
1355 }
1356
1357
1358 # Flush pending output to output document collected thus far:
1359 # - Wrap pending contents as necessary, with indent before *each* line.
1360 # - Add pending text to output document (thus "flushing" it)
1361 # - Clear pending array.
1362
1363 sub flush_pending
1364 {
1365 my ($self, $indent) = @_;
1366
1367   # Do nothing if nothing to flush
1368   return if !@{$self->{pending}};
1369
1370   # If current block is not normalized:
1371   # - Text nodes cannot be modified (no wrapping or indent).  Flush
1372   #   text as is without adding a break or indent.
1373   # If current block is normalized:
1374   # - Add a break.
1375   # - If line wrap is disabled:
1376   #   - Add indent if there is a break. (If there isn't a break, text
1377   #     should immediately follow preceding tag, so don't add indent.)
1378   #   - Add text without wrapping
1379   # - If line wrap is enabled:
1380   #   - First line indent is 0 if there is no break. (Text immediately
1381   #     follows preceding tag.) Otherwise first line indent is same as
1382   #     prevailing indent.
1383   #   - Any subsequent lines get the prevailing indent.
1384
1385   # After flushing text, advance break type to element-break.
1386
1387   my $s = "";
1388
1389   if (!$self->block_normalize ())
1390   {
1391     $s .= join ("", @{$self->{pending}});
1392   }
1393   else
1394   {
1395     $self->emit_break (0);
1396     my $wrap_len = $self->block_wrap_length ();
1397     my $break_value = $self->block_break_value ();
1398     if ($wrap_len <= 0)
1399     {
1400       $s .= " " x $indent if $break_value > 0;
1401       $s .= join ("", @{$self->{pending}});
1402     }
1403     else
1404     {
1405       my $first_indent = ($break_value > 0 ? $indent : 0);
1406       # Wrap lines, then join by newlines (don't add one at end)
1407       my @lines = $self->line_wrap ($self->{pending},
1408                   $first_indent,
1409                   $indent,
1410                   $wrap_len);
1411       $s .= join ("\n", @lines);
1412     }
1413   }
1414
1415   $self->add_to_doc ($s);
1416   $self->{pending} = [ ];
1417   $self->set_block_break_type ("element-break");
1418 }
1419
1420
1421 # Perform line-wrapping of string array to lines no longer than given
1422 # length (including indent).
1423 # Any word longer than line length appears by itself on line.
1424 # Return array of lines (not newline-terminated).
1425
1426 # $strs - reference to array of text items to be joined and line-wrapped.
1427 # Each item may be:
1428 # - A tag (such as <emphasis role="bold">). This should be treated as
1429 #   an atomic unit, which is important for preserving inline tags intact.
1430 # - A possibly multi-word string (such as "This is a string"). In this
1431 #   latter case, line-wrapping preserves internal whitespace in the
1432 #   string, with the exception that if whitespace would be placed at
1433 #   the end of a line, it is discarded.
1434
1435 # $first_indent - indent for first line
1436 # $rest_indent - indent for any remaining lines
1437 # $max_len - maximum length of output lines (including indent)
1438
1439 sub line_wrap
1440 {
1441 my ($self, $strs, $first_indent, $rest_indent, $max_len) = @_;
1442
1443   # First, tokenize the strings
1444
1445   my @words = ();
1446   foreach my $str (@{$strs})
1447   {
1448     if ($str =~ /^</)
1449     {
1450       # String is a tag; treat as atomic unit and don't split
1451       push (@words, $str);
1452     }
1453     else
1454     {
1455       # String of white and non-white tokens.
1456       # Tokenize into white and non-white tokens.
1457       push (@words, ($str =~ /\S+|\s+/g));
1458     }
1459   }
1460
1461   # Now merge tokens that are not separated by whitespace tokens. For
1462   # example, "<i>", "word", "</i>" gets merged to "<i>word</i>".  But
1463   # "<i>", " ", "word", " ", "</i>" gets left as separate tokens.
1464
1465   my @words2 = ();
1466   foreach my $word (@words)
1467   {
1468     # If there is a previous word that does not end with whitespace,
1469     # and the currrent word does not begin with whitespace, concatenate
1470     # current word to previous word.  Otherwise append current word to
1471     # end of list of words.
1472     if (@words2 && $words2[$#words2] !~ /\s$/ && $word !~ /^\s/)
1473     {
1474       $words2[$#words2] .= $word;
1475     }
1476     else
1477     {
1478       push (@words2, $word);
1479     }
1480   }
1481
1482   my @lines = ();
1483   my $line = "";
1484   my $llen = 0;
1485   # set the indent for the first line
1486   my $indent = $first_indent;
1487   # saved-up whitespace to put before next non-white word
1488   my $white = "";
1489
1490   foreach my $word (@words2)   # ... while words remain to wrap
1491   {
1492     # If word is whitespace, save it. It gets added before next
1493     # word if no line-break occurs.
1494     if ($word =~ /^\s/)
1495     {
1496        $white .= $word;
1497       next;
1498     }
1499     my $wlen = length ($word);
1500     if ($llen == 0)
1501     {
1502       # New output line; it gets at least one word (discard any
1503       # saved whitespace)
1504       $line = " " x $indent . $word;
1505       $llen = $indent + $wlen;
1506       $indent = $rest_indent;
1507       $white = "";
1508       next;
1509     }
1510     if ($llen + length ($white) + $wlen > $max_len)
1511     {
1512       # Word (plus saved whitespace) won't fit on current line.
1513       # Begin new line (discard any saved whitespace).
1514       push (@lines, $line);
1515       $line = " " x $indent . $word;
1516       $llen = $indent + $wlen;
1517       $indent = $rest_indent;
1518       $white = "";
1519       next;
1520     }
1521     # add word to current line with saved whitespace between
1522     $line .= $white . $word;
1523     $llen += length ($white) + $wlen;
1524     $white = "";
1525   }
1526
1527   # push remaining line, if any
1528   push (@lines, $line) if $line ne "";
1529
1530   return @lines;
1531 }
1532
1533 1;
1534
1535 # ----------------------------------------------------------------------
1536
1537 # Begin main program
1538
1539 package main;
1540
1541
1542 my $usage = <<EOF;
1543 Usage: $PROG_NAME [options] xml-file
1544
1545 Options:
1546 --help, -h
1547     Print this message and exit.
1548 --backup suffix -b suffix
1549     Back up the input document, adding suffix to the input
1550     filename to create the backup filename.
1551 --canonized-output
1552     Proceed only as far as the document canonization stage,
1553     printing the result.
1554 --check-parser
1555     Parse the document into tokens and verify that their
1556     concatenation is identical to the original input document.
1557     This option suppresses further document processing.
1558 --config-file file_name, -f file_name
1559     Specify the configuration filename. If no file is named,
1560     xmlformat uses the file named by the environment variable
1561     XMLFORMAT_CONF, if it exists, or ./xmlformat.conf, if it
1562     exists. Otherwise, xmlformat uses built-in formatting
1563     options.
1564 --in-place, -i
1565     Format the document in place, replacing the contents of
1566     the input file with the reformatted document. (It's a
1567     good idea to use --backup along with this option.)
1568 --show-config
1569     Show configuration options after reading configuration
1570     file. This option suppresses document processing.
1571 --show-unconfigured-elements
1572     Show elements that are used in the document but for
1573     which no options were specified in the configuration
1574     file. This option suppresses document output.
1575 --verbose, -v
1576     Be verbose about processing stages.
1577 --version, -V
1578     Show version information and exit.
1579 EOF
1580
1581 # Variables for command line options; most are undefined initially.
1582 my $help;
1583 my $backup_suffix;
1584 my $conf_file;
1585 my $canonize_only;
1586 my $check_parser;
1587 my $in_place;
1588 my $show_conf;
1589 my $show_unconf_elts;
1590 my $show_version;
1591 my $verbose;
1592
1593 GetOptions (
1594   # =i means an integer argument is required after the option
1595   # =s means a string argument is required after the option
1596   # :s means a string argument is optional after the option
1597   "help|h"           => \$help,          # print help message
1598   "backup|b=s"       => \$backup_suffix, # make backup using suffix
1599   "canonized-output" => \$canonize_only, # print canonized document
1600   "check-parser"     => \$check_parser,  # verify parser integrity
1601   "config-file|f=s"  => \$conf_file,     # config file
1602   "in-place|i"       => \$in_place,      # format in place
1603   "show-config"      => \$show_conf,     # show configuration file
1604   # need better name
1605   "show-unconfigured-elements" => \$show_unconf_elts,   # show unconfigured elements
1606   "verbose|v"        => \$verbose,       # be verbose
1607   "version|V"        => \$show_version,  # show version info
1608 ) or do { print "$usage\n"; exit (1); };
1609
1610 if (defined ($help))
1611 {
1612   print "$usage\n";
1613   exit (0);
1614 }
1615
1616 if (defined ($show_version))
1617 {
1618   print "$PROG_NAME $PROG_VERSION ($PROG_LANG version)\n";
1619   exit (0);
1620 }
1621
1622 # --in-place option requires a named file
1623
1624 warn "WARNING: --in-place/-i option ignored (requires named input files)\n"
1625   if defined ($in_place) && @ARGV == 0;
1626
1627 # --backup/-b is meaningless without --in-place
1628
1629 if (defined ($backup_suffix))
1630 {
1631   if (!defined ($in_place))
1632   {
1633     die "--backup/-b option meaningless without --in-place/-i option\n";
1634   }
1635 }
1636
1637 # Save input filenames
1638 my @in_file = @ARGV;
1639
1640 my $xf = XMLFormat->new ();
1641
1642 # If a configuration file was named explicitly, use it. An error occurs
1643 # if the file does not exist.
1644
1645 # If no configuration file was named, fall back to:
1646 # - The file named by the environment variable XMLFORMAT_CONF, if it exists
1647 # - ./xmlformat.conf, if it exists
1648
1649 # If no configuration file can be found at all, the built-in default options
1650 # are used. (These are set up in new().)
1651
1652 my $env_conf_file = $ENV{XMLFORMAT_CONF};
1653 my $def_conf_file = "./xmlformat.conf";
1654
1655 # If no config file was named, but XMLFORMAT_CONF is set, use its value
1656 # as the config file name.
1657 if (!defined ($conf_file))
1658 {
1659   $conf_file = $env_conf_file if defined ($env_conf_file);
1660 }
1661 # If config file still isn't defined, use the default file if it exists.
1662 if (!defined ($conf_file))
1663 {
1664   if (-r $def_conf_file && ! -d $def_conf_file)
1665   {
1666     $conf_file = $def_conf_file;
1667   }
1668 }
1669 if (defined ($conf_file))
1670 {
1671   warn "Reading configuration file...\n" if $verbose;
1672   die "Configuration file '$conf_file' is not readable.\n" if ! -r $conf_file;
1673   die "Configuration file '$conf_file' is a directory.\n"  if -d $conf_file;
1674   $xf->read_config ($conf_file)
1675 }
1676
1677 if ($show_conf)   # show configuration and exit
1678 {
1679   $xf->display_config ();
1680   exit(0);
1681 }
1682
1683 my ($in_doc, $out_doc);
1684
1685 # Process arguments.
1686 # - If no files named, read string, write to stdout.
1687 # - If files named, read and process each one. Write output to stdout
1688 #   unless --in-place option was given.  Make backup of original file
1689 #   if --backup option was given.
1690
1691 if (@ARGV == 0)
1692 {
1693   warn "Reading document...\n" if $verbose;
1694   {
1695     local $/ = undef;
1696     $in_doc = <>;            # slurp input document as single string
1697   }
1698
1699   $out_doc = $xf->process_doc ($in_doc,
1700               $verbose, $check_parser, $canonize_only, $show_unconf_elts);
1701   if (defined ($out_doc))
1702   {
1703     warn "Writing output document...\n" if $verbose;
1704     print $out_doc;
1705   }
1706 }
1707 else
1708 {
1709   foreach my $file (@ARGV)
1710   {
1711     warn "Reading document $file...\n" if $verbose;
1712     open (IN, $file)
1713       or die "Cannot read $file: $!\n";
1714     {
1715       local $/ = undef;
1716       $in_doc = <IN>;            # slurp input document as single string
1717     }
1718     close (IN);
1719     $out_doc = $xf->process_doc ($in_doc,
1720                 $verbose, $check_parser, $canonize_only, $show_unconf_elts);
1721     next unless defined ($out_doc);
1722     if (defined ($in_place))
1723     {
1724       if (defined ($backup_suffix))
1725       {
1726         warn "Making backup of $file to $file$backup_suffix...\n" if $verbose;
1727         rename ($file, $file . $backup_suffix)
1728           or die "Could not rename $file to $file$backup_suffix: $!\n";
1729       }
1730       warn "Writing output document to $file...\n" if $verbose;
1731       open (OUT, ">$file") or die "Cannot write to $file: $!\n";
1732       print OUT $out_doc;
1733       close (OUT);
1734     }
1735     else
1736     {
1737       warn "Writing output document...\n" if $verbose;
1738       print $out_doc;
1739     }
1740   }
1741 }
1742
1743 warn "Done!\n" if $verbose;
1744
1745 exit (0);