bin/docx2txt

   1 #!/usr/bin/env perl
   2
   3 # docx2txt, a command-line utility to convert Docx documents to text format.
   4 # Copyright (C) 2008-2009 Sandeep Kumar
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program; if not, write to the Free Software
  18 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19
  20 #
  21 # This script extracts text from document.xml contained inside .docx file.
  22 # Perl v5.8.2 was used for testing this script.
  23 #
  24 # Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
  25 #
  26 # ChangeLog :
  27 #
  28 #    10/08/2008 - Initial version (v0.1)
  29 #    15/08/2008 - Script takes two arguments [second optional] now and can be
  30 #                 used independently to extract text from docx file. It accepts
  31 #                 docx file directly, instead of xml file.
  32 #    18/08/2008 - Added support for center and right justification of text that
  33 #                 fits in a line 80 characters wide (adjustable).
  34 #    03/09/2008 - Fixed the slip in usage message.
  35 #    12/09/2008 - Slightly changed the script invocation and argument handling
  36 #                 to incorporate some of the shell script functionality here.
  37 #                 Added support to handle embedded urls in docx document.
  38 #    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
  39 #                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
  40 #                 during installation.
  41 #    31/08/2009 - Added support for handling more escape characters.
  42 #                 Using OS specific null device to redirect stderr.
  43 #                 Saving text file in binary mode.
  44 #    03/09/2009 - Updations based on feedback/suggestions from Sergei Kulakov
  45 #                 (sergei>AT<dewia>DOT<com).
  46 #                 - removal of non-document text in between TOC related tags.
  47 #                 - display of hyperlink alongside linked text user controlled.
  48 #                 - some character conversion updates
  49 #    05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
  50 #                 Added more character conversions.
  51 #                 Organised conversion mappings in tabular form for speedup and
  52 #                 easy maintenance.
  53 #                 Tweaked code to reduce number of passes over document content.
  54 #    10/09/2009 - For leaner text experience, hyperlink is not displayed if
  55 #                 hyperlink and hyperlinked text are same, even if user has
  56 #                 enabled hyperlink display.
  57 #                 Improved handling of short line justification. Many
  58 #                 justification tag patterns were not captured earlier.
  59 #    11/09/2009 - A directory holding the unzipped content of .docx file can
  60 #                 also be specified as argument to the script, in place of file.
  61 #    17/09/2009 - Removed trailing slashes from input directory name.
  62 #                 Updated unzip command invocations to handle path names
  63 #                 containing spaces.
  64 #    01/10/2009 - Added support for configuration file.
  65 #    02/10/2009 - Using single quotes to specify path for unzip command.
  66 #    04/10/2009 - Corrected configuration option name lineIndent to listIndent.
  67 #
  68
  69
  70 #
  71 # The default settings below can be overridden via docx2txt.config - searched
  72 # first in current directory and then in the same location as this script.
  73 #
  74
  75 our $unzip = '/usr/bin/unzip';  # Windows path like 'C:/path/to/unzip.exe'
  76 our $newLine = "\n";            # Alternative is "\r\n".
  77 our $listIndent = "  ";         # Indent nested lists by "\t", " " etc.
  78 our $lineWidth = 80;            # Line width, used for short line justification.
  79 our $showHyperLink = "N";       # Show hyperlink alongside linked text.
  80
  81
  82 # ToDo: Better list handling. Currently assumed 8 level nesting.
  83 my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');
  84
  85 #
  86 # Character conversion tables
  87 #
  88
  89 # Only amp, gt and lt are required for docx escapes, others are used for better
  90 # text experience.
  91 my %escChrs = ( amp => '&', gt => '>', lt => '<',
  92                 acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
  93                 laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
  94                 reg => '(R)', shy => '-', times => 'x'
  95 );
  96
  97 my %splchars = (
  98         "\xC2\xA0" => ' ',              # <nbsp>
  99         "\xC2\xA6" => '|',              # <brokenbar>
 100         "\xC2\xA9" => '(C)',            # <copyright>
 101         "\xC2\xAB" => '<<',             # <laquo>
 102         "\xC2\xAC" => '-',              # <negate>
 103         "\xC2\xAE" => '(R)',            # <regd>
 104         "\xC2\xB1" => '+-',             # <plusminus>
 105         "\xC2\xBB" => '>>',             # <raquo>
 106
 107 #       "\xC2\xA7" => '',               # <section>
 108 #       "\xC2\xB6" => '',               # <para>
 109
 110         "\xC3\x97" => 'x',              # <mul>
 111         "\xC3\xB7" => '/',              # <div>
 112
 113         "\xE2\x80\x82" => '  ',         # <enspc>
 114         "\xE2\x80\x83" => '  ',         # <emspc>
 115         "\xE2\x80\x85" => ' ',          # <qemsp>
 116         "\xE2\x80\x93" => ' - ',        # <endash>
 117         "\xE2\x80\x94" => ' -- ',       # <emdash>
 118         "\xE2\x80\x98" => '`',          # <soq>
 119         "\xE2\x80\x99" => '\'',         # <scq>
 120         "\xE2\x80\x9C" => '"',          # <doq>
 121         "\xE2\x80\x9D" => '"',          # <dcq>
 122         "\xE2\x80\xA2" => '::',         # <diamond symbol>
 123         "\xE2\x80\xA6" => '...',        # <ellipsis>
 124
 125         "\xE2\x84\xA2" => '(TM)',       # <trademark>
 126
 127         "\xE2\x89\xA0" => '!=',         # <neq>
 128         "\xE2\x89\xA4" => '<=',         # <leq>
 129         "\xE2\x89\xA5" => '>=',         # <geq>
 130
 131         #
 132         # Currency symbols
 133         #
 134         "\xC2\xA2" => 'cent',
 135         "\xC2\xA3" => 'Pound',
 136         "\xC2\xA5" => 'Yen',
 137         "\xE2\x82\xAC" => 'Euro'
 138 );
 139
 140
 141 #
 142 # Check argument(s) sanity.
 143 #
 144
 145 my $usage = <<USAGE;
 146
 147 Usage:  $0 <infile.docx> [outfile.txt|-]
 148
 149         Use '-' as the outfile name to dump the text on STDOUT.
 150         Output is saved in infile.txt if second argument is omitted.
 151
 152         infile.docx can also be a directory name holding the unzipped content
 153         of concerned .docx file.
 154
 155 USAGE
 156
 157 die $usage if (@ARGV == 0 || @ARGV > 2);
 158
 159
 160 #
 161 # Check for existence and readability of required file in specified directory,
 162 # and whether it is a text file.
 163 #
 164
 165 sub check_for_required_file_in_folder {
 166     stat("$_[1]/$_[0]");
 167     die "Can't read <$_[0]> in <$_[1]>!\n" if ! (-f _ && -r _);
 168     die "<$_[1]/$_[0]> does not seem to be a text file!\n" if ! -T _;
 169 }
 170
 171 sub readFileInto
 172 {
 173   local $/ = undef;
 174   open my $fh, "$_[0]" or die "Couldn't read file <$_[0]>!\n";
 175   binmode $fh;
 176   $_[1] = <$fh>;
 177   close $fh;
 178 }
 179
 180
 181 #
 182 # Check whether first argument is specifying a directory holding extracted
 183 # content of .docx file, or .docx file itself.
 184 #
 185
 186 stat($ARGV[0]);
 187
 188 if (-d _) {
 189     check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
 190     check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
 191     $inpIsDir = 'y';
 192 }
 193 else {
 194     die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);
 195     die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
 196 }
 197
 198
 199 #
 200 # Get user configuration, if any.
 201 #
 202
 203 my %config;
 204
 205 if (-f "docx2txt.config") {
 206     %config = do 'docx2txt.config';
 207 } elsif ($0 =~ m%^(.*[/\\])[^/\\]*?$%) {
 208     %config = do "$1docx2txt.config" if (-f "$1docx2txt.config");
 209 }
 210
 211 if (%config) {
 212     foreach my $var (keys %config) {
 213         $$var = $config{$var};
 214     }
 215 }
 216
 217
 218 #
 219 # Extract xml document content from argument docx file/directory.
 220 #
 221
 222 if ($ENV{OS} =~ /^Windows/) {
 223     $nulldevice = "nul";
 224 } else {
 225     $nulldevice = "/dev/null";
 226 }
 227
 228 if ($inpIsDir eq 'y') {
 229     readFileInto("$ARGV[0]/word/document.xml", $content);
 230 } else {
 231     $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
 232 }
 233
 234 die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
 235
 236
 237 #
 238 # Be ready for outputting the extracted text contents.
 239 #
 240
 241 if (@ARGV == 1) {
 242      $ARGV[1] = $ARGV[0];
 243
 244      # Remove any trailing slashes to generate proper output filename, when
 245      # input is directory.
 246      $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');
 247
 248      $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
 249 }
 250
 251 my $txtfile;
 252 open($txtfile, "> $ARGV[1]") || die "Can't create <$ARGV[1]> for output!\n";
 253 binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
 254
 255
 256 #
 257 # Gather information about header, footer, hyperlinks, images, footnotes etc.
 258 #
 259
 260 if ($inpIsDir eq 'y') {
 261     readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
 262 } else {
 263     $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
 264 }
 265
 266 my %docurels;
 267 while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
 268 {
 269     $docurels{"$2:$1"} = $3;
 270 }
 271
 272
 273 #
 274 # Subroutines for center and right justification of text in a line.
 275 #
 276
 277 sub justify {
 278     my $len = length $_[1];
 279
 280     if ($_[0] eq "center" && $len < ($lineWidth - 1)) {
 281         return ' ' x (($lineWidth - $len) / 2) . $_[1];
 282     } elsif ($_[0] eq "right" && $len < $lineWidth) {
 283         return ' ' x ($lineWidth - $len) . $_[1];
 284     } else {
 285         return $_[1];
 286     }
 287 }
 288
 289 #
 290 # Subroutines for dealing with embedded links and images
 291 #
 292
 293 sub hyperlink {
 294     my $hlrid = $_[0];
 295     my $hltext = $_[1];
 296     my $hlink = $docurels{"hyperlink:$hlrid"};
 297
 298     $hltext =~ s/<[^>]*?>//og;
 299     $hltext .= " [HYPERLINK: $hlink]" if ($showHyperLink eq "y" && $hltext ne $hlink);
 300
 301     return $hltext;
 302 }
 303
 304 #
 305 # Subroutines for processing paragraph content.
 306 #
 307
 308 sub processParagraph {
 309     my $para = $_[0] . "$newLine";
 310     my $align = $1 if ($_[0] =~ /<w:jc w:val="([^"]*?)"\/>/);
 311
 312     $para =~ s/<.*?>//og;
 313     return justify($align,$para) if $align;
 314
 315     return $para;
 316 }
 317
 318
 319 #
 320 # Force configuration value to lowercase as expected by script.
 321 #
 322 $showHyperLink = lc $showHyperLink;
 323
 324
 325 #
 326 # Text extraction starts.
 327 #
 328
 329 my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");
 330
 331 $content =~ s/<?xml .*?\?>(\r)?\n//;
 332
 333 # Remove stuff between TOC related tags.
 334 if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
 335     $content =~ s|<w:instrText[^>]*>.*?</w:instrText>||og;
 336 }
 337
 338 $content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;
 339
 340 my $hr = '-' x $lineWidth . $newLine;
 341 $content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;
 342
 343 $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . "$levchar[$1] "|oge;
 344
 345 #
 346 # Uncomment either of below two lines and comment above line, if dealing
 347 # with more than 8 level nested lists.
 348 #
 349
 350 # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . '* '|oge;
 351 # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|oge;
 352
 353 $content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;
 354
 355 $content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;
 356
 357 $content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/oge;
 358
 359 $content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$newLine|og;
 360 $content =~ s/<.*?>//og;
 361
 362
 363 #
 364 # Convert non-ASCII characters/character sequences to ASCII characters.
 365 #
 366
 367 $content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;
 368
 369 #
 370 # Convert docx specific escape chars first.
 371 #
 372 $content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/iog;
 373
 374 #
 375 # Another pass for a better text experience, after sequences like "&amp;laquo;"
 376 # are converted to "&laquo;".
 377 #
 378 $content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;
 379
 380
 381 #
 382 # Write the extracted and converted text contents to output.
 383 #
 384
 385 print $txtfile $content;
 386 close $txtfile;
 387