#!/usr/bin/env perl

# Retrieved from gitweb: [msysgit.git] / bin / docx2txt
# (blob 7bf8de7f11358672f278c5acd2b33b8d43c8fe9e; gitweb page heading:
# "WIP: set instaweb.httpd = mongoose globally").

# docx2txt, a command-line utility to convert Docx documents to text format.
# Copyright (C) 2008-2009 Sandeep Kumar
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#
# This script extracts text from document.xml contained inside .docx file.
# Perl v5.8.2 was used for testing this script.
#
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
#
# ChangeLog :
#
#    10/08/2008 - Initial version (v0.1)
#    15/08/2008 - Script takes two arguments [second optional] now and can be
#                 used independently to extract text from docx file. It accepts
#                 docx file directly, instead of xml file.
#    18/08/2008 - Added support for center and right justification of text that
#                 fits in a line 80 characters wide (adjustable).
#    03/09/2008 - Fixed the slip in usage message.
#    12/09/2008 - Slightly changed the script invocation and argument handling
#                 to incorporate some of the shell script functionality here.
#                 Added support to handle embedded urls in docx document.
#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
#                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
#                 during installation.
#    31/08/2009 - Added support for handling more escape characters.
#                 Using OS specific null device to redirect stderr.
#                 Saving text file in binary mode.
#    03/09/2009 - Updates based on feedback/suggestions from Sergei Kulakov
#                 (sergei>AT<dewia>DOT<com).
#                 - removal of non-document text in between TOC related tags.
#                 - display of hyperlink alongside linked text user controlled.
#                 - some character conversion updates
#    05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
#                 Added more character conversions.
#                 Organised conversion mappings in tabular form for speedup and
#                 easy maintenance.
#                 Tweaked code to reduce number of passes over document content.
#    10/09/2009 - For leaner text experience, hyperlink is not displayed if
#                 hyperlink and hyperlinked text are same, even if user has
#                 enabled hyperlink display.
#                 Improved handling of short line justification. Many
#                 justification tag patterns were not captured earlier.
#    11/09/2009 - A directory holding the unzipped content of .docx file can
#                 also be specified as argument to the script, in place of file.
#    17/09/2009 - Removed trailing slashes from input directory name.
#                 Updated unzip command invocations to handle path names
#                 containing spaces.
#    01/10/2009 - Added support for configuration file.
#    02/10/2009 - Using single quotes to specify path for unzip command.
#    04/10/2009 - Corrected configuration option name lineIndent to listIndent.
#
#
# The default settings below can be overridden via docx2txt.config - searched
# first in current directory and then in the same location as this script.
#
# They are package globals (our) because the configuration loader assigns to
# them by name.
#

our $unzip = '/usr/bin/unzip';  # Windows path like 'C:/path/to/unzip.exe'
our $newLine = "\n";            # Alternative is "\r\n".
our $listIndent = " ";          # Indent nested lists by "\t", " " etc.
our $lineWidth = 80;            # Line width, used for short line justification.
our $showHyperLink = "N";       # Show hyperlink alongside linked text.

#
# Bullet markers used for nested list items, indexed by nesting level.
# ToDo: Better list handling. Currently assumed 8 level nesting.
#

my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');
#
# Character conversion tables
#

# Maps an XML/HTML entity name (without the surrounding '&' and ';') to its
# plain-text replacement.
# Only amp, gt and lt are required for docx escapes, others are used for better
# text experience.
my %escChrs = (
    amp   => '&',   gt     => '>',   lt   => '<',
    acute => '\'',  brvbar => '|',   copy => '(C)',  divide => '/',
    laquo => '<<',  macr   => '-',   nbsp => ' ',    raquo  => '>>',
    reg   => '(R)', shy    => '-',   times => 'x'
);
# Maps UTF-8 byte sequences of selected non-ASCII characters to ASCII
# replacements. Keys are raw byte strings because the document content is
# processed as bytes, not as decoded characters.
my %splchars = (
    "\xC2\xA0" => ' ',          # <nbsp>
    "\xC2\xA6" => '|',          # <brokenbar>
    "\xC2\xA9" => '(C)',        # <copyright>
    "\xC2\xAB" => '<<',         # <laquo>
    "\xC2\xAC" => '-',          # <negate>
    "\xC2\xAE" => '(R)',        # <regd>
    "\xC2\xB1" => '+-',         # <plusminus>
    "\xC2\xBB" => '>>',         # <raquo>

#   "\xC2\xA7" => '',           # <section>
#   "\xC2\xB6" => '',           # <para>

    "\xC3\x97" => 'x',          # <mul>
    "\xC3\xB7" => '/',          # <div>

    "\xE2\x80\x82" => ' ',      # <enspc>
    "\xE2\x80\x83" => ' ',      # <emspc>
    "\xE2\x80\x85" => ' ',      # <qemsp>
    "\xE2\x80\x93" => ' - ',    # <endash>
    "\xE2\x80\x94" => ' -- ',   # <emdash>
    "\xE2\x80\x98" => '`',      # <soq>
    "\xE2\x80\x99" => '\'',     # <scq>
    "\xE2\x80\x9C" => '"',      # <doq>
    "\xE2\x80\x9D" => '"',      # <dcq>
    "\xE2\x80\xA2" => '::',     # <bullet> (U+2022)
    "\xE2\x80\xA6" => '...',    # <ellipsis>

    "\xE2\x84\xA2" => '(TM)',   # <trademark>

    "\xE2\x89\xA0" => '!=',     # <neq>
    "\xE2\x89\xA4" => '<=',     # <leq>
    "\xE2\x89\xA5" => '>=',     # <geq>

    #
    # Currency symbols
    #
    "\xC2\xA2" => 'cent',
    "\xC2\xA3" => 'Pound',
    "\xC2\xA5" => 'Yen',
    "\xE2\x82\xAC" => 'Euro'
);
#
# Check argument(s) sanity.
#

# Usage text shown when the argument count is wrong; $0 interpolates to the
# name the script was invoked with.
my $usage = <<USAGE;

Usage: $0 <infile.docx> [outfile.txt|-]

Use '-' as the outfile name to dump the text on STDOUT.
Output is saved in infile.txt if second argument is omitted.

infile.docx can also be a directory name holding the unzipped content
of concerned .docx file.

USAGE

# Exactly one or two arguments are accepted.
die $usage if (@ARGV == 0 || @ARGV > 2);
#
# Check for existence and readability of required file in specified directory,
# and whether it is a text file.
#
# Arguments: the file path relative to the folder, then the folder itself.
# Dies with a descriptive message if the file is not a readable plain file or
# does not look like text; otherwise returns normally.
#

sub check_for_required_file_in_folder {
    my ($file, $folder) = @_;
    my $path = "$folder/$file";

    # One stat() call fills the "_" buffer reused by the file tests below.
    stat($path);
    die "Can't read <$file> in <$folder>!\n" if ! (-f _ && -r _);
    die "<$path> does not seem to be a text file!\n" if ! -T _;
}
#
# Read the whole of a file, in binary mode, into the caller's variable.
#
# Arguments: the file path, then the scalar that receives the content (it is
# assigned through the @_ alias, so the caller's variable is modified).
# Dies if the file cannot be opened.
#

sub readFileInto
{
    local $/ = undef;   # Slurp mode: read everything in one go.

    # Three-argument open takes the path literally; the two-argument form
    # would strip surrounding whitespace and interpret mode characters.
    open my $fh, '<', $_[0] or die "Couldn't read file <$_[0]>!\n";
    binmode $fh;
    $_[1] = <$fh>;
    close $fh;
}
#
# Check whether first argument is specifying a directory holding extracted
# content of .docx file, or .docx file itself.
#

# One stat() call fills the "_" buffer reused by the file tests below.
stat($ARGV[0]);

if (-d _) {
    # Unzipped docx directory: both XML parts must be present and readable.
    check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
    check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
    $inpIsDir = 'y';
}
else {
    # A .docx file is a zip archive, so a file that looks like text is rejected.
    die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);
    die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
}
#
# Get user configuration, if any.
#
# docx2txt.config is looked for first in the current directory and then in
# the directory part of $0. The file is evaluated as Perl and is expected to
# return a list of (name => value) pairs.
#

my %config;
my $cfgfile;

if (-f "docx2txt.config") {
    $cfgfile = "docx2txt.config";
} elsif ($0 =~ m%^(.*[/\\])[^/\\]*?$%) {
    $cfgfile = "$1docx2txt.config" if (-f "$1docx2txt.config");
}

if (defined $cfgfile) {
    # "do FILE" searches @INC for relative names, and "." is no longer in
    # @INC since Perl 5.26, so the path is made absolute to load exactly the
    # file found above.
    require File::Spec;
    my $cfgpath = File::Spec->rel2abs($cfgfile);
    %config = do $cfgpath;
}

if (%config) {
    foreach my $var (keys %config) {
        # Symbolic reference: assigns to the package global named by the key
        # (e.g. lineWidth sets $lineWidth).
        # NOTE(review): any key is accepted, so the config file can set
        # arbitrary globals - consider restricting to the known settings.
        $$var = $config{$var};
    }
}
#
# Extract xml document content from argument docx file/directory.
#

# Pick the null device used to discard unzip's stderr. $ENV{OS} is normally
# unset outside Windows, hence the defined() guard.
if (defined $ENV{OS} && $ENV{OS} =~ /^Windows/) {
    $nulldevice = "nul";
} else {
    $nulldevice = "/dev/null";
}

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/document.xml", $content);
} else {
    # "unzip -p" writes the archive member to stdout, captured by backticks.
    # NOTE(review): the file name is interpolated into a shell command line;
    # names containing quotes or shell metacharacters are not escaped.
    $content = `"$unzip" -p "$ARGV[0]" word/document.xml 2>$nulldevice`;
}

die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
#
# Be ready for outputting the extracted text contents.
#

if (@ARGV == 1) {
    # No output name given: derive it from the input name.
    $ARGV[1] = $ARGV[0];

    # Remove any trailing slashes to generate proper output filename, when
    # input is directory.
    $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');

    # Replace a trailing .docx with .txt, or append .txt if there is none.
    $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
}

my $txtfile;
if ($ARGV[1] eq '-') {
    # '-' means STDOUT. It is handled explicitly because three-argument open
    # would otherwise create a file literally named "-".
    open($txtfile, '>&', \*STDOUT) || die "Can't create <$ARGV[1]> for output!\n";
} else {
    # Three-argument open takes the file name literally.
    open($txtfile, '>', $ARGV[1]) || die "Can't create <$ARGV[1]> for output!\n";
}
binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
#
# Gather information about header, footer, hyperlinks, images, footnotes etc.
#
# The relationships part is loaded into $_ and every <Relationship> element
# is recorded in %docurels as "type:id" => target, where type is the last
# path component of the Type attribute (e.g. "hyperlink").
#

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
} else {
    $_ = `"$unzip" -p "$ARGV[0]" word/_rels/document.xml.rels 2>$nulldevice`;
}

my %docurels;
# The pattern expects the attributes in the order Id, Type, Target.
while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
{
    $docurels{"$2:$1"} = $3;
}
#
# Subroutines for center and right justification of text in a line.
#
# Arguments: the alignment ("center" or "right"; anything else leaves the text
# untouched), then the text of the line.
# Returns the text with leading spaces added so that it is centred or
# right-aligned within $lineWidth columns. Text too long to fit is returned
# unchanged.
#

sub justify {
    my ($align, $text) = @_;
    my $len = length $text;

    if ($align eq "center" && $len < ($lineWidth - 1)) {
        # Odd amounts of padding are rounded down.
        return ' ' x int(($lineWidth - $len) / 2) . $text;
    } elsif ($align eq "right" && $len < $lineWidth) {
        return ' ' x ($lineWidth - $len) . $text;
    } else {
        return $text;
    }
}
#
# Subroutines for dealing with embedded links and images
#
# Arguments: the relationship id of a hyperlink, then the markup it encloses.
# Returns the enclosed text with all tags removed. When $showHyperLink is "y"
# and the link target is known and differs from the text, " [HYPERLINK: url]"
# is appended.
#

sub hyperlink {
    my ($hlrid, $hltext) = @_;
    my $hlink = $docurels{"hyperlink:$hlrid"};

    # Strip every tag, leaving only the visible text.
    $hltext =~ s/<[^>]*?>//g;

    # An id with no matching relationship has no target; skip the suffix
    # rather than emit an empty " [HYPERLINK: ]".
    $hltext .= " [HYPERLINK: $hlink]"
        if ($showHyperLink eq "y" && defined $hlink && $hltext ne $hlink);

    return $hltext;
}
#
# Subroutines for processing paragraph content.
#
# Argument: the markup found inside one paragraph element.
# Returns the paragraph text with all tags removed and $newLine appended,
# justified via justify() when the markup carries a <w:jc w:val="..."/> tag.
#

sub processParagraph {
    my $para = $_[0] . "$newLine";

    # List-context match: $align is undef when there is no alignment tag.
    # This avoids "my ... if ...", whose behaviour perlsyn documents as
    # undefined.
    my ($align) = $_[0] =~ /<w:jc w:val="([^"]*?)"\/>/;

    $para =~ s/<.*?>//g;
    return justify($align, $para) if $align;

    return $para;
}
#
# Force configuration value to lowercase as expected by script.
#

$showHyperLink = lc $showHyperLink;

#
# Text extraction starts.
#
# The substitutions below are order dependent: structural tags are turned
# into text first, and whatever markup remains is stripped at the end.
#

my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");

# Drop the XML declaration. The '?' after '<' must be escaped; unescaped it
# makes the '<' optional and leaves a stray "<?" behind.
$content =~ s/<\?xml .*?\?>(\r)?\n//;

# Remove stuff between TOC related tags.
if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
    $content =~ s|<w:instrText[^>]*>.*?</w:instrText>||og;
}

$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;

# A paragraph border is rendered as a horizontal rule.
my $hr = '-' x $lineWidth . $newLine;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;

# List items: indent by nesting level and prefix the marker for that level.
$content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . "$levchar[$1] "|oge;

#
# Uncomment either of below two lines and comment above line, if dealing
# with more than 8 level nested lists.
#

# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$listIndent x $1 . '* '|oge;
# $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|oge;

# Text following <w:caps/> is upper-cased.
$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;

$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;

$content =~ s/<w:p [^>]+?>(.*?)<\/w:p>/processParagraph($1)/oge;

# Remaining paragraph ends and line breaks become newlines; then every tag
# still present is removed.
$content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$newLine|og;
$content =~ s/<.*?>//og;
#
# Convert non-ASCII characters/character sequences to ASCII characters.
#
# Candidate byte sequences (lead byte E2 plus two bytes, or lead byte C2/C3
# plus one byte) are looked up in %splchars; unknown sequences are kept.
#

$content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;

#
# Convert docx specific escape chars first.
#

$content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/iog;

#
# Another pass for a better text experience, after sequences like "&amp;laquo;"
# are converted to "&laquo;".
#

# Entities not listed in %escChrs are left as they are.
$content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;
#
# Write the extracted and converted text contents to output.
#

# Buffered write errors (e.g. a full disk) may only surface at close, so
# both calls are checked.
print {$txtfile} $content or die "Can't write to <$ARGV[1]>!\n";
close $txtfile or die "Can't write to <$ARGV[1]>!\n";