2 * odt2txt.c: A simple (and stupid) converter from OpenDocument Text
5 * Copyright (c) 2006-2009 Dennis Stosberg <dennis@stosberg.net>
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License,
9 * version 2 as published by the Free Software Foundation
13 #include <sys/types.h>
24 # include <langinfo.h>
40 # include "kunzip/kunzip.h"
/* Option state shared by main() and the helpers in this file. */
/* Non-zero makes main() call read_from_xml() instead of read_from_zip(). */
48 static int opt_raw_input
= 0;
/* Target encoding; main() fills it from --encoding= or from guess_encoding(). */
49 static char *opt_encoding
;
/*
 * Width handed to wrap() by main(); overridden by --width=.
 * NOTE(review): usage() advertises "Default: 65" while this initializer
 * is 63 -- confirm which value is intended.
 */
50 static int opt_width
= 63;
/* Input path; main() points this at one of the argv entries. */
51 static const char *opt_filename
;
/* Output path copied from --output=; main() passes it to write_to_file(). */
52 static char *opt_output
;
/* Substitution mode compared against SUBST_NONE / SUBST_ALL in subst_doc(). */
58 static int opt_subst
= SUBST_SOME
;
/* Character type used when casting buffers handed to iconv()
   (see conv() and subst_doc()). */
61 #define ICONV_CHAR char
/* Declared with an explicit (void) parameter list so this is a real
   prototype, consistent with guess_encoding() below. */
65 static void show_iconvlist(void);
/*
 * Shorthands for regex_subst(). All three operate on a variable named
 * `buf` that must be in scope where the macro is used, and discard the
 * return value.
 *   RS_O(a,b): flags _REG_DEFAULT
 *   RS_G(a,b): flags _REG_GLOBAL
 *   RS_E(a,b): flags _REG_EXEC | _REG_GLOBAL, with (b) cast to void*
 */
68 #define RS_O(a,b) (void)regex_subst(buf, (a), _REG_DEFAULT, (b))
69 #define RS_G(a,b) (void)regex_subst(buf, (a), _REG_GLOBAL, (b))
70 #define RS_E(a,b) (void)regex_subst(buf, (a), _REG_EXEC | _REG_GLOBAL, (void*)(b))
/* Forward declarations for helpers defined further down in this file. */
72 static char *guess_encoding(void);
73 static void write_to_file(STRBUF
*outbuf
, const char *filename
);
/*
 * Table of non-ASCII characters and their ASCII stand-ins, walked by
 * subst_doc(). Each entry holds the code point number, the UTF-8 byte
 * sequence that is searched for, and the ASCII replacement text.
 */
81 static struct subst substs
[] = {
82 /* number, UTF-8 sequence, ascii substitution */
83 { 0x00A0, "\xC2\xA0", " " }, /* no-break space */
84 { 0x00A9, "\xC2\xA9", "(c)" }, /* copyright sign */
85 { 0x00AB, "\xC2\xAB", "<<" }, /* left double angle quote */
86 { 0x00AD, "\xC2\xAD", "-" }, /* soft hyphen */
87 { 0x00AE, "\xC2\xAE", "(r)" }, /* registered sign */
88 { 0x00BB, "\xC2\xBB", ">>" }, /* right double angle quote */
90 { 0x00BC, "\xC2\xBC", "1/4" }, /* one quarter */
91 { 0x00BD, "\xC2\xBD", "1/2" }, /* one half */
92 { 0x00BE, "\xC2\xBE", "3/4" }, /* three quarters */
94 { 0x00C4, "\xC3\x84", "Ae" }, /* german umlaut A */
95 { 0x00D6, "\xC3\x96", "Oe" }, /* german umlaut O */
96 { 0x00DC, "\xC3\x9C", "Ue" }, /* german umlaut U */
97 { 0x00DF, "\xC3\x9F", "ss" }, /* german sharp s */
98 { 0x00E4, "\xC3\xA4", "ae" }, /* german umlaut a */
99 { 0x00F6, "\xC3\xB6", "oe" }, /* german umlaut o */
100 { 0x00FC, "\xC3\xBC", "ue" }, /* german umlaut u */
102 { 0x2010, "\xE2\x80\x90", "-" }, /* hyphen */
103 { 0x2011, "\xE2\x80\x91", "-" }, /* non-breaking hyphen */
104 { 0x2012, "\xE2\x80\x92", "-" }, /* figure dash */
105 { 0x2013, "\xE2\x80\x93", "-" }, /* en dash */
106 { 0x2014, "\xE2\x80\x94", "--" }, /* em dash */
107 { 0x2015, "\xE2\x80\x95", "--" }, /* quotation dash */
109 { 0x2018, "\xE2\x80\x98", "`" }, /* single left quotation mark */
110 { 0x2019, "\xE2\x80\x99", "'" }, /* single right quotation mark */
111 { 0x201A, "\xE2\x80\x9A", "," }, /* german single right quotation mark */
112 { 0x201B, "\xE2\x80\x9B", "`" }, /* reversed right quotation mark */
113 { 0x201C, "\xE2\x80\x9C", "``" }, /* left quotation mark */
114 { 0x201D, "\xE2\x80\x9D", "''" }, /* right quotation mark */
115 { 0x201E, "\xE2\x80\x9E", ",," }, /* german left quotes */
117 { 0x2022, "\xE2\x80\xA2", "o " }, /* bullet */
/* The bytes E2 80 A3 encode U+2023, not U+2022 (which is the bullet above). */
118 { 0x2023, "\xE2\x80\xA3", "< " }, /* triangle bullet */
120 { 0x2025, "\xE2\x80\xA5", ".." }, /* double dot */
121 { 0x2026, "\xE2\x80\xA6", "..." }, /* ellipsis */
123 { 0x2030, "\xE2\x80\xB0", "o/oo" }, /* per mille */
124 { 0x2039, "\xE2\x80\xB9", "<" }, /* left single angle quote */
125 { 0x203A, "\xE2\x80\xBA", ">" }, /* right single angle quote */
127 { 0x20AC, "\xE2\x82\xAC", "EUR" }, /* euro currency symbol */
129 { 0x2190, "\xE2\x86\x90", "<-" }, /* left arrow */
130 { 0x2192, "\xE2\x86\x92", "->" }, /* right arrow */
131 { 0x2194, "\xE2\x86\x94", "<->"}, /* left right arrow */
/* Print the command-line help text with printf(). Takes no arguments. */
136 static void usage(void)
138 printf("odt2txt %s\n"
139 "Converts an OpenDocument or OpenOffice.org XML File to raw text.\n\n"
140 "Syntax: odt2txt [options] filename\n\n"
141 "Options: --raw Print raw XML\n"
142 " --raw-input Input file is a raw XML (fodt, fods, ...)\n"
/* NOTE(review): two alternative descriptions of --encoding=X follow;
   presumably only one is compiled in, selected by preprocessor guards
   that are not visible in this view -- confirm. */
144 " --encoding=X Ignored. odt2txt has been built without iconv support.\n"
145 " Output will always be encoded in UTF-8\n"
147 " --encoding=X Do not try to autodetect the terminal encoding, but\n"
148 " convert the document to encoding X unconditionally\n"
150 " You can list all supported encodings by specifying\n"
153 " To find out, which terminal encoding will be used in\n"
154 " auto mode, use --encoding=show\n"
/* NOTE(review): the text says 65, but opt_width is initialized to 63 --
   confirm which value is intended. */
156 " --width=X Wrap text lines after X characters. Default: 65.\n"
157 " If set to -1 then no lines will be broken\n"
158 " --output=file Write output to file, instead of STDOUT\n"
159 " --subst=X Select which non-ascii characters shall be replaced\n"
160 " by ascii look-a-likes:\n"
161 " --subst=all Substitute all characters for which\n"
162 " substitutions are known\n"
163 " --subst=some Substitute all characters which the\n"
164 " output charset does not contain\n"
165 " This is the default\n"
166 " --subst=none Substitute no characters\n"
167 " --version Show version and copyright information\n",
/* Print the program banner, copyright and license notice with printf(). */
172 static void version_info(void)
174 printf("odt2txt %s\n"
/* NOTE(review): the file header comment gives the years as 2006-2009. */
175 "Copyright (c) 2006,2007 Dennis Stosberg <dennis@stosberg.net>\n"
177 "Uses the kunzip library, Copyright 2005,2006 by Michael Kohn\n"
180 "This program is free software; you can redistribute it and/or\n"
181 "modify it under the terms of the GNU General Public License,\n"
182 "version 2 as published by the Free Software Foundation\n"
184 "https://github.com/dstosberg/odt2txt\n",
/*
 * Resize *buf to len bytes via yrealloc() and re-point *mark so that it
 * keeps the same offset into the buffer afterwards.
 *
 * buf:  address of the buffer pointer; overwritten with yrealloc()'s result.
 * mark: address of a pointer into *buf; overwritten with *buf + old offset.
 * len:  new size passed to yrealloc().
 */
189 static void yrealloc_buf(char **buf
, char **mark
, size_t len
) {
/* The offset is taken before resizing; assuming yrealloc() behaves like
   realloc(), the old *buf may not be valid afterwards. */
190 ptrdiff_t offset
= *mark
- *buf
;
191 *buf
= yrealloc(*buf
, len
);
192 *mark
= *buf
+ offset
;
/* First of two finish_conv() definitions in this file; its body is not
   visible in this view. Presumably the variant for builds without iconv
   (usage() mentions such builds) -- confirm against the preprocessor guards. */
197 static void finish_conv(iconv_t ic
)
/* First of two init_conv() definitions in this file; its body is not
   visible in this view. Presumably the variant for builds without iconv
   -- confirm against the preprocessor guards. */
202 static iconv_t
init_conv(const char *input_enc
, const char *output_enc
)
/*
 * First of two conv() definitions in this file. The visible lines perform
 * no character-set conversion: they create a new STRBUF `output` and
 * append buf's bytes to it unchanged. `ic` is not referenced in the
 * visible lines.
 */
207 static STRBUF
*conv(iconv_t ic
, STRBUF
*buf
) {
210 output
= strbuf_new();
/* Copy strbuf_len(buf) bytes starting at strbuf_get(buf). */
211 strbuf_append_n(output
, strbuf_get(buf
), strbuf_len(buf
));
/* First of two subst_doc() definitions in this file; its body is not
   visible in this view. Presumably the variant for builds without iconv
   -- confirm against the preprocessor guards. */
216 static void subst_doc(iconv_t ic
, STRBUF
*buf
) {
/* First of two guess_encoding() definitions in this file; its body is not
   visible in this view. Presumably the variant for builds without iconv
   -- confirm against the preprocessor guards. */
220 static char *guess_encoding(void)
/*
 * Open an iconv conversion descriptor from input_enc to output_enc.
 *
 * If iconv_open() fails with errno == EINVAL, a warning naming both
 * encodings is printed to stderr and the open is retried with "us-ascii"
 * as the target. Other failures are reported with strerror(errno).
 * What happens after the messages (return/exit) is on lines not visible
 * in this view.
 */
227 static iconv_t
init_conv(const char *input_enc
, const char *output_enc
)
/* Note the argument order: iconv_open() takes the target encoding first. */
230 ic
= iconv_open(output_enc
, input_enc
);
231 if (ic
== (iconv_t
)-1) {
232 if (errno
== EINVAL
) {
/* Report the encoding that was actually requested from iconv_open(),
   i.e. the output_enc parameter rather than the opt_encoding global. */
233 fprintf(stderr
, "warning: Conversion from %s to %s is not supported.\n",
234 input_enc
, output_enc
);
235 ic
= iconv_open("us-ascii", input_enc
);
236 if (ic
== (iconv_t
)-1) {
239 fprintf(stderr
, "warning: Using us-ascii as fall-back.\n");
241 fprintf(stderr
, "iconv_open returned: %s\n", strerror(errno
));
/*
 * Close the iconv descriptor ic. If iconv_close() returns -1 the error
 * text from strerror(errno) is printed to stderr.
 */
248 static void finish_conv(iconv_t ic
)
250 if(iconv_close(ic
) == -1) {
251 fprintf(stderr
, "iconv_close returned: %s\n", strerror(errno
));
/*
 * Convert the contents of buf with the iconv descriptor ic and hand the
 * converted bytes to a new STRBUF created by strbuf_slurp_n().
 *
 * The output buffer starts at alloc_step bytes and grows in alloc_step
 * increments through yrealloc_buf(), which keeps the write cursor `out`
 * valid across each resize. iconv() is called in a loop until no input
 * bytes are left. On EILSEQ/EINVAL the visible lines skip ahead in the
 * source buffer using the utf8_length table; other errors are reported
 * with strerror(errno). Parts of the loop body are not visible in this
 * view.
 */
256 static STRBUF
*conv(iconv_t ic
, STRBUF
*buf
)
258 /* FIXME: This functionality belongs into strbuf.c */
261 size_t inleft
, outleft
= 0;
/* Growth increment for the output buffer, in bytes. */
264 const size_t alloc_step
= 4096;
267 inleft
= strbuf_len(buf
);
268 doc
= (ICONV_CHAR
*)strbuf_get(buf
);
269 outlen
= alloc_step
; outleft
= alloc_step
;
270 outbuf
= ymalloc(alloc_step
);
272 outleft
= alloc_step
;
276 outlen
+= alloc_step
; outleft
+= alloc_step
;
277 yrealloc_buf(&outbuf
, &out
, outlen
);
279 r
= iconv(ic
, &doc
, &inleft
, &out
, &outleft
);
280 if (r
== (size_t)-1) {
282 outlen
+= alloc_step
; outleft
+= alloc_step
;
/* Give up once the output exceeds eight times the input length. */
283 if (outlen
> (strbuf_len(buf
) << 3)) {
284 fprintf(stderr
, "Buffer grew too much. "
285 "Corrupted document?\n");
288 yrealloc_buf(&outbuf
, &out
, outlen
);
290 } else if ((errno
== EILSEQ
) || (errno
== EINVAL
)) {
293 /* advance in source buffer */
/* NOTE(review): a lead byte of exactly 0x80 does not take this branch
   although index 0 of utf8_length would correspond to it -- confirm the
   comparison is intended to be strict. */
294 if ((unsigned char)*doc
> 0x80)
295 skip
+= utf8_length
[(unsigned char)*doc
- 0x80];
299 /* advance in output buffer */
306 fprintf(stderr
, "iconv returned: %s\n", strerror(errno
));
309 } while(inleft
!= 0);
/* Grow by one byte through yrealloc_buf() so that `out` is re-based if
   the buffer moves; `out - outbuf` below relies on both pointers
   referring to the same allocation. */
312 yrealloc_buf(&outbuf
, &out
, outlen
+ 1);
/* The number of converted bytes is the distance the cursor has travelled. */
316 output
= strbuf_slurp_n(outbuf
, (size_t)(out
- outbuf
));
317 strbuf_setopt(output
, STRBUF_NULLOK
);
/*
 * Replace the UTF-8 sequences listed in substs[] inside buf with their
 * ASCII stand-ins, depending on opt_subst.
 *
 * ic:  iconv descriptor used to test whether a sequence can be converted.
 * buf: document buffer; the RS_G macro applies regex_subst() to it.
 *
 * The loop over the table entries and the cleanup of outbuf are on lines
 * not visible in this view.
 */
321 static void subst_doc(iconv_t ic
, STRBUF
*buf
)
323 struct subst
*s
= substs
;
/* Size of the scratch buffer that receives the trial conversion. */
326 const size_t outbuf_sz
= 20;
/* SUBST_NONE is tested before any work is done. */
332 if (opt_subst
== SUBST_NONE
)
335 outbuf
= ymalloc(outbuf_sz
);
/* SUBST_ALL: substitute the entry without consulting iconv. */
337 if (opt_subst
== SUBST_ALL
) {
338 RS_G(s
->utf8
, s
->ascii
);
/* Otherwise try to convert the entry's UTF-8 sequence with iconv(). */
342 in
= (ICONV_CHAR
*)s
->utf8
;
344 r
= iconv(ic
, &in
, &inleft
, &out
, &outleft
);
345 if (r
== (size_t)-1) {
/* EILSEQ/EINVAL from the trial conversion triggers the substitution. */
346 if ((errno
== EILSEQ
) || (errno
== EINVAL
)) {
347 RS_G(s
->utf8
, s
->ascii
);
350 "iconv returned an unexpected error: %s\n",
/*
 * Determine the name of the console/locale encoding and store it in `enc`,
 * a buffer that the visible code treats as 20 bytes long.
 *
 * Two detection paths are visible: "CP%u" formatted from GetACP(), and
 * nl_langinfo(CODESET). Presumably one is selected per platform by
 * preprocessor guards not visible here -- confirm. A warning is printed
 * and "ISO-8859-1" is stored as fall-back; the condition guarding that
 * fall-back and the return statement are on lines not visible in this view.
 */
361 static char *guess_encoding(void)
368 snprintf(enc
, 20, "CP%u", GetACP());
370 tmp
= nl_langinfo(CODESET
);
/* snprintf() always NUL-terminates within the 20-byte limit; strncpy()
   would leave enc unterminated for a codeset name of 20 or more bytes. */
371 snprintf(enc
, 20, "%s", tmp
);
374 fprintf(stderr
, "warning: Could not detect console "
375 "encoding. Assuming ISO-8859-1\n");
/* The literal is shorter than 20 bytes, so strncpy() terminates it here. */
376 strncpy(enc
, "ISO-8859-1", 20);
/*
 * Read the archive member `filename` from the zip file `zipfile` into a
 * STRBUF (`content`).
 *
 * Two implementations are interleaved in the visible lines: one using
 * kunzip_get_offset_by_name()/kunzip_next_tobuf(), the other using the
 * zip_open()/zip_name_locate()/zip_stat_index()/zip_fopen_index()/
 * zip_fread() family. Presumably one is selected by preprocessor guards
 * not visible here -- confirm.
 */
384 static STRBUF
*read_from_zip(const char *zipfile
, const char *filename
)
387 STRBUF
*content
= NULL
;
/* kunzip path: look the member up by name; r receives the result. */
390 r
= kunzip_get_offset_by_name((char*)zipfile
, (char*)filename
, 3, -1);
393 struct zip
*zip
= NULL
;
394 struct zip_stat stat
;
395 struct zip_file
*unzipped
= NULL
;
/* zip_* path: open the archive, locate the member, stat it and open it;
   the chain stops at the first step that fails. */
398 if ( !(zip
= zip_open(zipfile
, 0, &zip_error
)) ||
399 (r
= zip_name_locate(zip
, filename
, 0)) < 0 ||
400 (zip_stat_index(zip
, r
, ZIP_FL_UNCHANGED
, &stat
) < 0) ||
401 !(unzipped
= zip_fopen_index(zip
, r
, ZIP_FL_UNCHANGED
)) ) {
403 zip_fclose(unzipped
);
412 "Can't read from %s: Is it an OpenDocument Text?\n", zipfile
);
/* kunzip path: extract the member located above into a buffer. */
417 content
= kunzip_next_tobuf((char*)zipfile
, r
);
/* zip_* path: allocate stat.size + 1 bytes, read exactly stat.size bytes
   and hand the buffer to strbuf_slurp_n(). */
419 if ( !(buf
= ymalloc(stat
.size
+ 1)) ||
420 ((zip_uint64_t
)zip_fread(unzipped
, buf
, stat
.size
) != stat
.size
) ||
421 !(content
= strbuf_slurp_n(buf
, stat
.size
)) ) {
426 zip_fclose(unzipped
);
432 "Can't extract %s from %s. Maybe the file is corrupted?\n",
/*
 * Read the raw XML file `xmlfile` into a new STRBUF.
 *
 * xmlfile:  path opened with fopen(..., "rb").
 * filename: not used by the visible lines; kept so the signature matches
 *           read_from_zip().
 *
 * The handling after a failed fopen() beyond the message, and the
 * closing of the file, are on lines not visible in this view.
 */
440 static STRBUF
*read_from_xml(const char *xmlfile
, const char *filename
)
442 FILE *in
= fopen(xmlfile
, "rb");
/* Name the path that was actually passed to fopen(); main() passes
   "content.xml" as filename, which is not the file being opened here. */
444 fprintf(stderr
, "Can't open %s.\n", xmlfile
);
448 STRBUF
*content
= strbuf_new();
449 strbuf_append_file(content
, in
);
/*
 * Turn the XML markup in buf into plain text through a sequence of
 * regex substitutions (RS_O / RS_G / RS_E all act on `buf`).
 *
 * buf:       document buffer, modified in place by regex_subst().
 * raw_input: not referenced on the visible lines; presumably guards the
 *            first two substitutions -- confirm against the full source.
 *
 * The order of the substitutions matters: headings, paragraphs and
 * frames are rewritten before the catch-all tag removal, and entities
 * are decoded only after all tags are gone.
 */
456 static void format_doc(STRBUF
*buf
, int raw_input
)
458 /* FIXME: Convert buffer to utf-8 first. Are there
459 OpenOffice texts which are not utf8-encoded? */
462 RS_O(".*<office:body>", "<office:body>"); /* only body */
463 RS_G("<office:binary-data>[^>]*</office:binary-data>", ""); /* remove binary */
466 /* remove soft-page-breaks. We don't need them and they may disturb later decoding */
467 RS_G("<text:soft-page-break/>", "");
468 /* same for xml-protected spaces */
469 RS_G("<text:s/>", " ");
471 /* headline, first level */
472 RS_E("<text:h[^>]*outline-level=\"1\"[^>]*>([^<]*)<[^>]*>", &h1
);
473 RS_E("<text:h[^>]*>([^<]*)<[^>]*>", &h2
); /* other headlines */
474 RS_G("<text:p [^>]*>", "\n\n"); /* normal paragraphs */
475 RS_G("</text:p>", "\n\n");
476 RS_G("<text:tab/>", " "); /* tabs */
477 RS_G("<text:line-break/>", "\n");
/* Frames are handed to the `image` callback with the draw:name captured. */
480 RS_E("<draw:frame[^>]*draw:name=\"([^\"]*)\"[^>]*>", &image
);
483 RS_G("<[^>]*>", ""); /* replace all remaining tags */
484 RS_G("\n +", "\n"); /* remove indentations, e.g. kword */
485 RS_G("\n{3,}", "\n\n"); /* remove large vertical spaces */
/* The patterns are the XML entity names; matching the literal quote
   characters themselves would be a no-op. */
487 RS_G("&apos;", "'"); /* common entities */
489 RS_G("&quot;", "\"");
493 RS_O("^\n+", ""); /* blank lines at beginning and end of document */
494 RS_O("\n{2,}$", "\n");
/*
 * Program entry point: parse the command-line options, read the
 * document, convert it to text and write the result.
 *
 * Visible processing order: setlocale(), option parsing over argv[i],
 * choice of output encoding, init_conv(), stat() on the input file,
 * read_from_xml()/read_from_zip(), subst_doc(), format_doc(), wrap(),
 * trailing-whitespace removal, conv(), then write_to_file() or fwrite()
 * to stdout. The loop header, several branch bodies and the cleanup are
 * on lines not visible in this view.
 */
497 int main(int argc
, const char **argv
)
/* Adopt the locale from the environment. */
506 (void)setlocale(LC_ALL
, "");
509 if (!strcmp(argv
[i
], "--raw")) {
512 } else if (!strcmp(argv
[i
], "--raw-input")) {
515 } else if (!strncmp(argv
[i
], "--encoding=", 11)) {
/* Length of the value after the 11-character prefix, plus one for the
   terminating NUL (strlen - 11 + 1). */
516 size_t arglen
= strlen(argv
[i
]) - 10;
518 if (!strcmp(argv
[i
] + 11, "list")) {
/* Copy the value including its terminator. */
522 opt_encoding
= ymalloc(arglen
);
523 memcpy(opt_encoding
, argv
[i
] + 11, arglen
);
525 } else if (!strncmp(argv
[i
], "--width=", 8)) {
526 opt_width
= atoi(argv
[i
] + 8);
/* Widths below 3 are rejected, except -1. */
527 if(opt_width
< 3 && opt_width
!= -1) {
528 fprintf(stderr
, "Invalid value for width: %s\n",
533 } else if (!strcmp(argv
[i
], "--force")) {
534 // ignore this setting
536 } else if (!strncmp(argv
[i
], "--output=", 9)) {
/* A value that starts with '-' is not stored as an output path. */
537 if (*(argv
[i
] + 9) != '-') {
/* Value length after the 9-character prefix, plus one for the NUL. */
538 size_t arglen
= strlen(argv
[i
]) - 8;
539 opt_output
= ymalloc(arglen
);
540 memcpy(opt_output
, argv
[i
] + 9, arglen
);
543 } else if (!strncmp(argv
[i
], "--subst=", 8)) {
544 if (!strcmp(argv
[i
] + 8, "none"))
545 opt_subst
= SUBST_NONE
;
546 else if (!strcmp(argv
[i
] + 8, "some"))
547 opt_subst
= SUBST_SOME
;
548 else if (!strcmp(argv
[i
] + 8, "all"))
549 opt_subst
= SUBST_ALL
;
551 fprintf(stderr
, "Invalid value for --subst: %s\n",
556 } else if (!strcmp(argv
[i
], "--help")) {
558 } else if (!strcmp(argv
[i
], "--version")
559 || !strcmp(argv
[i
], "-v")) {
561 } else if (!strcmp(argv
[i
], "-")) {
/* The argument itself is used as the input file name (no copy is made). */
566 opt_filename
= argv
[i
];
/* --encoding=show: print the guessed encoding. */
571 if(opt_encoding
&& !strcmp("show", opt_encoding
)) {
573 opt_encoding
= guess_encoding();
574 printf("%s\n", opt_encoding
);
586 opt_encoding
= guess_encoding();
/* The document text is treated as UTF-8 and converted to opt_encoding. */
589 ic
= init_conv("UTF-8", opt_encoding
);
/* Report a stat() failure on the input file together with the errno text. */
591 if (0 != stat(opt_filename
, &st
)) {
592 fprintf(stderr
, "%s: %s\n",
593 opt_filename
, strerror(errno
));
597 /* read content.xml */
598 docbuf
= opt_raw_input
?
599 read_from_xml(opt_filename
, "content.xml") :
600 read_from_zip(opt_filename
, "content.xml");
/* Character substitution runs before the markup is stripped. */
603 subst_doc(ic
, docbuf
);
604 format_doc(docbuf
, opt_raw_input
);
607 wbuf
= wrap(docbuf
, opt_width
);
609 /* remove all trailing whitespace */
610 (void) regex_subst(wbuf
, " +\n", _REG_GLOBAL
, "\n");
612 outbuf
= conv(ic
, wbuf
);
/* Output goes through write_to_file() or to stdout via fwrite(); the
   condition choosing between them is not visible in this view. */
615 write_to_file(outbuf
, opt_output
);
617 fwrite(strbuf_get(outbuf
), strbuf_len(outbuf
), 1, stdout
);
/*
 * Write the contents of outbuf to the file `filename`.
 *
 * The file is opened write-only, created if missing (mode 0644) and
 * truncated. Failures of open() and write() are reported on stderr with
 * the errno text; the conditions guarding those messages and the closing
 * of the descriptor are on lines not visible in this view.
 */
632 static void write_to_file(STRBUF
*outbuf
, const char *filename
)
637 fd
= open(filename
, O_WRONLY
| O_CREAT
| O_TRUNC
, 0644);
639 fprintf(stderr
, "Can't open %s: %s\n", filename
, strerror(errno
));
/* A single write() call is issued for the whole buffer. */
643 len
= write(fd
, strbuf_get(outbuf
), strbuf_len(outbuf
));
645 fprintf(stderr
, "Can't write to %s: %s\n", filename
, strerror(errno
));
/*
 * Callback passed to iconvlist() by show_iconvlist(): writes each of the
 * namescount entries of names[] to stdout with fputs(). The remainder of
 * the parameter list, any separators printed between names and the
 * return value are on lines not visible in this view.
 */
654 static int print_one (unsigned int namescount
, const char * const * names
,
659 for (i
= 0; i
< namescount
; i
++) {
662 fputs(names
[i
],stdout
);
/*
 * List the encodings known to iconv by calling iconvlist() with
 * print_one() as the per-entry callback and NULL as its user data.
 * Defined with an explicit (void) parameter list.
 */
668 static void show_iconvlist(void) {
669 iconvlist(print_one
, NULL
);