Convert xml-protected spaces to real spaces
[odt2txt.git] / odt2txt.c
blob854df950c5587bcd959420956f9dffe58eb141f2
1 /*
2 * odt2txt.c: A simple (and stupid) converter from OpenDocument Text
3 * to plain text.
5 * Copyright (c) 2006-2009 Dennis Stosberg <dennis@stosberg.net>
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License,
9 * version 2 as published by the Free Software Foundation
12 #include <sys/stat.h>
13 #include <sys/types.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #ifdef NO_ICONV
18 # define iconv_t int
19 #else
20 # include <iconv.h>
21 # ifdef WIN32
22 # include <windows.h>
23 # else
24 # include <langinfo.h>
25 # endif
26 #endif
28 #include <limits.h>
29 #include <locale.h>
30 #include <stddef.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <unistd.h>
36 #include "mem.h"
37 #include "regex.h"
38 #include "strbuf.h"
39 #ifdef USE_KUNZIP
40 # include "kunzip/kunzip.h"
41 #else
42 # include <zip.h>
43 #endif
45 #define VERSION "0.5"
47 static int opt_raw;
48 static int opt_raw_input = 0;
49 static char *opt_encoding;
50 static int opt_width = 63;
51 static const char *opt_filename;
52 static char *opt_output;
54 #define SUBST_NONE 0
55 #define SUBST_SOME 1
56 #define SUBST_ALL 2
58 static int opt_subst = SUBST_SOME;
60 #ifndef ICONV_CHAR
61 #define ICONV_CHAR char
62 #endif
64 #ifdef iconvlist
65 static void show_iconvlist();
66 #endif
68 #define RS_O(a,b) (void)regex_subst(buf, (a), _REG_DEFAULT, (b))
69 #define RS_G(a,b) (void)regex_subst(buf, (a), _REG_GLOBAL, (b))
70 #define RS_E(a,b) (void)regex_subst(buf, (a), _REG_EXEC | _REG_GLOBAL, (void*)(b))
72 static char *guess_encoding(void);
73 static void write_to_file(STRBUF *outbuf, const char *filename);
/*
 * One entry of the substitution table: a character which may be
 * missing from the output charset, and an ascii look-a-like for it.
 */
struct subst {
	int unicode;        /* code point; 0 terminates the table */
	const char *utf8;   /* the character encoded as UTF-8 */
	const char *ascii;  /* replacement text */
};

/*
 * The replacement texts are inserted into the document before
 * format_doc() strips the tags and decodes the entities.  That is why
 * '<', '>' and ''' are written as XML entities here.
 */
static struct subst substs[] = {
	/* number, UTF-8 sequence, ascii substitution */
	{ 0x00A0, "\xC2\xA0", " " }, /* no-break space */
	{ 0x00A9, "\xC2\xA9", "(c)" }, /* copyright sign */
	{ 0x00AB, "\xC2\xAB", "&lt;&lt;" }, /* left double angle quote */
	{ 0x00AD, "\xC2\xAD", "-" }, /* soft hyphen */
	{ 0x00AE, "\xC2\xAE", "(r)" }, /* registered sign */
	{ 0x00BB, "\xC2\xBB", "&gt;&gt;" }, /* right double angle quote */

	{ 0x00BC, "\xC2\xBC", "1/4" }, /* one quarter */
	{ 0x00BD, "\xC2\xBD", "1/2" }, /* one half */
	{ 0x00BE, "\xC2\xBE", "3/4" }, /* three quarters */

	{ 0x00C4, "\xC3\x84", "Ae" }, /* german umlaut A */
	{ 0x00D6, "\xC3\x96", "Oe" }, /* german umlaut O */
	{ 0x00DC, "\xC3\x9C", "Ue" }, /* german umlaut U */
	{ 0x00DF, "\xC3\x9F", "ss" }, /* german sharp s */
	{ 0x00E4, "\xC3\xA4", "ae" }, /* german umlaut a */
	{ 0x00F6, "\xC3\xB6", "oe" }, /* german umlaut o */
	{ 0x00FC, "\xC3\xBC", "ue" }, /* german umlaut u */

	{ 0x2010, "\xE2\x80\x90", "-" }, /* hyphen */
	{ 0x2011, "\xE2\x80\x91", "-" }, /* non-breaking hyphen */
	{ 0x2012, "\xE2\x80\x92", "-" }, /* figure dash */
	{ 0x2013, "\xE2\x80\x93", "-" }, /* en dash */
	{ 0x2014, "\xE2\x80\x94", "--" }, /* em dash */
	{ 0x2015, "\xE2\x80\x95", "--" }, /* quotation dash */

	{ 0x2018, "\xE2\x80\x98", "`" }, /* single left quotation mark */
	{ 0x2019, "\xE2\x80\x99", "&apos;" }, /* single right quotation mark */
	{ 0x201A, "\xE2\x80\x9A", "," }, /* german single right quotation mark */
	{ 0x201B, "\xE2\x80\x9B", "`" }, /* reversed right quotation mark */
	{ 0x201C, "\xE2\x80\x9C", "``" }, /* left quotation mark */
	{ 0x201D, "\xE2\x80\x9D", "''" }, /* right quotation mark */
	{ 0x201E, "\xE2\x80\x9E", ",," }, /* german left quotes */

	{ 0x2022, "\xE2\x80\xA2", "o " }, /* bullet */
	{ 0x2023, "\xE2\x80\xA3", "&lt; " }, /* triangle bullet */

	{ 0x2025, "\xE2\x80\xA5", ".." }, /* double dot */
	{ 0x2026, "\xE2\x80\xA6", "..." }, /* ellipsis */

	{ 0x2030, "\xE2\x80\xB0", "o/oo" }, /* per mille */
	{ 0x2039, "\xE2\x80\xB9", "&lt;" }, /* left single angle quote */
	{ 0x203A, "\xE2\x80\xBA", "&gt;" }, /* right single angle quote */

	{ 0x20AC, "\xE2\x82\xAC", "EUR" }, /* euro currency symbol */

	{ 0x2190, "\xE2\x86\x90", "&lt;-" }, /* left arrow */
	{ 0x2192, "\xE2\x86\x92", "-&gt;" }, /* right arrow */
	{ 0x2194, "\xE2\x86\x94", "&lt;-&gt;"}, /* left right arrow */

	{ 0, NULL, NULL },
};
136 static void usage(void)
138 printf("odt2txt %s\n"
139 "Converts an OpenDocument or OpenOffice.org XML File to raw text.\n\n"
140 "Syntax: odt2txt [options] filename\n\n"
141 "Options: --raw Print raw XML\n"
142 " --raw-input Input file is a raw XML (fodt, fods, ...)\n"
143 #ifdef NO_ICONV
144 " --encoding=X Ignored. odt2txt has been built without iconv support.\n"
145 " Output will always be encoded in UTF-8\n"
146 #else
147 " --encoding=X Do not try to autodetect the terminal encoding, but\n"
148 " convert the document to encoding X unconditionally\n"
149 # ifdef iconvlist
150 " You can list all supported encodings by specifying\n"
151 " --encoding=list\n"
152 # endif
153 " To find out, which terminal encoding will be used in\n"
154 " auto mode, use --encoding=show\n"
155 #endif
156 " --width=X Wrap text lines after X characters. Default: 65.\n"
157 " If set to -1 then no lines will be broken\n"
158 " --output=file Write output to file, instead of STDOUT\n"
159 " --subst=X Select which non-ascii characters shall be replaced\n"
160 " by ascii look-a-likes:\n"
161 " --subst=all Substitute all characters for which\n"
162 " substitutions are known\n"
163 " --subst=some Substitute all characters which the\n"
164 " output charset does not contain\n"
165 " This is the default\n"
166 " --subst=none Substitute no characters\n"
167 " --version Show version and copyright information\n",
168 VERSION);
169 exit(EXIT_FAILURE);
172 static void version_info(void)
174 printf("odt2txt %s\n"
175 "Copyright (c) 2006,2007 Dennis Stosberg <dennis@stosberg.net>\n"
176 #ifdef USE_KUNZIP
177 "Uses the kunzip library, Copyright 2005,2006 by Michael Kohn\n"
178 #endif
179 "\n"
180 "This program is free software; you can redistribute it and/or\n"
181 "modify it under the terms of the GNU General Public License,\n"
182 "version 2 as published by the Free Software Foundation\n"
183 "\n"
184 "https://github.com/dstosberg/odt2txt\n",
185 VERSION);
186 exit(EXIT_SUCCESS);
/*
 * Resize *buf to len bytes with yrealloc() and let *mark point to the
 * same offset inside the resized buffer it had inside the old one.
 */
static void yrealloc_buf(char **buf, char **mark, size_t len)
{
	/* the offset has to be taken while the old buffer is still valid */
	const ptrdiff_t mark_pos = *mark - *buf;
	char *resized = yrealloc(*buf, len);

	*buf = resized;
	*mark = resized + mark_pos;
}
195 #ifdef NO_ICONV
197 static void finish_conv(iconv_t ic)
199 return;
202 static iconv_t init_conv(const char *input_enc, const char *output_enc)
204 return 0;
207 static STRBUF *conv(iconv_t ic, STRBUF *buf) {
208 STRBUF *output;
210 output = strbuf_new();
211 strbuf_append_n(output, strbuf_get(buf), strbuf_len(buf));
213 return output;
216 static void subst_doc(iconv_t ic, STRBUF *buf) {
217 return;
/* Built without iconv: no encoding is detected, the result is always NULL. */
static char *guess_encoding(void)
{
	char *none = NULL;

	return none;
}
225 #else
/*
 * Open an iconv conversion descriptor which converts from input_enc
 * to output_enc.
 *
 * If iconv does not support that conversion (EINVAL), a warning is
 * printed and a conversion to us-ascii is opened instead.  Any other
 * failure, and a failure to open the fall-back, terminates the
 * program with EXIT_FAILURE.
 *
 * Returns a valid descriptor; release it with finish_conv().
 */
static iconv_t init_conv(const char *input_enc, const char *output_enc)
{
	iconv_t ic = iconv_open(output_enc, input_enc);

	if (ic != (iconv_t)-1)
		return ic;

	if (errno != EINVAL) {
		fprintf(stderr, "iconv_open returned: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	/* report the encoding that was actually asked for */
	fprintf(stderr, "warning: Conversion from %s to %s is not supported.\n",
		input_enc, output_enc);

	ic = iconv_open("us-ascii", input_enc);
	if (ic == (iconv_t)-1) {
		fprintf(stderr, "iconv_open returned: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}
	fprintf(stderr, "warning: Using us-ascii as fall-back.\n");

	return ic;
}
/*
 * Close the conversion descriptor ic.  If iconv_close() fails, the
 * error is reported on stderr and the program exits with EXIT_FAILURE.
 */
static void finish_conv(iconv_t ic)
{
	if (iconv_close(ic) != -1)
		return;

	fprintf(stderr, "iconv_close returned: %s\n", strerror(errno));
	exit(EXIT_FAILURE);
}
256 static STRBUF *conv(iconv_t ic, STRBUF *buf)
258 /* FIXME: This functionality belongs into strbuf.c */
259 ICONV_CHAR *doc;
260 char *out, *outbuf;
261 size_t inleft, outleft = 0;
262 size_t r;
263 size_t outlen = 0;
264 const size_t alloc_step = 4096;
265 STRBUF *output;
267 inleft = strbuf_len(buf);
268 doc = (ICONV_CHAR*)strbuf_get(buf);
269 outlen = alloc_step; outleft = alloc_step;
270 outbuf = ymalloc(alloc_step);
271 out = outbuf;
272 outleft = alloc_step;
274 do {
275 if (!outleft) {
276 outlen += alloc_step; outleft += alloc_step;
277 yrealloc_buf(&outbuf, &out, outlen);
279 r = iconv(ic, &doc, &inleft, &out, &outleft);
280 if (r == (size_t)-1) {
281 if(errno == E2BIG) {
282 outlen += alloc_step; outleft += alloc_step;
283 if (outlen > (strbuf_len(buf) << 3)) {
284 fprintf(stderr, "Buffer grew to much. "
285 "Corrupted document?\n");
286 exit(EXIT_FAILURE);
288 yrealloc_buf(&outbuf, &out, outlen);
289 continue;
290 } else if ((errno == EILSEQ) || (errno == EINVAL)) {
291 char skip = 1;
293 /* advance in source buffer */
294 if ((unsigned char)*doc > 0x80)
295 skip += utf8_length[(unsigned char)*doc - 0x80];
296 doc += skip;
297 inleft -= skip;
299 /* advance in output buffer */
300 *out = '?';
301 out++;
302 outleft--;
304 continue;
306 fprintf(stderr, "iconv returned: %s\n", strerror(errno));
307 exit(EXIT_FAILURE);
309 } while(inleft != 0);
311 if (!outleft) {
312 outbuf = yrealloc(outbuf, outlen + 1);
314 *out = '\0';
316 output = strbuf_slurp_n(outbuf, (size_t)(out - outbuf));
317 strbuf_setopt(output, STRBUF_NULLOK);
318 return output;
321 static void subst_doc(iconv_t ic, STRBUF *buf)
323 struct subst *s = substs;
324 ICONV_CHAR *in;
325 size_t inleft;
326 const size_t outbuf_sz = 20;
327 char *outbuf;
328 char *out;
329 size_t outleft;
330 size_t r;
332 if (opt_subst == SUBST_NONE)
333 return;
335 outbuf = ymalloc(outbuf_sz);
336 while (s->unicode) {
337 if (opt_subst == SUBST_ALL) {
338 RS_G(s->utf8, s->ascii);
339 } else {
340 out = outbuf;
341 outleft = outbuf_sz;
342 in = (ICONV_CHAR*)s->utf8;
343 inleft = strlen(in);
344 r = iconv(ic, &in, &inleft, &out, &outleft);
345 if (r == (size_t)-1) {
346 if ((errno == EILSEQ) || (errno == EINVAL)) {
347 RS_G(s->utf8, s->ascii);
348 } else {
349 fprintf(stderr,
350 "iconv returned an unexpected error: %s\n",
351 strerror(errno));
352 exit(EXIT_FAILURE);
356 s++;
358 yfree(outbuf);
/*
 * Determine the encoding to use for the output.
 *
 * On WIN32 this is "CP" followed by the number returned by GetACP().
 * Elsewhere it is the codeset reported by nl_langinfo(CODESET); if
 * that is missing or empty, a warning is printed and ISO-8859-1 is
 * assumed.
 *
 * Returns a NUL-terminated string of at most 19 characters in a
 * buffer obtained from ymalloc(); the caller releases it with yfree().
 */
static char *guess_encoding(void)
{
	const size_t enc_sz = 20;
	char *enc = ymalloc(enc_sz);

#ifdef WIN32
	snprintf(enc, enc_sz, "CP%u", GetACP());
#else
	const char *codeset = nl_langinfo(CODESET);

	if (codeset && *codeset) {
		/* snprintf always terminates the string, strncpy may not */
		snprintf(enc, enc_sz, "%s", codeset);
	} else {
		fprintf(stderr, "warning: Could not detect console "
			"encoding. Assuming ISO-8859-1\n");
		snprintf(enc, enc_sz, "%s", "ISO-8859-1");
	}
#endif

	return enc;
}
382 #endif
384 static STRBUF *read_from_zip(const char *zipfile, const char *filename)
386 int r = 0;
387 STRBUF *content = NULL;
389 #ifdef USE_KUNZIP
390 r = kunzip_get_offset_by_name((char*)zipfile, (char*)filename, 3, -1);
391 #else
392 int zip_error;
393 struct zip *zip = NULL;
394 struct zip_stat stat;
395 struct zip_file *unzipped = NULL;
396 char *buf = NULL;
398 if ( !(zip = zip_open(zipfile, 0, &zip_error)) ||
399 (r = zip_name_locate(zip, filename, 0)) < 0 ||
400 (zip_stat_index(zip, r, ZIP_FL_UNCHANGED, &stat) < 0) ||
401 !(unzipped = zip_fopen_index(zip, r, ZIP_FL_UNCHANGED)) ) {
402 if (unzipped)
403 zip_fclose(unzipped);
404 if (zip)
405 zip_close(zip);
406 r = -1;
408 #endif
410 if(-1 == r) {
411 fprintf(stderr,
412 "Can't read from %s: Is it an OpenDocument Text?\n", zipfile);
413 exit(EXIT_FAILURE);
416 #ifdef USE_KUNZIP
417 content = kunzip_next_tobuf((char*)zipfile, r);
418 #else
419 if ( !(buf = ymalloc(stat.size + 1)) ||
420 ((zip_uint64_t)zip_fread(unzipped, buf, stat.size) != stat.size) ||
421 !(content = strbuf_slurp_n(buf, stat.size)) ) {
422 if (buf)
423 yfree(buf);
424 content = NULL;
426 zip_fclose(unzipped);
427 zip_close(zip);
428 #endif
430 if (!content) {
431 fprintf(stderr,
432 "Can't extract %s from %s. Maybe the file is corrupted?\n",
433 filename, zipfile);
434 exit(EXIT_FAILURE);
437 return content;
440 static STRBUF *read_from_xml(const char *xmlfile, const char *filename)
442 FILE *in = fopen(xmlfile, "rb");
443 if (in == 0) {
444 fprintf(stderr, "Can't open %s.\n", filename);
445 exit(EXIT_FAILURE);
448 STRBUF *content = strbuf_new();
449 strbuf_append_file(content, in);
451 fclose(in);
453 return content;
456 static void format_doc(STRBUF *buf, int raw_input)
458 /* FIXME: Convert buffer to utf-8 first. Are there
459 OpenOffice texts which are not utf8-encoded? */
461 if (raw_input) {
462 RS_O(".*<office:body>", "<office:body>"); /* only body */
463 RS_G("<office:binary-data>[^>]*</office:binary-data>", ""); /* remove binary */
466 /* remove soft-page-breaks. We don't need them and they may disturb later decoding */
467 RS_G("<text:soft-page-break/>", "");
468 /* same for xml-protected spaces */
469 RS_G("<text:s/>", " ");
471 /* headline, first level */
472 RS_E("<text:h[^>]*outline-level=\"1\"[^>]*>([^<]*)<[^>]*>", &h1);
473 RS_E("<text:h[^>]*>([^<]*)<[^>]*>", &h2); /* other headlines */
474 RS_G("<text:p [^>]*>", "\n\n"); /* normal paragraphs */
475 RS_G("</text:p>", "\n\n");
476 RS_G("<text:tab/>", " "); /* tabs */
477 RS_G("<text:line-break/>", "\n");
479 /* images */
480 RS_E("<draw:frame[^>]*draw:name=\"([^\"]*)\"[^>]*>", &image);
483 RS_G("<[^>]*>", ""); /* replace all remaining tags */
484 RS_G("\n +", "\n"); /* remove indentations, e.g. kword */
485 RS_G("\n{3,}", "\n\n"); /* remove large vertical spaces */
487 RS_G("&apos;", "'"); /* common entities */
488 RS_G("&amp;", "&");
489 RS_G("&quot;", "\"");
490 RS_G("&gt;", ">");
491 RS_G("&lt;", "<");
493 RS_O("^\n+", ""); /* blank lines at beginning and end of document */
494 RS_O("\n{2,}$", "\n");
497 int main(int argc, const char **argv)
499 struct stat st;
500 iconv_t ic;
501 STRBUF *wbuf;
502 STRBUF *docbuf;
503 STRBUF *outbuf;
504 int i = 1;
506 (void)setlocale(LC_ALL, "");
508 while (argv[i]) {
509 if (!strcmp(argv[i], "--raw")) {
510 opt_raw = 1;
511 i++; continue;
512 } else if (!strcmp(argv[i], "--raw-input")) {
513 opt_raw_input = 1;
514 i++; continue;
515 } else if (!strncmp(argv[i], "--encoding=", 11)) {
516 size_t arglen = strlen(argv[i]) - 10;
517 #ifdef iconvlist
518 if (!strcmp(argv[i] + 11, "list")) {
519 show_iconvlist();
521 #endif
522 opt_encoding = ymalloc(arglen);
523 memcpy(opt_encoding, argv[i] + 11, arglen);
524 i++; continue;
525 } else if (!strncmp(argv[i], "--width=", 8)) {
526 opt_width = atoi(argv[i] + 8);
527 if(opt_width < 3 && opt_width != -1) {
528 fprintf(stderr, "Invalid value for width: %s\n",
529 argv[i] + 8);
530 exit(EXIT_FAILURE);
532 i++; continue;
533 } else if (!strcmp(argv[i], "--force")) {
534 // ignore this setting
535 i++; continue;
536 } else if (!strncmp(argv[i], "--output=", 9)) {
537 if (*(argv[i] + 9) != '-') {
538 size_t arglen = strlen(argv[i]) - 8;
539 opt_output = ymalloc(arglen);
540 memcpy(opt_output, argv[i] + 9, arglen);
542 i++; continue;
543 } else if (!strncmp(argv[i], "--subst=", 8)) {
544 if (!strcmp(argv[i] + 8, "none"))
545 opt_subst = SUBST_NONE;
546 else if (!strcmp(argv[i] + 8, "some"))
547 opt_subst = SUBST_SOME;
548 else if (!strcmp(argv[i] + 8, "all"))
549 opt_subst = SUBST_ALL;
550 else {
551 fprintf(stderr, "Invalid value for --subst: %s\n",
552 argv[i] + 8);
553 exit(EXIT_FAILURE);
555 i++; continue;
556 } else if (!strcmp(argv[i], "--help")) {
557 usage();
558 } else if (!strcmp(argv[i], "--version")
559 || !strcmp(argv[i], "-v")) {
560 version_info();
561 } else if (!strcmp(argv[i], "-")) {
562 usage();
563 } else {
564 if(opt_filename)
565 usage();
566 opt_filename = argv[i];
567 i++; continue;
571 if(opt_encoding && !strcmp("show", opt_encoding)) {
572 yfree(opt_encoding);
573 opt_encoding = guess_encoding();
574 printf("%s\n", opt_encoding);
575 yfree(opt_encoding);
576 exit(EXIT_SUCCESS);
579 if(opt_raw)
580 opt_width = -1;
582 if(!opt_filename)
583 usage();
585 if(!opt_encoding) {
586 opt_encoding = guess_encoding();
589 ic = init_conv("UTF-8", opt_encoding);
591 if (0 != stat(opt_filename, &st)) {
592 fprintf(stderr, "%s: %s\n",
593 opt_filename, strerror(errno));
594 exit(EXIT_FAILURE);
597 /* read content.xml */
598 docbuf = opt_raw_input ?
599 read_from_xml(opt_filename, "content.xml") :
600 read_from_zip(opt_filename, "content.xml");
602 if (!opt_raw) {
603 subst_doc(ic, docbuf);
604 format_doc(docbuf, opt_raw_input);
607 wbuf = wrap(docbuf, opt_width);
609 /* remove all trailing whitespace */
610 (void) regex_subst(wbuf, " +\n", _REG_GLOBAL, "\n");
612 outbuf = conv(ic, wbuf);
614 if (opt_output)
615 write_to_file(outbuf, opt_output);
616 else
617 fwrite(strbuf_get(outbuf), strbuf_len(outbuf), 1, stdout);
619 finish_conv(ic);
620 strbuf_free(wbuf);
621 strbuf_free(docbuf);
622 strbuf_free(outbuf);
623 #ifndef NO_ICONV
624 yfree(opt_encoding);
625 #endif
626 if (opt_output)
627 yfree(opt_output);
629 return EXIT_SUCCESS;
632 static void write_to_file(STRBUF *outbuf, const char *filename)
634 int fd;
635 ssize_t len;
637 fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
638 if (fd == -1) {
639 fprintf(stderr, "Can't open %s: %s\n", filename, strerror(errno));
640 exit(EXIT_FAILURE);
643 len = write(fd, strbuf_get(outbuf), strbuf_len(outbuf));
644 if (len == -1) {
645 fprintf(stderr, "Can't write to %s: %s\n", filename, strerror(errno));
646 exit(EXIT_FAILURE);
649 close(fd);
653 #ifdef iconvlist
/*
 * Callback for iconvlist(): print the namescount entries of names on
 * one line of stdout, separated by single spaces.  data is not used.
 *
 * Always returns 0.
 */
static int print_one (unsigned int namescount, const char * const * names,
		      void *data)
{
	/* same type as namescount, to avoid a signed/unsigned comparison */
	unsigned int i;

	(void)data;

	for (i = 0; i < namescount; i++) {
		if (i > 0)
			putc(' ',stdout);
		fputs(names[i],stdout);
	}
	putc('\n',stdout);
	return 0;
}
668 static void show_iconvlist() {
669 iconvlist(print_one, NULL);
670 exit(EXIT_SUCCESS);
672 #endif