beta-0.89.2
[luatex.git] / source / libs / zziplib / zziplib-0.13.62 / docs / zzipdoc / htm2dbk.py
blobec9685bfd3e0c4e1903d1675c279abdbc57c2b02
1 #! /usr/bin/env python
3 """
4 this file converts simple html text into a docbook xml variant.
5 The mapping of markups and links is far from perfect. But all we
6 want is the docbook-to-pdf converter and similar technology being
7 present in the world of docbook-to-anything converters. """
9 from datetime import date
10 import match
11 import sys
13 m = match.Match
class htm2dbk_conversion_base:
    """ ordered html-to-docbook substitution tables.

    Every entry is written as ``m()(pattern, flags) >> template`` where
    ``m`` is match.Match (an optional trailing ``>> 1`` is passed on to
    Match as well). The derived conversion class applies the entries of
    a table one after another, so the order of the entries matters:
    later rules see the output of earlier ones.
    NOTE(review): presumably Match compiles the pattern with Python's
    ``re`` module and uses the template for ``re`` substitution - the
    fixes below rely on that, confirm against match.py.

    regexlist  -- the rules for converting whole html pages.
    regexlist2 -- a much smaller table for text that already contains
                  docbook-like tags such as <screen>.
    """
    # All patterns are raw strings. In a plain string literal "\b" is the
    # backspace character, not the regex word boundary, so the former
    # non-raw patterns for <dt>, <dd>, <table> and the attribute quoting
    # could never match real markup.
    regexlist = [
        m()(r"</[hH]2>(.*)", "m") >> "</title>\n<subtitle>\\1</subtitle>",
        # "--filename--" is a placeholder that gets substituted later on.
        m()(r"<[hH]2>") >> "<sect1 id=\"--filename--\"><title>",
        m()(r"<[Pp]([> ])", "m") >> "<para\\1",
        m()(r"</[Pp]>") >> "</para>",
        m()(r"<(pre|PRE)>") >> "<screen>",
        m()(r"</(pre|PRE)>") >> "</screen>",
        m()(r"<[hH]3>") >> "<sect2><title>",
        m()(r"</[hH]3>((?:.(?!<sect2>))*.?)", "s") >> "</title>\\1</sect2>",
        m()(r"<!doctype [^<>]*>", "s") >> "",
        m()(r"<!DOCTYPE [^<>]*>", "s") >> "",
        # put quotes around unquoted percent / numeric attribute values
        m()(r"(<\w+\b[^<>]*\swidth=)(\d+\%)", "s") >> "\\1\"\\2\"",
        m()(r"(<\w+\b[^<>]*\s\w+=)(\d+)", "s") >> "\\1\"\\2\"",
        # The entity templates used to carry Perl escapes ("\&amp\;");
        # Python's re leaves such unknown escapes in the output, so the
        # backslashes are dropped here.
        m()(r"&&") >> "&amp;&amp;",
        m()(r"\$\<") >> "$&lt;",
        m()(r"&(\w+[\),])") >> "&amp;\\1",
        m()(r"(</?)span(\s[^<>]*)?>", "s") >> "\\1phrase\\2>",
        m()(r"(</?)small(\s[^<>]*)?>", "s") >> "\\1note\\2>",
        m()(r"(</?)(b|em|i)>") >> "\\1emphasis>",
        m()(r"(</?)(li)>") >> "\\1listitem>",
        m()(r"(</?)(ul)>") >> "\\1itemizedlist>",
        m()(r"(</?)(ol)>") >> "\\1orderedlist>",
        m()(r"(</?)(dl)>") >> "\\1variablelist>",
        m()(r"<dt\b([^<>]*)>", "s") >> "<varlistentry\\1><term>",
        m()(r"</dt\b([^<>]*)>", "s") >> "</term>",
        m()(r"<dd\b([^<>]*)>", "s") >> "<listitem\\1>",
        m()(r"</dd\b([^<>]*)>", "s") >> "</listitem></varlistentry>",
        m()(r"<table\b([^<>]*)>", "s")
        >> "<informaltable\\1><tgroup cols=\"2\"><tbody>",
        m()(r"</table\b([^<>]*)>", "s") >> "</tbody></tgroup></informaltable>",
        m()(r"(</?)tr(\s[^<>]*)?>", "s") >> "\\1row\\2>",
        m()(r"(</?)td(\s[^<>]*)?>", "s") >> "\\1entry\\2>",
        # collapse a table whose first cell directly opens another table
        m()(r"<informaltable\b[^<>]*>\s*<tgroup\b[^<>]*>\s*<tbody>" +
            r"\s*<row\b[^<>]*>\s*<entry\b[^<>]*>\s*<informaltable\b", "s")
        >> "<informaltable",
        m()(r"</informaltable>\s*</entry>\s*</row>" +
            r"\s*</tbody>\s*</tgroup>\s*</informaltable>", "s")
        >> "</informaltable>",
        m()(r"(<informaltable[^<>]*\swidth=\"100\%\")", "s")
        >> "\\1 pgwide=\"1\"",
        m()(r"(<tbody>\s*<row[^<>]*>\s*<entry[^<>]*\s)(width=\"50\%\")", "s")
        >> "<colspec colwidth=\"1*\" /><colspec colwidth=\"1*\" />\n\\1\\2",
        m()(r"<nobr>([\'\`]*)<tt>") >> "<cmdsynopsis>\\1",
        m()(r"</tt>([\'\`]*)</nobr>") >> "\\1</cmdsynopsis>",
        m()(r"<nobr><(?:tt|code)>([\`\"\'])") >> "<cmdsynopsis>\\1",
        m()(r"<(?:tt|code)><nobr>([\`\"\'])") >> "<cmdsynopsis>\\1",
        m()(r"([\`\"\'])</(?:tt|code)></nobr>") >> "\\1</cmdsynopsis>",
        m()(r"([\`\"\'])</nobr></(?:tt|code)>") >> "\\1</cmdsynopsis>",
        m()(r"(</?)tt>") >> "\\1constant>",
        m()(r"(</?)code>") >> "\\1literal>",
        m()(r">([^<>]+)<br>", "s") >> "><highlights>\\1</highlights>",
        m()(r"<br>") >> "<br />",
        # m()("<date>") >> "<sect1info><date>",
        # m()("</date>") >> "</date></sect1info>",
        m()(r"<reference>") >> "<reference id=\"reference\">" >> 1,
        m()(r"<a\s+href=\"((?:http|ftp|mailto):[^<>]+)\"\s*>((?:.(?!</a>))*.)</a>",
            "s") >> "<ulink url=\"\\1\">\\2</ulink>",
        # The next two templates used Perl's $1/$2, which Python does not
        # expand; they now use \1/\2 like every other rule in this table.
        m()(r"<a\s+href=\"zziplib.html\#([\w_]+)\"\s*>((?:.(?!</a>))*.)</a>", "s")
        >> "<link linkend=\"\\1\">\\2</link>",
        m()(r"<a\s+href=\"(zziplib.html)\"\s*>((?:.(?!</a>))*.)</a>", "s")
        >> "<link linkend=\"reference\">\\2</link>",
        m()(r"<a\s+href=\"([\w-]+[.]html)\"\s*>((?:.(?!</a>))*.)</a>", "s")
        >> "<link linkend=\"\\1\">\\2</link>",
        m()(r"<a\s+href=\"([\w-]+[.](?:h|c|am|txt))\"\s*>((?:.(?!</a>))*.)</a>",
            "s") >> "<ulink url=\"file:\\1\">\\2</ulink>",
        m()(r"<a\s+href=\"([A-Z0-9]+[.][A-Z0-9]+)\"\s*>((?:.(?!</a>))*.)</a>", "s")
        >> "<ulink url=\"file:\\1\">\\2</ulink>"
        # m()("(</?)subtitle>") >> "\\1para>"
        # $_ .= "</sect1>" if /<sect1[> ]/
        ]
    regexlist2 = [
        m()(r"<br\s*/?>") >> "",
        m()(r"(</?)em>") >> r"\1emphasis>",
        m()(r"<code>") >> "<userinput>",
        m()(r"</code>") >> "</userinput>",
        m()(r"<link>") >> "<function>",
        m()(r"</link>") >> "</function>",
        # drop whitespace in front of a closing screen tag
        m()(r"(?s)\s*</screen>") >> "</screen>",
        # m()(r"<ul>") >> "</para><programlisting>\n",
        # m()(r"</ul>") >> "</programlisting><para>",
        m()(r"<ul>") >> "<itemizedlist>",
        m()(r"</ul>") >> "</itemizedlist>",
        # m()(r"<li>") >> "",
        # m()(r"</li>") >> ""
        m()(r"<li>") >> "<listitem><para>",
        m()(r"</li>") >> "</para></listitem>\n",
        ]
class htm2dbk_conversion(htm2dbk_conversion_base):
    """ applies the substitution tables of the base class to a text.

    version  -- replaces every "<!--VERSION-->" marker in the input.
    filename -- replaces the "--filename--" placeholder that the <h2>
                rule of regexlist puts into the sect1 id.
    """
    def __init__(self):
        self.version = "" # str(date.today)
        self.filename = "."
    def _apply(self, text, rules):
        """ substitute the version marker, then run every rule in order """
        txt = text.replace("<!--VERSION-->", self.version)
        for conv in rules:
            txt &= conv  # str has no '&', so the rule object handles it
        return txt
    def convert(self, text):
        """ convert an html page with regexlist and return the result;
            the "--filename--" placeholder is replaced by self.filename """
        return self._apply(text, self.regexlist).replace(
            "--filename--", self.filename)
    def convert2(self, text):
        """ convert a text with the smaller regexlist2 and return it.

        This used to walk regexlist as well, which made it a copy of
        convert() without the filename substitution and left regexlist2
        unused. regexlist2 never emits "--filename--", so nothing needs
        substituting afterwards. """
        return self._apply(text, self.regexlist2)
class htm2dbk_document(htm2dbk_conversion):
    """ collects converted html texts into one docbook <book>:
        create it, feed it with add(text), fetch the result with value() """
    # fixed pieces of the output document
    doctype = (
        '<!DOCTYPE book PUBLIC "-//OASIS//DTD'
        ' DocBook XML V4.1.2//EN"' "\n"
        ' "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">' "\n"
    )
    book_start = '<book><chapter><title>Documentation</title>' "\n"
    # set to "" on the instance once add() has written it out
    book_end_chapters = '</chapter>' "\n"
    book_end = '</book>' "\n"

    def __init__(self):
        htm2dbk_conversion.__init__(self)
        # the document under construction starts with doctype + book header
        self.text = self.doctype + self.book_start

    def add(self, text):
        """ convert the html text and append it to the document """
        # Once the text collected so far matches "<reference", the chapter
        # end tag is written out and blanked so that it appears only once.
        # NOTE(review): the test looks at the accumulated text, not at the
        # incoming one - confirm this is intended.
        if self.text & m()("<reference"):
            self.text += self.book_end_chapters
            self.book_end_chapters = ""
        converted = self.convert(text).replace("<br />", "")
        # bare <link> elements become <function> elements
        converted = converted & (
            m()("<link>([^<>]*)</link>") >> "<function>\\1</function>")
        # a <date> directly inside <refentryinfo> loses its <sect1info> wrapper
        converted = converted & (
            m()(r"(?s)(<refentryinfo>\s*)<sect1info>" +
                "(<date>[^<>]*</date>)</sect1info>") >> "\\1\\2")
        self.text += converted

    def value(self):
        """ return the collected text followed by the still pending end tags """
        closing = self.book_end_chapters + self.book_end
        return self.text + closing
def htm2dbk_files(args):
    """ convert the named html files into one docbook document.

    args -- iterable of file names; each file is read and added to a
            single htm2dbk_document in the given order.
    Returns the text of the complete document. A file that raises an
    IOError is reported on stderr and skipped; the others are still
    processed.
    """
    doc = htm2dbk_document()
    for filename in args:
        try:
            # 'with' closes the file even if read() or add() fails
            with open(filename, "r") as f:
                doc.filename = filename
                doc.add(f.read())
        except IOError:
            sys.stderr.write("can not open " + filename + "\n")
    return doc.value()
def html2docbook(text):
    """ html markup inside a C comment is mapped to docbook tags:
        the text is passed through convert2() of a fresh htm2dbk_conversion
        and the converted text is returned """
    converter = htm2dbk_conversion()
    return converter.convert2(text)
if __name__ == "__main__":
    # script mode: convert the files named on the command line and
    # write the resulting docbook document to stdout
    docbook = htm2dbk_files(sys.argv[1:])
    print(docbook)