This file converts simple HTML text into a DocBook XML variant.
The mapping of markup and links is far from perfect. But all we
want is access to the docbook-to-pdf converter and similar technology
available in the world of docbook-to-anything converters. """
import sys
from datetime import date
class htm2dbk_conversion_base:
    """Rewrite tables that map simple HTML markup onto DocBook markup.

    Every entry has the form ``m()(pattern[, flags]) >> replacement``, where
    ``m`` is the matcher factory defined elsewhere in this module.  The
    entries are order-dependent: for example ``<tt>`` nested in ``<nobr>``
    is rewritten to ``<cmdsynopsis>`` before the bare ``<tt>`` rule runs.

    Patterns are raw strings on purpose: in a normal string literal ``\\b``
    is a backspace character, so the word-boundary rules (``<dt\\b``,
    ``<table\\b``, ...) could never match real markup.
    """

    # Main table; iterated in order by the converter classes.
    regexlist = [
        m()(r"</[hH]2>(.*)", "m") >> "</title>\n<subtitle>\\1</subtitle>",
        m()(r"<[hH]2>") >> '<sect1 id="--filename--"><title>',
        m()(r"<[Pp]([> ])", "m") >> r"<para\1",
        m()(r"</[Pp]>") >> "</para>",
        m()(r"<(pre|PRE)>") >> "<screen>",
        m()(r"</(pre|PRE)>") >> "</screen>",
        m()(r"<[hH]3>") >> "<sect2><title>",
        m()(r"</[hH]3>((?:.(?!<sect2>))*.?)", "s") >> r"</title>\1</sect2>",
        m()(r"<!doctype [^<>]*>", "s") >> "",
        m()(r"<!DOCTYPE [^<>]*>", "s") >> "",
        # Quote bare attribute values (width=50% / border=0) for XML.
        m()(r"(<\w+\b[^<>]*\swidth=)(\d+%)", "s") >> r'\1"\2"',
        m()(r"(<\w+\b[^<>]*\s\w+=)(\d+)", "s") >> r'\1"\2"',
        # Escape stray ampersands and less-than signs.
        # NOTE(review): the replacements arrived as "\&\;" / "\$\<\;", i.e.
        # with the entity names stripped; restored as XML entities - confirm.
        m()(r"&&") >> "&amp;&amp;",
        m()(r"\$<") >> "$&lt;",
        m()(r"&(\w+[),])") >> r"&amp;\1",
        m()(r"(</?)span(\s[^<>]*)?>", "s") >> r"\1phrase\2>",
        m()(r"(</?)small(\s[^<>]*)?>", "s") >> r"\1note\2>",
        m()(r"(</?)(b|em|i)>") >> r"\1emphasis>",
        m()(r"(</?)(li)>") >> r"\1listitem>",
        m()(r"(</?)(ul)>") >> r"\1itemizedlist>",
        m()(r"(</?)(ol)>") >> r"\1orderedlist>",
        m()(r"(</?)(dl)>") >> r"\1variablelist>",
        m()(r"<dt\b([^<>]*)>", "s") >> r"<varlistentry\1><term>",
        m()(r"</dt\b([^<>]*)>", "s") >> "</term>",
        m()(r"<dd\b([^<>]*)>", "s") >> r"<listitem\1>",
        m()(r"</dd\b([^<>]*)>", "s") >> "</listitem></varlistentry>",
        m()(r"<table\b([^<>]*)>", "s")
        >> r'<informaltable\1><tgroup cols="2"><tbody>',
        m()(r"</table\b([^<>]*)>", "s") >> "</tbody></tgroup></informaltable>",
        m()(r"(</?)tr(\s[^<>]*)?>", "s") >> r"\1row\2>",
        m()(r"(</?)td(\s[^<>]*)?>", "s") >> r"\1entry\2>",
        # Collapse a table whose only cell holds another table.
        # NOTE(review): the replacement for this opening rule was missing
        # from the reviewed source; "<informaltable" mirrors the closing
        # rule right below - confirm.
        m()(r"<informaltable\b[^<>]*>\s*<tgroup\b[^<>]*>\s*<tbody>"
            r"\s*<row\b[^<>]*>\s*<entry\b[^<>]*>\s*<informaltable\b", "s")
        >> "<informaltable",
        m()(r"</informaltable>\s*</entry>\s*</row>"
            r"\s*</tbody>\s*</tgroup>\s*</informaltable>", "s")
        >> "</informaltable>",
        m()(r'(<informaltable[^<>]*\swidth="100%")', "s") >> r'\1 pgwide="1"',
        m()(r'(<tbody>\s*<row[^<>]*>\s*<entry[^<>]*\s)(width="50%")', "s")
        >> '<colspec colwidth="1*" /><colspec colwidth="1*" />\n\\1\\2',
        m()(r"<nobr>(['`]*)<tt>") >> r"<cmdsynopsis>\1",
        m()(r"</tt>(['`]*)</nobr>") >> r"\1</cmdsynopsis>",
        m()("<nobr><(?:tt|code)>([`\"'])") >> r"<cmdsynopsis>\1",
        m()("<(?:tt|code)><nobr>([`\"'])") >> r"<cmdsynopsis>\1",
        m()("([`\"'])</(?:tt|code)></nobr>") >> r"\1</cmdsynopsis>",
        m()("([`\"'])</nobr></(?:tt|code)>") >> r"\1</cmdsynopsis>",
        m()(r"(</?)tt>") >> r"\1constant>",
        m()(r"(</?)code>") >> r"\1literal>",
        m()(r">([^<>]+)<br>", "s") >> r"><highlights>\1</highlights>",
        m()(r"<br>") >> "<br />",
        # m()("<date>") >> "<sect1info><date>",
        # m()("</date>") >> "</date></sect1info>",
        # The trailing ">> 1" is passed on to the matcher; presumably a
        # replacement count - confirm against the matcher implementation.
        m()(r"<reference>") >> '<reference id="reference">' >> 1,
        m()(r'<a\s+href="((?:http|ftp|mailto):[^<>]+)"\s*>((?:.(?!</a>))*.)</a>',
            "s") >> r'<ulink url="\1">\2</ulink>',
        # Python's re uses \1 / \2 in replacements; the Perl-style $1 / $2
        # that stood here would have been emitted literally.
        m()(r'<a\s+href="zziplib.html\#([\w_]+)"\s*>((?:.(?!</a>))*.)</a>', "s")
        >> r'<link linkend="\1">\2</link>',
        m()(r'<a\s+href="(zziplib.html)"\s*>((?:.(?!</a>))*.)</a>', "s")
        >> r'<link linkend="reference">\2</link>',
        m()(r'<a\s+href="([\w-]+[.]html)"\s*>((?:.(?!</a>))*.)</a>', "s")
        >> r'<link linkend="\1">\2</link>',
        m()(r'<a\s+href="([\w-]+[.](?:h|c|am|txt))"\s*>((?:.(?!</a>))*.)</a>',
            "s") >> r'<ulink url="file:\1">\2</ulink>',
        m()(r'<a\s+href="([A-Z0-9]+[.][A-Z0-9]+)"\s*>((?:.(?!</a>))*.)</a>', "s")
        >> r'<ulink url="file:\1">\2</ulink>',
        # m()("(</?)subtitle>") >> "\\1para>"
        # $_ .= "</sect1>" if /<sect1[> ]/
    ]

    # Second, shorter table.
    # NOTE(review): the line that opened this list was missing from the
    # reviewed source, so the attribute name is a guess - confirm.  Nothing
    # visible in this module iterates it.
    regexlist2 = [
        m()(r"<br\s*/?>") >> "",
        m()(r"(</?)em>") >> r"\1emphasis>",
        m()(r"<code>") >> "<userinput>",
        m()(r"</code>") >> "</userinput>",
        m()(r"<link>") >> "<function>",
        m()(r"</link>") >> "</function>",
        m()(r"(?s)\s*</screen>") >> "</screen>",
        # m()(r"<ul>") >> "</para><programlisting>\n",
        # m()(r"</ul>") >> "</programlisting><para>",
        m()(r"<ul>") >> "<itemizedlist>",
        m()(r"</ul>") >> "</itemizedlist>",
        m()(r"<li>") >> "<listitem><para>",
        m()(r"</li>") >> "</para></listitem>\n",
    ]
class htm2dbk_conversion(htm2dbk_conversion_base):
    """Apply the inherited ``regexlist`` rewrite table to HTML text."""

    def __init__(self):
        # Substituted for the "<!--VERSION-->" marker in the input.
        self.version = ""  # str(date.today)
        # Substituted for the "--filename--" placeholder by convert().
        # NOTE(review): the line setting this default was not visible in
        # the reviewed source; "." is an assumption - confirm.
        self.filename = "."

    def convert(self, text):  # $text
        """Return *text* converted to DocBook with the filename filled in.

        Same as convert2(), then every "--filename--" placeholder is
        replaced by ``self.filename``.
        """
        return self.convert2(text).replace("--filename--", self.filename)

    def convert2(self, text):  # $text
        """Return *text* converted to DocBook, placeholders left untouched.

        The "<!--VERSION-->" marker is replaced by ``self.version`` first;
        then each entry of ``self.regexlist`` is applied in list order.
        """
        txt = text.replace("<!--VERSION-->", self.version)
        for conv in self.regexlist:
            # "text & rule" is how this module applies a rewrite rule (the
            # document class uses the same operator).
            # NOTE(review): the loop body was missing from the reviewed
            # source and is restored from that usage - confirm.
            txt &= conv
        return txt
class htm2dbk_document(htm2dbk_conversion):
    """ create document, add(text) and get the value() """

    # NOTE(review): the closing piece of this expression was missing from
    # the reviewed source; a trailing newline is assumed, matching the
    # other constants below - confirm.
    doctype = (
        '<!DOCTYPE book PUBLIC "-//OASIS//DTD' +
        ' DocBook XML V4.1.2//EN"' + "\n" +
        ' "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">' +
        "\n")
    book_start = '<book><chapter><title>Documentation</title>' + "\n"
    book_end_chapters = '</chapter>' + "\n"
    book_end = '</book>' + "\n"

    def __init__(self):
        htm2dbk_conversion.__init__(self)
        # Accumulated output; starts with the doctype and the book opening.
        self.text = self.doctype + self.book_start

    def add(self, text):
        """Convert the HTML in *text* and append it to the document."""
        # Once a <reference> has been emitted, close the chapter exactly
        # once: blanking the instance attribute stops a second close here
        # and in value().
        if self.text & m()("<reference"):
            self.text += self.book_end_chapters
            self.book_end_chapters = ""
        # NOTE(review): the arguments of this replace() call were missing
        # from the reviewed source; dropping the "<br />" produced by the
        # rewrite table is an assumption - confirm.
        converted = self.convert(text).replace("<br />", "")
        converted &= (
            m()("<link>([^<>]*)</link>") >> "<function>\\1</function>")
        # Unwrap <sect1info> around a <date> that directly follows
        # <refentryinfo>.
        converted &= (
            m()(r"(?s)(<refentryinfo>\s*)<sect1info>"
                r"(<date>[^<>]*</date>)</sect1info>") >> r"\1\2")
        self.text += converted

    def value(self):
        """Return the document text with any still-open elements closed."""
        return self.text + self.book_end_chapters + self.book_end
def htm2dbk_files(args):
    """Convert the HTML files named in *args* into one DocBook document.

    Each file is read and added to a single htm2dbk_document.  A file that
    cannot be opened or read is reported on stderr and skipped.  Returns
    the finished document text.

    NOTE(review): the try/except, the add() call and the return statement
    were missing from the reviewed source; they are restored from the
    "can not open" message, the document class docstring ("add(text) and
    get the value()") and the caller that prints this function's result -
    confirm.
    """
    doc = htm2dbk_document()
    for filename in args:
        try:
            # "with" closes the handle even when read() fails.
            with open(filename, "r") as f:
                text = f.read()
        except IOError:
            sys.stderr.write("can not open " + filename + "\n")
            continue
        # convert() substitutes this name for the "--filename--" placeholder.
        doc.filename = filename
        doc.add(text)
    return doc.value()
def html2docbook(text):
    """Rewrite HTML markup found in a C comment as DocBook tags.

    Runs *text* through a fresh htm2dbk_conversion using convert2(), i.e.
    without the "--filename--" substitution, and returns the result.
    """
    converter = htm2dbk_conversion()
    return converter.convert2(text)
if __name__ == "__main__":
    # Script entry point: convert the files named on the command line and
    # write the resulting document to stdout.  print(...) with a single
    # argument behaves the same on Python 2 and Python 3.
    print(htm2dbk_files(sys.argv[1:]))