1 """html2text: Turn HTML into equivalent Markdown-structured text."""
3 __author__
= "Aaron Swartz (me@aaronsw.com)"
4 __copyright__
= "(C) 2004 Aaron Swartz. GNU GPL 2."
5 __contributors__
= ["Martin 'Joey' Schulze", "Ricardo Reyes"]
8 # Support decoded entities with unifiable.
9 # Relative URL resolution
11 if not hasattr(__builtins__
, 'True'): True, False = 1, 0
12 import re
, sys
, urllib
, htmlentitydefs
, codecs
, StringIO
, types
14 sgmllib
.charref
= re
.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
16 try: from textwrap
import wrap
19 # Use Unicode characters instead of their ascii pseudo-replacements
22 # Put the links after each paragraph instead of at the end.
23 LINKS_EACH_PARAGRAPH
= 0
25 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
28 ### Entity Nonsense ###
31 if k
== 'apos': return ord("'")
32 if hasattr(htmlentitydefs
, "name2codepoint"): # requires Python 2.3
33 return htmlentitydefs
.name2codepoint
[k
]
35 k
= htmlentitydefs
.entitydefs
[k
]
36 if k
.startswith("&#") and k
.endswith(";"): return int(k
[2:-1]) # not in latin-1
37 return ord(codecs
.latin_1_decode(k
)[0])
# Plain-ASCII stand-ins for named HTML entities.  Keys are entity names
# (no leading '&' or trailing ';'); values are the replacement text.  The
# table is consulted only when UNICODE_SNOB is false.
unifiable = {
    # Typographic quotes.
    'rsquo': "'",
    'lsquo': "'",
    'rdquo': '"',
    'ldquo': '"',
    # Symbols and punctuation.
    'copy': '(C)',
    'mdash': '--',
    'nbsp': ' ',
    'rarr': '->',
    'larr': '<-',
    'middot': '*',
    'ndash': '-',
    # Ligatures.
    'oelig': 'oe',
    'aelig': 'ae',
    # Accented vowels collapse to the bare letter.
    'agrave': 'a',
    'aacute': 'a',
    'acirc': 'a',
    'atilde': 'a',
    'auml': 'a',
    'aring': 'a',
    'egrave': 'e',
    'eacute': 'e',
    'ecirc': 'e',
    'euml': 'e',
    'igrave': 'i',
    'iacute': 'i',
    'icirc': 'i',
    'iuml': 'i',
    'ograve': 'o',
    'oacute': 'o',
    'ocirc': 'o',
    'otilde': 'o',
    'ouml': 'o',
    'ugrave': 'u',
    'uacute': 'u',
    'ucirc': 'u',
    'uuml': 'u',
}
# Mirror every entry of `unifiable` into `unifiable_n`, keyed by the value
# name2cp() returns for the entity name rather than by the name itself.
# The loop variable stays `k` because it is a module-level name.
for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]
54 if name
[0] in ['x','X']:
59 if not UNICODE_SNOB
and c
in unifiable_n
.keys():
65 if not UNICODE_SNOB
and c
in unifiable
.keys():
69 except KeyError: return "&" + c
70 else: return unichr(name2cp(c
))
72 def replaceEntities(s
):
76 else: return entityref(s
)
# Matches one complete reference: '&', then an optional '#' and optional
# 'x'/'X' followed by either hex digits or a 1-8 character word, then ';'.
# Group 1 is everything between the '&' and the ';'.
r_unescape = re.compile(
    r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));"
)
80 return r_unescape
.sub(replaceEntities
, s
)
83 # Fix bug in sgmllib.py
84 if not attrs
: return attrs
87 newattrs
.append((attr
[0], unescape(attr
[1])))
90 ### End Entity Nonsense ###
93 """Return true if the line does only consist of whitespace characters."""
95 if c
is not ' ' and c
is not ' ':
100 """Wrap all paragraphs in the provided text."""
104 assert wrap
# Requires Python 2.3.
107 for para
in text
.split("\n"):
109 if para
[0] is not ' ' and para
[0] is not '-' and para
[0] is not '*':
110 for line
in wrap(para
, BODY_WIDTH
):
111 result
+= line
+ "\n"
115 if not onlywhite(para
):
116 result
+= para
+ "\n"
125 if tag
[0] == 'h' and len(tag
) == 2:
128 if n
in range(1, 10): return n
129 except ValueError: return 0
131 class _html2text(sgmllib
.SGMLParser
):
132 def __init__(self
, out
=sys
.stdout
.write
):
133 sgmllib
.SGMLParser
.__init
__(self
)
135 if out
is None: self
.out
= self
.outtextf
152 def outtextf(self
, s
):
153 if type(s
) is type(''): s
= codecs
.utf_8_decode(s
)[0]
157 sgmllib
.SGMLParser
.close(self
)
164 def handle_charref(self
, c
):
167 def handle_entityref(self
, c
):
def unknown_starttag(self, tag, attrs):
    """Forward an opening tag to handle_tag with the start flag set to 1."""
    opening = 1
    self.handle_tag(tag, attrs, opening)
def unknown_endtag(self, tag):
    """Forward a closing tag to handle_tag with attrs None and start flag 0."""
    closing = 0
    self.handle_tag(tag, None, closing)
176 def previousIndex(self
, attrs
):
177 """ returns the index of certain set of attributes (of a link) in the
180 If the set of attributes is not found, returns None
182 if not attrs
.has_key('href'): return None
189 if a
.has_key('href') and a
['href'] == attrs
['href']:
190 if a
.has_key('title') or attrs
.has_key('title'):
191 if (a
.has_key('title') and attrs
.has_key('title') and
192 a
['title'] == attrs
['title']):
199 def handle_tag(self
, tag
, attrs
, start
):
200 attrs
= fixattrs(attrs
)
204 if start
: self
.o(hn(tag
)*"#" + ' ')
206 if tag
in ['p', 'div']: self
.p()
208 if tag
== "br" and start
: self
.o(" \n")
210 if tag
== "hr" and start
:
215 if tag
in ["head", "style", 'script']:
216 if start
: self
.quiet
+= 1
217 else: self
.quiet
-= 1
219 if tag
== "blockquote":
221 self
.p(); self
.o('> ', 0, 1); self
.start
= 1
227 if tag
in ['em', 'i', 'u']: self
.o("_")
228 if tag
in ['strong', 'b']: self
.o("**")
229 if tag
== "code" and not self
.pre
: self
.o('`') #TODO: `` `this` ``
234 for (x
, y
) in attrs
: attrsD
[x
] = y
236 if attrs
.has_key('href'):
237 self
.astack
.append(attrs
)
240 self
.astack
.append(None)
243 a
= self
.astack
.pop()
245 i
= self
.previousIndex(a
)
250 a
['count'] = self
.acount
251 a
['outcount'] = self
.outcount
253 self
.o("][" + `a
['count']`
+ "]")
255 if tag
== "img" and start
:
257 for (x
, y
) in attrs
: attrsD
[x
] = y
259 if attrs
.has_key('src'):
260 attrs
['href'] = attrs
['src']
261 alt
= attrs
.get('alt', '')
262 i
= self
.previousIndex(attrs
)
267 attrs
['count'] = self
.acount
268 attrs
['outcount'] = self
.outcount
272 self
.o("]["+`attrs
['count']`
+"]")
274 if tag
in ["ol", "ul"]:
276 self
.list.append({'name':tag
, 'num':0})
278 if self
.list: self
.list.pop()
285 if self
.list: li
= self
.list[-1]
286 else: li
= {'name':'ul', 'num':0}
287 self
.o(" "*len(self
.list)) #TODO: line up <ol><li>s > 9 correctly.
288 if li
['name'] == "ul": self
.o("* ")
289 elif li
['name'] == "ol":
291 self
.o(`li
['num']`
+". ")
296 if tag
in ['tr']: self
.pbr()
307 if self
.p_p
== 0: self
.p_p
= 1
def p(self):
    """Set the p_p counter to 2.

    The value is read later when output is written, where it is used as a
    repeat count for the newline prefix.
    """
    self.p_p = 2
311 def o(self
, data
, puredata
=0, force
=0):
313 if puredata
and not self
.pre
:
314 data
= re
.sub('\s+', ' ', data
)
315 if data
and data
[0] == ' ':
318 if not data
and not force
: return
321 #self.out(" :") #TODO: not output when already one there
324 bq
= (">" * self
.blockquote
)
325 if not (force
and data
and data
[0] == ">") and self
.blockquote
: bq
+= " "
329 data
= data
.replace("\n", "\n"+bq
)
344 self
.out(('\n'+bq
)*self
.p_p
)
348 if not self
.lastWasNL
: self
.out(' ')
351 if self
.a
and ((self
.p_p
== 2 and LINKS_EACH_PARAGRAPH
) or force
== "end"):
352 if force
== "end": self
.out("\n")
356 if self
.outcount
> link
['outcount']:
357 self
.out(" ["+`link
['count']`
+"]: " + link
['href']) #TODO: base href
358 if link
.has_key('title'): self
.out(" ("+link
['title']+")")
363 if self
.a
!= newa
: self
.out("\n") # Don't need an extra line when nothing was done.
369 self
.lastWasNL
= data
and data
[-1] == '\n'
372 def handle_data(self
, data
):
def unknown_decl(self, data):
    """Ignore the declaration; data is not used."""
    return None
377 def html2text_file(html
, out
=sys
.stdout
.write
):
384 return optwrap(html2text_file(html
, None))
386 if __name__
== "__main__":
389 if arg
.startswith('http://'):
390 data
= urllib
.urlopen(arg
).read()
392 data
= open(arg
, 'r').read()
394 data
= sys
.stdin
.read()