scripts/convert-htmldoc-to-rst.py

   1 from bs4 import BeautifulSoup, NavigableString, Comment
   2 import sys
   3 import re
   4 import textwrap
   5
   6 whitespace = re.compile('[ \r\n]+')
   7
   8 def get_id_from_cfg(text):
   9     '''
  10     Formats anchor ID from config option.
  11     '''
  12     if text[:6] == '$cfg[\'':
  13         text = text[6:]
  14     if text[-2:] == '\']':
  15         text = text[:-2]
  16     text = text.replace('[$i]', '')
  17     parts = text.split("']['")
  18     return 'cfg_%s' % '_'.join(parts)
  19
  20 def format_content(tag, ignore_links = False, skip = (), document_mode = False):
  21     '''
  22     Parses inline html content.
  23     '''
  24     out = []
  25     for item in tag:
  26         if isinstance(item, NavigableString):
  27             text = whitespace.sub(' ', item.string)
  28             if text != '':
  29                 out.append(text.replace('*', '\\*').replace('_', '\\_'))
  30             continue
  31
  32         # skip comments
  33         if isinstance(item, Comment):
  34             continue
  35
  36         # skip breaks, they are mostly invalid anyway
  37         if item.name == 'br':
  38             continue
  39
  40         # skip images
  41         if item.name == 'img':
  42             continue
  43
  44         if item.name in skip:
  45             continue
  46
  47         if item.name == 'a' and 'href' in item.attrs:
  48             content = format_content(item)
  49             if ignore_links:
  50                 out.append(content)
  51                 continue
  52             href = item.attrs['href']
  53             if href[0] == '#':
  54                 if content == 'details' or 'see' in content:
  55                     out.append('see :ref:`%s`' % href[1:])
  56                 elif 'FAQ' in content:
  57                     out.append(':ref:`%s`' % href[1:])
  58                 else:
  59                     out.append(':ref:`%s`' % href[1:])
  60             else:
  61                 out.append('`%s <%s>`_' % (content, href))
  62             continue
  63         if item.name == 'code':
  64             out.append('``%s``' % format_content(item))
  65             continue
  66         if item.name == 'strong' or (item.name == 'span' and 'class' in item.attrs and 'important' in item.attrs['class']):
  67             out.append('**%s**' % format_content(item))
  68             continue
  69         if item.name == 'em':
  70             out.append('*%s*' % format_content(item))
  71             continue
  72         if item.name == 'abbr':
  73             out.append(':abbr:`%s (%s)`' % (format_content(item), item.attrs['title']))
  74             continue
  75         if item.name == 'sup':
  76             out.append(':sup:`%s`' % format_content(item))
  77             continue
  78         if item.name == 'sub':
  79             out.append(':sub:`%s`' % format_content(item))
  80             continue
  81         if item.name == 'span':
  82             out.append(format_content(item))
  83             continue
  84
  85         if document_mode:
  86             print textwrap.fill(''.join(out).strip()).encode('utf-8')
  87             print
  88             out = []
  89             parse_block(item)
  90             continue
  91
  92         print item.name
  93         print item.attrs
  94         raise Exception('Unknown tag')
  95     if document_mode:
  96         print textwrap.fill(''.join(out).strip()).encode('utf-8')
  97         print
  98     ret = ''.join(out)
  99     return ret.strip()
 100
 101 def print_id(tag):
 102     tagid = tag.get('id')
 103     if tagid is not None:
 104         print '.. _%s:' % tagid
 105     print
 106
 107 def parse_block(tag):
 108     '''
 109     Parses block tag.
 110     '''
 111     if tag.name == 'h2':
 112         sys.stdout.close()
 113         sys.stdout = open('%s.rst' % tag.get('id'), 'w')
 114         print_id(tag)
 115         print tag.text
 116         print '=' * len(tag.text)
 117         print
 118     elif tag.name == 'h3':
 119         print_id(tag)
 120         print tag.text
 121         print '+' * len(tag.text)
 122         print
 123     elif tag.name in ('h4', 'h5'):
 124         print_id(tag)
 125         text = format_content(tag, True)
 126         print text.encode('utf-8')
 127         print '-' * len(text)
 128         print
 129     elif tag.name == 'p':
 130         text = format_content(tag)
 131         print textwrap.fill(text).encode('utf-8')
 132         print
 133     elif tag.name in ('ul', 'ol'):
 134         if tag.name == 'ul':
 135             header = '*'
 136         else:
 137             header = '#.'
 138         for li in tag:
 139             # skip empty
 140             if isinstance(li, NavigableString) and li.string.strip() == '':
 141                 continue
 142
 143             # skip comments
 144             if isinstance(li, Comment):
 145                 continue
 146
 147             if li.name != 'li':
 148                 raise Exception('UL contains %s' % li.name)
 149             text = format_content(li, skip = ('ul', 'li', 'pre', 'p'))
 150             print header,
 151             indent = ' ' * (len(header) + 1)
 152             joiner = '\n%s' % indent
 153             print joiner.join(textwrap.wrap(text)).encode('utf-8')
 154             for item in li:
 155                 if isinstance(item, NavigableString):
 156                     # Already handle above
 157                     continue
 158                 if item.name == 'ul':
 159                     print
 160                     for lii in item:
 161                         if isinstance(lii, NavigableString) and lii.string.strip() == '':
 162                             continue
 163                         if lii.name != 'li':
 164                             raise Exception('UL contains %s' % lii.name)
 165                         text = format_content(lii)
 166                         print indent + '*',
 167                         joiner = '\n%s  ' % indent
 168                         print joiner.join(textwrap.wrap(text)).encode('utf-8')
 169                         print
 170                 elif item.name == 'pre':
 171                     print
 172                     print indent + '.. code-block:: none'
 173                     print
 174                     for line in item.text.splitlines():
 175                         print indent + '    ', line.strip().encode('utf-8')
 176                     print
 177
 178                     print
 179                 elif item.name == 'p':
 180                     text = format_content(item)
 181                     print textwrap.fill(text, initial_indent = indent).encode('utf-8')
 182                     print
 183         print
 184
 185     elif tag.name == 'dl':
 186         cfg = False
 187         for li in tag:
 188             # skip empty
 189             if isinstance(li, NavigableString) and li.string.strip() == '':
 190                 continue
 191
 192             # skip comments
 193             if isinstance(li, Comment):
 194                 continue
 195
 196             if li.name == 'dt':
 197                 dt_id = li.get('id')
 198                 cfg = dt_id is not None and ('cfg' in dt_id or 'servers' in dt_id or 'control' in dt_id or 'bookmark' in dt_id or 'table' in dt_id or 'pmadb' in dt_id or 'relation' in dt_id or 'col_com' in dt_id or 'history' in dt_id or 'recent' in dt_id or 'tracking' in dt_id or 'designer' in dt_id or 'Arbitrary' in dt_id or 'userconfig' in dt_id)
 199                 if cfg:
 200                     # Extract all IDs
 201                     ids = [dt_id]
 202                     for subtag in li:
 203                         if not isinstance(subtag, NavigableString) and subtag.get('id') is not None:
 204                             ids.append(subtag.get('id'))
 205                 else:
 206                     # Print all IDs
 207                     print_id(li)
 208                     for subtag in li:
 209                         if not isinstance(subtag, NavigableString) and subtag.get('id') is not None:
 210                             print_id(subtag)
 211                 # Extract text
 212                 if cfg:
 213                     options = []
 214                     text = ''
 215                     for subtag in li:
 216                         if isinstance(subtag, NavigableString):
 217                             text += subtag.string
 218                         elif subtag.name == 'span':
 219                             text += subtag.text
 220                         elif subtag.name == 'br':
 221                             options.append(text)
 222                             text = ''
 223                     if text != '':
 224                         options.append(text)
 225                     ids = set(ids)
 226                     config_options = []
 227                     for option in options:
 228                         if option.strip() == '':
 229                             continue
 230                         try:
 231                             optname, opttype = option.split(' ', 1)
 232                         except:
 233                             optname = option
 234                             opttype = ''
 235                         optname = optname.strip()
 236                         opttype = opttype.strip()
 237                         config_options.append((optname, opttype))
 238                         newid = get_id_from_cfg(optname)
 239                         if newid in ids:
 240                             ids.remove(newid)
 241
 242                     for anchor in ids:
 243                         print '.. _%s:' % anchor
 244
 245                     for optname, opttype in config_options:
 246                         print '.. config:option:: %s' % optname
 247                         print
 248                         print '    :type: %s' % opttype
 249                         print '    :default:'
 250                         print
 251                 else:
 252                     text = format_content(li).encode('utf-8')
 253                     print text
 254                     print '-' * len(text)
 255                     print
 256             elif li.name == 'dd':
 257                 format_content(li, document_mode = True)
 258             else:
 259                 print li.name
 260                 print li.attrs
 261                 raise Exception('Unknown tag')
 262     elif tag.name == 'pre':
 263         print '.. code-block:: none'
 264         print
 265         for line in tag.text.splitlines():
 266             print '   ', line.strip().encode('utf-8')
 267         print
 268     else:
 269         print tag.name
 270         print tag.attrs
 271         raise Exception('Unknown tag')
 272
 273
 274 s = BeautifulSoup(file(sys.argv[1]).read())
 275
 276 for tag in s.html.body.find(id = 'body'):
 277
 278     # skip empty
 279     if isinstance(tag, NavigableString) and tag.string.strip() == '':
 280         continue
 281
 282     # skip comments
 283     if isinstance(tag, Comment):
 284         continue
 285
 286     parse_block(tag)