db2html: remove text handling for some tags
[gtk-doc.git] / gtkdoc / common.py
blob5871465120fbf30938c616a109d23aef82f0674a
1 # -*- python -*-
3 # gtk-doc - GTK DocBook documentation generator.
4 # Copyright (C) 2001 Damon Chaplin
5 # 2007-2016 Stefan Sauer
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 # Support both Python 2 and 3
23 from __future__ import print_function
25 from collections import OrderedDict
26 import logging
27 import os
28 import re
29 import subprocess
30 import sys
31 import six
32 import codecs
34 from . import config
def open_text(filename, mode='r', encoding='utf-8'):
    """Open a text file the same way on Python 2 and Python 3.

    In contrast to the builtin open(), utf-8 is used by default and not the
    locale encoding (which is ANSI on Windows for example, not very helpful).

    For Python 2, files are opened in text mode like with Python 3.

    Args:
      filename (str): path of the file to open
      mode (str): 'r' for reading or 'w' for writing
      encoding (str): the text encoding of the file

    Returns:
      file: the opened file object

    Raises:
      ValueError: if mode is neither 'r' nor 'w'
    """
    if mode != 'r' and mode != 'w':
        raise ValueError("mode %r not supported, must be 'r' or 'w'" % mode)

    if not six.PY3:
        # io.open() cannot be used on Python 2: its write method is too strict
        # and only accepts unicode instances, while not everything in the
        # codebase forces unicode at the moment. codecs.open() on the other
        # hand happily takes ASCII str and decodes it.
        return codecs.open(filename, mode, encoding=encoding)

    return open(filename, mode, encoding=encoding)
def setup_logging():
    """Check GTKDOC_TRACE environment variable and make stdout utf-8 safe.

    Set python log level to the value of the environment variable (DEBUG, INFO,
    WARNING, ERROR and CRITICAL) or INFO if the environment variable is empty.
    If the variable is not set at all, logging is not configured here.

    Afterwards, unless PYTHONIOENCODING is set, sys.stdout is replaced by a
    utf-8 writer when its encoding is unknown or is not utf-8.
    """
    log_level = os.environ.get('GTKDOC_TRACE')
    if log_level == '':
        log_level = 'INFO'
    if log_level:
        logging.basicConfig(stream=sys.stdout,
                            level=logging.getLevelName(log_level.upper()),
                            format='%(asctime)s:%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s')

    # The user explicitly chose an encoding, respect it.
    if 'PYTHONIOENCODING' in os.environ:
        return

    # When redirecting the output on python2 or if run with a non utf-8 locale
    # we get UnicodeEncodeError, so stdout is forced to utf-8 in that case.
    # The encoding name is normalized through the codec registry, as stdout
    # may report it as 'UTF-8', 'utf-8' or 'utf8' depending on the python
    # version and platform.
    encoding = sys.stdout.encoding
    is_utf8 = False
    if encoding:
        try:
            is_utf8 = codecs.lookup(encoding).name == 'utf-8'
        except LookupError:
            # Unknown encoding name: treat it like a non utf-8 stdout.
            is_utf8 = False
    if is_utf8:
        return

    sys.stdout.flush()
    if six.PY3:
        sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
    else:
        sys.stdout = codecs.getwriter('utf8')(sys.stdout)
def UpdateFileIfChanged(old_file, new_file, make_backup):
    """Replace old_file with new_file, but only if the content differs.

    The two files are compared byte by byte. If they are equal the new file is
    deleted and the old one is left untouched, so that its timestamp stays
    valid for dependency tracking. Otherwise the new file is moved into the
    place of the old one.

    Args:
      old_file (str): The pathname of the old file.
      new_file (str): The pathname of the new version of the file.
      make_backup (bool): True if a backup of the old file should be kept.
                          It will have the .bak suffix added to the file name.

    Returns:
      bool: False if the file hasn't changed, and True if it has (or if
            old_file did not exist before).
    """
    logging.debug("Comparing %s with %s...", old_file, new_file)

    if not os.path.exists(old_file):
        logging.debug("-> %s created.", old_file)
    else:
        with open(old_file, 'rb') as previous:
            previous_bytes = previous.read()
        with open(new_file, 'rb') as candidate:
            candidate_bytes = candidate.read()

        if previous_bytes == candidate_bytes:
            os.unlink(new_file)
            logging.debug("-> content is the same.")
            return False

        if not make_backup:
            os.unlink(old_file)
        else:
            backup = old_file + '.bak'
            # An older backup has to go first, so that the rename below
            # never finds an existing target.
            if os.path.exists(backup):
                os.unlink(backup)
            os.rename(old_file, backup)
        logging.debug("-> content differs.")

    os.rename(new_file, old_file)
    return True
def GetModuleDocDir(module_name):
    """Get the docdir for the given module via pkg-config

    Asks pkg-config for the 'prefix' variable of the module and appends
    'share/gtk-doc/html' to it.

    Args:
      module_name (string): The module, e.g. 'glib-2.0'

    Returns:
      str: the doc directory or None if pkg-config fails, cannot be run or
           reports no prefix for the module
    """
    try:
        prefix = subprocess.check_output(
            [config.pkg_config, '--variable=prefix', module_name], universal_newlines=True)
    except subprocess.CalledProcessError:
        return None
    except OSError as e:
        # pkg-config itself could not be started (e.g. it is not installed).
        logging.warning('Cannot run "%s": %s', config.pkg_config, e)
        return None

    prefix = prefix.strip()
    if not prefix:
        # Without a prefix the join below would yield a meaningless relative
        # path instead of a real doc directory.
        return None
    return os.path.join(prefix, 'share/gtk-doc/html')
def LogWarning(filename, line, message):
    """Print a warning in gcc style format ("file:line: warning: message").

    Args:
      filename (str): The file the warning comes from, "unknown" is printed
                      if it is empty or None
      line (int): line number in the file
      message (str): the warning message to print
    """
    if not filename:
        filename = "unknown"

    # TODO: write to stderr
    text = "%s:%d: warning: %s" % (filename, line, message)
    print(text)
def CreateValidSGMLID(xml_id):
    """Creates a valid SGML 'id' from the given string.

    According to http://www.w3.org/TR/html4/types.html#type-id "ID and NAME
    tokens must begin with a letter ([A-Za-z]) and may be followed by any number
    of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"),
    and periods (".")."

    SGML ids are case-insensitive, so ":CAPS" is appended to all all-caps
    identifiers to prevent name clashes. (It basically never is the case that
    mixed-case identifiers would collide.)

    Args:
      xml_id (str): The text to be converted into a valid SGML id.

    Returns:
      str: The converted id.
    """
    # Special case, '_' would end up as '' so we use 'gettext-macro' instead.
    if xml_id == '_':
        return "gettext-macro"

    # Drop separators, turn underscores and spaces into dashes and make sure
    # the id does not start with a dash.
    cleaned = re.sub(r'[,;]', '', xml_id)
    cleaned = re.sub(r'[_ ]', '-', cleaned).lstrip('-')

    # '::' has to be handled before ':' as the latter is a part of it.
    cleaned = cleaned.replace('::', '-').replace(':', '--')

    # Append ":CAPS" to all all-caps identifiers
    # FIXME: there are some inconsistencies here, we have index files containing e.g. TRUE--CAPS
    if cleaned.isupper() and not cleaned.endswith('-CAPS'):
        return cleaned + ':CAPS'

    return cleaned
197 # Parsing helpers (move to mkdb ?)
class ParseError(Exception):
    """Raised by the parsing helpers when a declaration cannot be parsed."""
def PreprocessStructOrEnum(declaration):
    """Trim a type declaration for display.

    Strips the private/protected sections and all comments from the
    declaration.

    Args:
      declaration (str): the type declaration (struct or enum)

    Returns:
      str: the trimmed declaration
    """
    spanning = re.MULTILINE | re.DOTALL

    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        # Remove private symbols: everything from a private/protected marker
        # up to the next public marker. Assume end of declaration if line
        # begins with '}'.
        (r'\n?[ \t]*/\*\s*<\s*(private|protected)\s*>\s*\*/.*?(?:/\*\s*<\s*public\s*>\s*\*/|(?=^\}))',
         '', spanning),
        # Remove all other comments: block comments on lines of their own ...
        (r'\n\s*/\*.*?\*/\s*\n', r'\n', spanning),
        # ... block comments that share a line with code ...
        (r'/\*([^*]+|\*(?!/))*\*/', r' ', 0),
        # ... '//' comments on a line of their own ...
        (r'\n\s*//.*?\n', r'\n', spanning),
        # ... and trailing '//' comments.
        (r'//.*', '', 0),
    )
    for pattern, replacement, flags in substitutions:
        declaration = re.sub(pattern, replacement, declaration, flags=flags)

    return declaration
# TODO: output_function_params is always passed as 0
# TODO: we always pass both functions
def ParseStructDeclaration(declaration, is_object, output_function_params, typefunc=None, namefunc=None):
    """Parse a struct declaration.

    Takes a structure (or union) declaration and breaks it into individual
    member declarations. Private parts are dropped; nested structs and unions
    are skipped.

    Args:
      declaration (str): the declaration to parse
      is_object (bool): true if this is an object structure; its members are
                        private until a /*< public >*/ marker is seen
      output_function_params (bool): true if full type is wanted for function pointer members
      typefunc (func): function to apply to type, called with the type name
                       and the type wrapped into a <type> tag
      namefunc (func): function to apply to name

    Returns:
      dict: map of (symbol, decl) pairs describing the public declaration

    Raises:
      ParseError: if the declaration does not contain "struct/union [NAME] {"
    """
    spanning = re.MULTILINE | re.DOTALL

    # For forward struct declarations just return an empty array.
    if re.search(r'(?:struct|union)\s+\S+\s*;', declaration, flags=spanning):
        return {}

    # Remove all private parts of the declaration
    # For objects, assume private: the lazy '.*?' swallows everything between
    # the opening brace and the first public marker (or the closing brace if
    # there is no public part at all).
    if is_object:
        declaration = re.sub(r'''((?:struct|union)\s+\w*\s*\{)
                                 .*?
                                 (?:/\*\s*<\s*public\s*>\s*\*/|(?=\}))''',
                             r'\1', declaration, flags=spanning | re.VERBOSE)

    # Remove g_iface, parent_instance and parent_class if they are first member
    declaration = re.sub(r'(\{)\s*(\w)+\s+(g_iface|parent_instance|parent_class)\s*;', r'\1', declaration)

    declaration = PreprocessStructOrEnum(declaration)

    if declaration.strip() == '':
        return {}

    # Prime match after "struct/union {" declaration
    match = re.search(r'(?:struct|union)\s+\w*\s*\{', declaration, flags=spanning)
    if not match:
        raise ParseError('Declaration "%s" does not begin with "struct/union [NAME] {"' % declaration)

    logging.debug('public fields in struct/union: %s', declaration)

    result = OrderedDict()

    # Treat lines in sequence, allowing singly nested anonymous structs and unions.
    for m in re.finditer(r'\s*([^{;]+(\{[^\}]*\}[^{;]+)?);', declaration[match.end():], flags=spanning):
        line = m.group(1)

        logging.debug('checking "%s"', line)

        # The closing brace (plus an optional name) ends the declaration.
        if re.search(r'^\s*\}\s*\w*\s*$', line):
            break

        # FIXME: Just ignore nested structs and unions for now
        if '{' in line:
            continue

        # ignore preprocessor directives
        line = re.sub(r'^#.*?\n\s*', '', line, flags=spanning)

        # The closing brace may only show up once the directives are gone.
        if re.search(r'^\s*\}\s*\w*\s*$', line):
            break

        func_match = re.search(r'''^
                 (const\s+|G_CONST_RETURN\s+|unsigned\s+|signed\s+|long\s+|short\s+)*(struct\s+|enum\s+)?  # mod1
                 (\w+)\s*                             # type
                 (\**(?:\s*restrict)?)\s*             # ptr1
                 (const\s+)?                          # mod2
                 (\**\s*)                             # ptr2
                 (const\s+)?                          # mod3
                 \(\s*\*\s*(\w+)\s*\)\s*              # name
                 \(([^)]*)\)\s*                       # func_params
                 $''', line, flags=re.VERBOSE)
        vars_match = re.search(r'''^
                 ((?:const\s+|volatile\s+|unsigned\s+|signed\s+|short\s+|long\s+)?)(struct\s+|enum\s+)? # mod1
                 (\w+)\s*                            # type
                 (\** \s* const\s+)?                 # mod2
                 (.*)                                # variables
                 $''', line, flags=re.VERBOSE)

        # Try to match structure members which are functions
        if func_match:
            mod1 = func_match.group(1) or ''
            if func_match.group(2):
                mod1 += func_match.group(2)
            func_type = func_match.group(3)
            ptr1 = func_match.group(4)
            mod2 = func_match.group(5) or ''
            ptr2 = func_match.group(6)
            mod3 = func_match.group(7) or ''
            name = func_match.group(8)
            func_params = func_match.group(9)
            ptype = func_type
            if typefunc:
                ptype = typefunc(func_type, '<type>%s</type>' % func_type)
            pname = name
            if namefunc:
                pname = namefunc(name)

            if output_function_params:
                result[name] = '%s%s%s%s%s%s&#160;(*%s)&#160;(%s)' % (
                    mod1, ptype, ptr1, mod2, ptr2, mod3, pname, func_params)
            else:
                result[name] = '%s&#160;()' % pname

        # Try to match normal struct fields of comma-separated variables/
        elif vars_match:
            mod1 = vars_match.group(1) or ''
            if vars_match.group(2):
                mod1 += vars_match.group(2)
            vtype = vars_match.group(3)
            ptype = vtype
            if typefunc:
                ptype = typefunc(vtype, '<type>%s</type>' % vtype)
            mod2 = vars_match.group(4) or ''
            if mod2:
                mod2 = ' ' + mod2
            var_list = vars_match.group(5)

            logging.debug('"%s" "%s" "%s" "%s"', mod1, vtype, mod2, var_list)

            # Non-breaking spaces keep the modifiers and the type together.
            mod1 = mod1.replace(' ', '&#160;')
            mod2 = mod2.replace(' ', '&#160;')

            for n in var_list.split(','):
                # Each variable can have any number of '*' before the identifier,
                # and be followed by any number of pairs of brackets or a bit field specifier.
                # e.g. *foo, ***bar, *baz[12][23], foo : 25.
                field = re.search(
                    r'^\s* (\**(?:\s*restrict\b)?) \s* (\w+) \s* (?: ((?:\[[^\]]*\]\s*)+) | (:\s*\d+)?) \s* $',
                    n, flags=re.VERBOSE)
                if not field:
                    logging.warning('Cannot parse struct field: "%s"', n)
                    continue

                ptrs = field.group(1)
                name = field.group(2)
                array = field.group(3) or ''
                bits = field.group(4)
                if bits:
                    bits = ' ' + bits
                else:
                    bits = ''
                # A trailing 'restrict' needs a space before the name.
                if ptrs and not ptrs.endswith('*'):
                    ptrs += ' '

                array = array.replace(' ', '&#160;')
                bits = bits.replace(' ', '&#160;')

                pname = name
                if namefunc:
                    pname = namefunc(name)

                result[name] = '%s%s%s&#160;%s%s%s%s;' % (mod1, ptype, mod2, ptrs, pname, array, bits)

                logging.debug('Matched line: %s%s%s %s%s%s%s', mod1, ptype, mod2, ptrs, pname, array, bits)

        else:
            logging.warning('Cannot parse structure field: "%s"', line)

    return result
def ParseEnumDeclaration(declaration):
    """Parse an enum declaration.

    Takes an enumeration declaration and breaks it into the names of its
    individual members.

    Args:
      declaration (str): the declaration to parse

    Returns:
      list: the names of the public enum members; an empty tuple for forward
            declarations and for declarations that are empty after trimming

    Raises:
      ParseError: if the declaration does not contain "enum [NAME] {"
    """
    spanning = re.MULTILINE | re.DOTALL

    # For forward struct declarations just return an empty array.
    if re.search(r'enum\s+\S+\s*;', declaration, flags=spanning):
        return ()

    declaration = PreprocessStructOrEnum(declaration)
    if declaration.strip() == '':
        return ()

    # Remove parenthesized expressions (in macros like GTK_BLAH = BLAH(1,3))
    # to avoid getting confused by commas they might contain. This doesn't
    # handle nested parentheses correctly.
    declaration = re.sub(r'\([^)\n]+\)', '', declaration)

    # Remove apostrophed characters (e.g. '}' or ',') values to avoid getting
    # confused with end of enumeration.
    # See https://bugzilla.gnome.org/show_bug.cgi?id=741305
    declaration = re.sub(r'\'.\'', '', declaration)

    # Remove comma from comma - possible whitespace - closing brace sequence
    # since it is legal in GNU C and C99 to have a trailing comma but doesn't
    # result in an actual enum member
    declaration = re.sub(r',(\s*})', r'\1', declaration)

    # Prime match after "typedef enum {" declaration
    opening = re.search(r'(typedef\s+)?enum\s*(\S+\s*)?\{', declaration, flags=spanning)
    if not opening:
        raise ParseError('Enum declaration "%s" does not begin with "typedef enum {" or "enum [NAME] {"' % declaration)

    logging.debug("public fields in enum: %s'", declaration)

    members = []

    # Treat lines in sequence.
    body = declaration[opening.end():]
    for chunk in re.finditer(r'\s*([^,\}]+)([,\}])', body, flags=spanning):
        line, terminator = chunk.group(1), chunk.group(2)

        # ignore preprocessor directives
        line = re.sub(r'^#.*?\n\s*', '', line, flags=spanning)

        plain = re.search(r'^(\w+)\s*(=.*)?$', line, flags=spanning)
        # Special case for GIOCondition, where the values are specified by
        # macros which expand to include the equal sign like '=1'.
        sysdef = re.search(r'^(\w+)\s*GLIB_SYSDEF_POLL', line, flags=spanning)
        if plain:
            members.append(plain.group(1))
        elif sysdef:
            members.append(sysdef.group(1))
        elif line.strip().startswith('#'):
            # Special case include of <gdk/gdkcursors.h>, just ignore it
            # Special case for #ifdef/#else/#endif, just ignore it
            # NOTE(review): this stops the whole scan instead of skipping only
            # this fragment - confirm that this is intended.
            break
        else:
            logging.warning('Cannot parse enumeration member: %s', line)

        if terminator == '}':
            break

    return members
def ParseFunctionDeclaration(declaration, typefunc, namefunc):
    """Parse a function declaration.

    Takes the parameter list of a function declaration and breaks it into
    individual parameter declarations. Every parameter has to be terminated
    by a comma or a newline. Parsing stops with a warning at the first
    parameter that cannot be understood.

    Args:
      declaration (str): the declaration to parse
      typefunc (func): function to apply to type, called with the type name
                       and the type wrapped into a <type> tag
      namefunc (func): function to apply to the formatted parameter

    Returns:
      dict: map of (symbol, decl) pairs describing the prototype
    """
    # allow alphanumerics, '_', '[' & ']' in param names, try to match a standard parameter
    #                 $1                                                                                                                                                 $2                               $3                                                                                            $4       $5
    plain_param = r'^\s*((?:(?:G_CONST_RETURN|G_GNUC_[A-Z_]+\s+|unsigned long|unsigned short|signed long|signed short|unsigned|signed|long|short|volatile|const)\s+)*)((?:struct\b|enum\b)?\s*\w+)\s*((?:(?:const\b|restrict\b|G_GNUC_[A-Z_]+\b)?\s*\*?\s*(?:const\b|restrict\b|G_GNUC_[A-Z_]+\b)?\s*)*)(\w+)?\s*((?:\[\S*\])*)\s*(?:G_GNUC_[A-Z_]+)?\s*[,\n]'
    # parameters which are functions
    #                $1                                                                    $2          $3      $4                         $5             $6                $7             $8
    func_param = r'^(const\s+|G_CONST_RETURN\s+|G_GNUC_[A-Z_]+\s+|signed\s+|unsigned\s+)*(struct\s+)?(\w+)\s*(\**)\s*(?:restrict\b)?\s*(const\s+)?\(\s*(\*[\s\*]*)\s*(\w+)\s*\)\s*\(([^)]*)\)\s*[,\n]'

    params = OrderedDict()
    index = 0

    while declaration:
        logging.debug('decl=[%s]', declaration)

        # skip whitespace and commas
        hit = re.match(r'^[\s,]+', declaration)
        if hit:
            declaration = declaration[hit.end():]
            continue

        hit = re.match(r'^void\s*[,\n]', declaration)
        if hit:
            declaration = declaration[hit.end():]
            if index != 0:
                logging.warning('void used as parameter %d in function %s', index, declaration)
            params['void'] = namefunc('<type>void</type>')
            index += 1
            continue

        hit = re.match(r'^\s*[_a-zA-Z0-9]*\.\.\.\s*[,\n]', declaration)
        if hit:
            declaration = declaration[hit.end():]
            params['...'] = namefunc('...')
            index += 1
            continue

        hit = re.match(plain_param, declaration)
        if hit:
            declaration = declaration[hit.end():]

            # Collapse all whitespace runs to make the output uniform.
            qualifiers = re.sub(r'\s+', ' ', hit.group(1) or '')
            type_name = re.sub(r'\s+', ' ', hit.group(2))
            pointer = re.sub(r'\s+', ' ', hit.group(3) or '')
            pointer = re.sub(r'\s+$', '', pointer)
            # A trailing const/restrict needs a space before the name.
            if pointer and not pointer.endswith('*'):
                pointer += ' '
            name = hit.group(4) or ''
            array = hit.group(5) or ''

            logging.debug('"%s" "%s" "%s" "%s" "%s"', qualifiers, type_name, pointer, name, array)

            # For e.g. "unsigned foo" the regex takes "foo" as the type, while
            # it really is the name of a parameter of type "unsigned".
            signed_prefix = re.search(r'^((un)?signed .*)\s?', qualifiers)
            if name == '' and signed_prefix:
                name = type_name
                type_name = signed_prefix.group(1)
                qualifiers = ''

            # Unnamed parameters get a numbered placeholder name.
            if name == '':
                name = 'Param' + str(index + 1)

            logging.debug('"%s" "%s" "%s" "%s" "%s"', qualifiers, type_name, pointer, name, array)

            xref = typefunc(type_name, '<type>%s</type>' % type_name)
            params[name] = namefunc('%s%s %s%s%s' % (qualifiers, xref, pointer, name, array))
            index += 1
            continue

        # Try to match parameters which are functions
        hit = re.match(func_param, declaration)
        if hit:
            declaration = declaration[hit.end():]

            mod1 = hit.group(1) or ''
            if hit.group(2):
                mod1 += hit.group(2)
            type_name = hit.group(3)
            ptr1 = hit.group(4)
            if ptr1 and not ptr1.endswith('*'):
                ptr1 += ' '
            mod2 = hit.group(5) or ''
            func_ptr = re.sub(r'\s+', ' ', hit.group(6))
            name = hit.group(7)
            func_params = hit.group(8) or ''

            logging.debug('"%s" "%s" "%s" "%s" "%s"', mod1, type_name, mod2, func_ptr, name)

            xref = typefunc(type_name, '<type>%s</type>' % type_name)
            params[name] = namefunc('%s%s%s%s (%s%s) (%s)' % (mod1, xref, ptr1, mod2, func_ptr, name, func_params))
            index += 1
            continue

        logging.warning('Cannnot parse args for function in "%s"', declaration)
        break

    return params
def ParseMacroDeclaration(declaration, namefunc):
    """Parse a macro declaration.

    Takes a function-like macro declaration ("#define NAME(a, b) ...") and
    breaks it into individual parameter declarations. Macros without a
    parameter list yield an empty result.

    Args:
      declaration (str): the declaration to parse
      namefunc (func): function to apply to name

    Returns:
      dict: map of (symbol, decl) pairs describing the macro
    """
    logging.debug('decl=[%s]', declaration)

    parsed = OrderedDict()

    header = re.search(r'^\s*#\s*define\s+\w+\(([^\)]*)\)', declaration)
    if not header:
        return parsed

    # The parameter list may be spread over several lines.
    arg_text = header.group(1).replace('\n', '')
    logging.debug('params=[%s]', arg_text)

    for raw in arg_text.split(','):
        arg = raw.strip()

        # Allow varargs variations
        if arg.endswith('...'):
            arg = '...'

        if arg:
            parsed[arg] = namefunc(arg)

    return parsed