c10e-html: strip more stuff
[gtk-doc.git] / gtkdoc / common.py
blobf62e6deb9d40b33519a2d5a6ba779c0909e1af25
1 # -*- python -*-
3 # gtk-doc - GTK DocBook documentation generator.
4 # Copyright (C) 2001 Damon Chaplin
5 # 2007-2016 Stefan Sauer
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 # Support both Python 2 and 3
23 from __future__ import print_function
25 from collections import OrderedDict
26 import logging
27 import os
28 import re
29 import subprocess
30 import sys
31 import six
32 import codecs
34 from . import config
def open_text(filename, mode='r', encoding='utf-8'):
    """Open a text file the same way on Python 2 and Python 3.

    Unlike the builtin open(), utf-8 is used by default and not the locale
    encoding (which is ANSI on Windows for example, not very helpful).

    On Python 2 the file is opened through codecs.open(), so text is encoded
    and decoded with the given encoding like on Python 3.

    Args:
        filename (str): path of the file to open
        mode (str): either 'r' (read) or 'w' (write)
        encoding (str): the text encoding of the file

    Returns:
        file: the opened file object

    Raises:
        ValueError: if mode is neither 'r' nor 'w'
    """
    if mode != 'r' and mode != 'w':
        raise ValueError("mode %r not supported, must be 'r' or 'w'" % mode)

    if sys.version_info[0] != 3:
        # We can't use io.open() here as its write method is too strict and
        # only allows unicode instances and not everything in the codebase
        # forces unicode at the moment. codecs.open() on the other hand
        # happily takes ASCII str and decodes it.
        return codecs.open(filename, mode, encoding=encoding)

    return open(filename, mode, encoding=encoding)
def setup_logging():
    """Check GTKDOC_TRACE environment variable.

    Set python log level to the value of the environment variable (DEBUG, INFO,
    WARNING, ERROR and CRITICAL, in any letter case) or WARNING if the
    environment variable is unset or empty. Log records are written to stdout.

    Unless PYTHONIOENCODING is set, sys.stdout is additionally replaced by a
    utf-8 writer when its current encoding is not utf-8.
    """
    log_level = os.environ.get('GTKDOC_TRACE') or 'WARNING'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.getLevelName(log_level.upper()),
                        format='%(asctime)s:%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s')

    # An explicit PYTHONIOENCODING is the user's choice, leave stdout alone.
    if 'PYTHONIOENCODING' in os.environ:
        return

    # When redirecting the output on python2 or if run with a non utf-8 locale
    # we get UnicodeEncodeError. Encoding names are case-insensitive and have
    # aliases ('UTF-8', 'utf-8', 'utf8', ...), so compare the canonical codec
    # name instead of the raw string.
    encoding = sys.stdout.encoding
    try:
        is_utf8 = bool(encoding) and codecs.lookup(encoding).name == 'utf-8'
    except LookupError:
        # Unknown encoding name: treat it like any other non utf-8 encoding.
        is_utf8 = False
    if is_utf8:
        return

    sys.stdout.flush()
    if six.PY3:
        sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
    else:
        sys.stdout = codecs.getwriter('utf8')(sys.stdout)
def UpdateFileIfChanged(old_file, new_file, make_backup):
    """Replace old_file by new_file, but only if their contents differ.

    The two files are compared byte by byte. If they are identical the new
    file is deleted and the old one is left untouched, otherwise the new file
    is moved into the place of the old one. This is used so we only change
    files if needed, so we can do proper dependency tracking.

    Args:
        old_file (str): The pathname of the old file.
        new_file (str): The pathname of the new version of the file.
        make_backup (bool): True if a backup of the old file should be kept.
                            It will have the .bak suffix added to the file name.

    Returns:
        bool: It returns False if the file hasn't changed, and True if it has.
    """
    logging.debug("Comparing %s with %s...", old_file, new_file)

    if not os.path.exists(old_file):
        logging.debug("-> %s created.", old_file)
    else:
        with open(old_file, 'rb') as previous:
            previous_bytes = previous.read()
        with open(new_file, 'rb') as candidate:
            candidate_bytes = candidate.read()

        if previous_bytes == candidate_bytes:
            # Nothing changed: discard the new file and keep the old one as is.
            os.unlink(new_file)
            logging.debug("-> content is the same.")
            return False

        if not make_backup:
            os.unlink(old_file)
        else:
            # An earlier backup is overwritten by the current old version.
            backup_name = old_file + '.bak'
            if os.path.exists(backup_name):
                os.unlink(backup_name)
            os.rename(old_file, backup_name)
        logging.debug("-> content differs.")

    os.rename(new_file, old_file)
    return True
def GetModuleDocDir(module_name):
    """Get the docdir for the given module via pkg-config

    Asks pkg-config for the 'prefix' variable of the module and appends
    'share/gtk-doc/html' to it.

    Args:
        module_name (string): The module, e.g. 'glib-2.0'

    Returns:
        str: the doc directory or None if pkg-config could not be run, does not
             know the module, or reports no prefix for it
    """
    try:
        prefix = subprocess.check_output([config.pkg_config, '--variable=prefix', module_name],
                                         universal_newlines=True)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: pkg-config exited with an error for this module.
        # OSError: the pkg-config executable itself could not be started.
        return None

    prefix = prefix.strip()
    if not prefix:
        # Without a prefix the join below would yield a path relative to the
        # current directory, which is not a doc directory of the module.
        return None
    return os.path.join(prefix, 'share/gtk-doc/html')
def LogWarning(filename, line, message):
    """Print a warning in gcc style format ("file:line: warning: message").

    Args:
        filename (str): The file the warning comes from, "unknown" is printed
                        if it is empty or None
        line (int): line number in the file
        message (str): the warning message to print
    """
    if not filename:
        filename = "unknown"

    # TODO: write to stderr
    print("%s:%d: warning: %s" % (filename, line, message))
def CreateValidSGMLID(xml_id):
    """Creates a valid SGML 'id' from the given string.

    According to http://www.w3.org/TR/html4/types.html#type-id "ID and NAME
    tokens must begin with a letter ([A-Za-z]) and may be followed by any number
    of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"),
    and periods (".")."

    When creating SGML IDS, we append ":CAPS" to all all-caps identifiers to
    prevent name clashes (SGML ids are case-insensitive). (It basically never is
    the case that mixed-case identifiers would collide.)

    Args:
        xml_id (str): The text to be converted into a valid SGML id.

    Returns:
        str: The converted id.
    """
    # Special case, '_' would end up as '' so we use 'gettext-macro' instead.
    if xml_id == '_':
        return "gettext-macro"

    # Drop separators, turn underscores and spaces into hyphens and make sure
    # the id does not start with a hyphen.
    sgml_id = xml_id.replace(',', '').replace(';', '')
    sgml_id = sgml_id.replace('_', '-').replace(' ', '-')
    sgml_id = sgml_id.lstrip('-')
    # '::' has to be handled before ':' as both contain a colon.
    sgml_id = sgml_id.replace('::', '-').replace(':', '--')

    # Append ":CAPS" to all all-caps identifiers
    # FIXME: there are some inconsistencies here, we have index files containing e.g. TRUE--CAPS
    if not sgml_id.isupper() or sgml_id.endswith('-CAPS'):
        return sgml_id
    return sgml_id + ':CAPS'
196 # Parsing helpers (move to mkdb ?)
class ParseError(Exception):
    """Raised when a declaration does not have the form a parser expects."""
def PreprocessStructOrEnum(declaration):
    """Trim a type declaration for display.

    Removes private sections and comments from the declaration.

    Args:
        declaration (str): the type declaration (struct or enum)

    Returns:
        str: the trimmed declaration
    """
    # Remove private symbols
    # A /*< private >*/ or /*< protected >*/ section extends to the next
    # /*< public >*/ marker.
    # Assume end of declaration if line begins with '}'
    declaration = re.sub(r'\n?[ \t]*/\*\s*<\s*(private|protected)\s*>\s*\*/.*?(?:/\*\s*<\s*public\s*>\s*\*/|(?=^\}))',
                         '', declaration, flags=re.MULTILINE | re.DOTALL)

    # Remove all other comments
    # Block comments on lines of their own disappear together with their line.
    declaration = re.sub(r'\n\s*/\*.*?\*/\s*\n', r'\n', declaration, flags=re.MULTILINE | re.DOTALL)
    # Remaining block comments are replaced by a space. The body is matched one
    # character at a time: a nested quantifier like ([^*]+|...)* backtracks
    # exponentially when a '/*' has no matching '*/'.
    declaration = re.sub(r'/\*(?:[^*]|\*(?!/))*\*/', r' ', declaration)
    # Line comments, first those on lines of their own, then trailing ones.
    declaration = re.sub(r'\n\s*//.*?\n', r'\n', declaration, flags=re.MULTILINE | re.DOTALL)
    declaration = re.sub(r'//.*', '', declaration)

    return declaration
227 # TODO: output_function_params is always passed as 0
228 # TODO: we always pass both functions
229 def ParseStructDeclaration(declaration, is_object, output_function_params, typefunc=None, namefunc=None):
230 """ Parse a struct declaration.
232 Takes a structure declaration and breaks it into individual type declarations.
234 Args:
235 declaration (str): the declaration to parse
236 is_object (bool): true if this is an object structure
237 output_function_params (bool): true if full type is wanted for function pointer members
238 typefunc (func): function to apply to type
239 namefunc (func): function to apply to name
241 Returns:
242 dict: map of (symbol, decl) pairs describing the public declaration
245 # For forward struct declarations just return an empty array.
246 if re.search(r'(?:struct|union)\s+\S+\s*;', declaration, flags=re.MULTILINE | re.DOTALL):
247 return {}
249 # Remove all private parts of the declaration
250 # For objects, assume private
251 if is_object:
252 declaration = re.sub(r'''((?:struct|union)\s+\w*\s*\{)
254 (?:/\*\s*<\s*public\s*>\s*\*/|(?=\}))''',
255 r'\1', declaration, flags=re.MULTILINE | re.DOTALL | re.VERBOSE)
257 # Remove g_iface, parent_instance and parent_class if they are first member
258 declaration = re.sub(r'(\{)\s*(\w)+\s+(g_iface|parent_instance|parent_class)\s*;', r'\1', declaration)
260 declaration = PreprocessStructOrEnum(declaration)
262 if declaration.strip() == '':
263 return {}
265 # Prime match after "struct/union {" declaration
266 match = re.search(r'(?:struct|union)\s+\w*\s*\{', declaration, flags=re.MULTILINE | re.DOTALL)
267 if not match:
268 raise ParseError('Declaration "%s" does not begin with "struct/union [NAME] {"' % declaration)
270 logging.debug('public fields in struct/union: %s', declaration)
272 result = OrderedDict()
274 # Treat lines in sequence, allowing singly nested anonymous structs and unions.
275 for m in re.finditer(r'\s*([^{;]+(\{[^\}]*\}[^{;]+)?);', declaration[match.end():], flags=re.MULTILINE | re.DOTALL):
276 line = m.group(1)
278 logging.debug('checking "%s"', line)
280 if re.search(r'^\s*\}\s*\w*\s*$', line):
281 break
283 # FIXME: Just ignore nested structs and unions for now
284 if '{' in line:
285 continue
287 # ignore preprocessor directives
288 line = re.sub(r'^#.*?\n\s*', '', line, flags=re.MULTILINE | re.DOTALL)
290 if re.search(r'^\s*\}\s*\w*\s*$', line):
291 break
293 func_match = re.search(r'''^
294 (const\s+|G_CONST_RETURN\s+|unsigned\s+|signed\s+|long\s+|short\s+)*(struct\s+|enum\s+)? # mod1
295 (\w+)\s* # type
296 (\**(?:\s*restrict)?)\s* # ptr1
297 (const\s+)? # mod2
298 (\**\s*) # ptr2
299 (const\s+)? # mod3
300 \(\s*\*\s*(\w+)\s*\)\s* # name
301 \(([^)]*)\)\s* # func_params
302 $''', line, flags=re.VERBOSE)
303 vars_match = re.search(r'''^
304 ((?:const\s+|volatile\s+|unsigned\s+|signed\s+|short\s+|long\s+)?)(struct\s+|enum\s+)? # mod1
305 (\w+)\s* # type
306 (\** \s* const\s+)? # mod2
307 (.*) # variables
308 $''', line, flags=re.VERBOSE)
310 # Try to match structure members which are functions
311 if func_match:
312 mod1 = func_match.group(1) or ''
313 if func_match.group(2):
314 mod1 += func_match.group(2)
315 func_type = func_match.group(3)
316 ptr1 = func_match.group(4)
317 mod2 = func_match.group(5) or ''
318 ptr2 = func_match.group(6)
319 mod3 = func_match.group(7) or ''
320 name = func_match.group(8)
321 func_params = func_match.group(9)
322 ptype = func_type
323 if typefunc:
324 ptype = typefunc(func_type, '<type>%s</type>' % func_type)
325 pname = name
326 if namefunc:
327 pname = namefunc(name)
329 if output_function_params:
330 result[name] = '%s%s%s%s%s%s&#160;(*%s)&#160;(%s)' % (
331 mod1, ptype, ptr1, mod2, ptr2, mod3, pname, func_params)
332 else:
333 result[name] = '%s&#160;()' % pname
335 # Try to match normal struct fields of comma-separated variables/
336 elif vars_match:
337 mod1 = vars_match.group(1) or ''
338 if vars_match.group(2):
339 mod1 += vars_match.group(2)
340 vtype = vars_match.group(3)
341 ptype = vtype
342 if typefunc:
343 ptype = typefunc(vtype, '<type>%s</type>' % vtype)
344 mod2 = vars_match.group(4) or ''
345 if mod2:
346 mod2 = ' ' + mod2
347 var_list = vars_match.group(5)
349 logging.debug('"%s" "%s" "%s" "%s"', mod1, vtype, mod2, var_list)
351 mod1 = mod1.replace(' ', '&#160;')
352 mod2 = mod2.replace(' ', '&#160;')
354 for n in var_list.split(','):
355 # Each variable can have any number of '*' before the identifier,
356 # and be followed by any number of pairs of brackets or a bit field specifier.
357 # e.g. *foo, ***bar, *baz[12][23], foo : 25.
358 m = re.search(
359 r'^\s* (\**(?:\s*restrict\b)?) \s* (\w+) \s* (?: ((?:\[[^\]]*\]\s*)+) | (:\s*\d+)?) \s* $',
360 n, flags=re.VERBOSE)
361 if m:
362 ptrs = m.group(1)
363 name = m.group(2)
364 array = m.group(3) or ''
365 bits = m.group(4)
366 if bits:
367 bits = ' ' + bits
368 else:
369 bits = ''
370 if ptrs and not ptrs.endswith('*'):
371 ptrs += ' '
373 array = array.replace(' ', '&#160;')
374 bits = bits.replace(' ', '&#160;')
376 pname = name
377 if namefunc:
378 pname = namefunc(name)
380 result[name] = '%s%s%s&#160;%s%s%s%s;' % (mod1, ptype, mod2, ptrs, pname, array, bits)
382 logging.debug('Matched line: %s%s%s %s%s%s%s', mod1, ptype, mod2, ptrs, pname, array, bits)
383 else:
384 logging.warning('Cannot parse struct field: "%s"', n)
386 else:
387 logging.warning('Cannot parse structure field: "%s"', line)
389 return result
392 def ParseEnumDeclaration(declaration):
393 """Parse an enum declaration.
395 This function takes a enumeration declaration and breaks it into individual
396 enum member declarations.
398 Args:
399 declaration (str): the declaration to parse
401 Returns:
402 str: list of strings describing the public declaration
405 # For forward struct declarations just return an empty array.
406 if re.search(r'enum\s+\S+\s*;', declaration, flags=re.MULTILINE | re.DOTALL):
407 return ()
409 declaration = PreprocessStructOrEnum(declaration)
411 if declaration.strip() == '':
412 return ()
414 result = []
416 # Remove parenthesized expressions (in macros like GTK_BLAH = BLAH(1,3))
417 # to avoid getting confused by commas they might contain. This doesn't
418 # handle nested parentheses correctly.
419 declaration = re.sub(r'\([^)\n]+\)', '', declaration)
421 # Remove apostrophed characters (e.g. '}' or ',') values to avoid getting
422 # confused with end of enumeration.
423 # See https://bugzilla.gnome.org/show_bug.cgi?id=741305
424 declaration = re.sub(r'\'.\'', '', declaration)
426 # Remove comma from comma - possible whitespace - closing brace sequence
427 # since it is legal in GNU C and C99 to have a trailing comma but doesn't
428 # result in an actual enum member
429 declaration = re.sub(r',(\s*})', r'\1', declaration)
431 # Prime match after "typedef enum {" declaration
432 match = re.search(r'(typedef\s+)?enum\s*(\S+\s*)?\{', declaration, flags=re.MULTILINE | re.DOTALL)
433 if not match:
434 raise ParseError('Enum declaration "%s" does not begin with "typedef enum {" or "enum [NAME] {"' % declaration)
436 logging.debug("public fields in enum: %s'", declaration)
438 # Treat lines in sequence.
439 for m in re.finditer(r'\s*([^,\}]+)([,\}])', declaration[match.end():], flags=re.MULTILINE | re.DOTALL):
440 line = m.group(1)
441 terminator = m.group(2)
443 # ignore preprocessor directives
444 line = re.sub(r'^#.*?\n\s*', '', line, flags=re.MULTILINE | re.DOTALL)
446 m1 = re.search(r'^(\w+)\s*(=.*)?$', line, flags=re.MULTILINE | re.DOTALL)
447 # Special case for GIOCondition, where the values are specified by
448 # macros which expand to include the equal sign like '=1'.
449 m2 = re.search(r'^(\w+)\s*GLIB_SYSDEF_POLL', line, flags=re.MULTILINE | re.DOTALL)
450 if m1:
451 result.append(m1.group(1))
452 elif m2:
453 result.append(m2.group(1))
454 elif line.strip().startswith('#'):
455 # Special case include of <gdk/gdkcursors.h>, just ignore it
456 # Special case for #ifdef/#else/#endif, just ignore it
457 break
458 else:
459 logging.warning('Cannot parse enumeration member: %s', line)
461 if terminator == '}':
462 break
464 return result
def ParseFunctionDeclaration(declaration, typefunc, namefunc):
    """Parse a function declaration.

    This function takes a function declaration and breaks it into individual
    parameter declarations. Every parameter has to be terminated by a comma or
    a newline; parsing stops with a warning at the first parameter that cannot
    be parsed.

    Args:
        declaration (str): the declaration to parse
        typefunc (func): function to apply to type, called with the type name
                         and the type name wrapped in <type> markup
        namefunc (func): function to apply to name, called with the complete
                         parameter declaration

    Returns:
        dict: map of (symbol, decl) pairs describing the prototype, in
              parameter order. Unnamed parameters are called 'Param<N>'.
    """
    # allow alphanumerics, '_', '[' & ']' in param names, try to match a standard parameter
    #                  $1 = modifiers, $2 = type, $3 = pointer, $4 = name, $5 = array
    param_regex = r'^\s*((?:(?:G_CONST_RETURN|G_GNUC_[A-Z_]+\s+|unsigned long|unsigned short|signed long|signed short|unsigned|signed|long|short|volatile|const)\s+)*)((?:struct\b|enum\b)?\s*\w+)\s*((?:(?:const\b|restrict\b|G_GNUC_[A-Z_]+\b)?\s*\*?\s*(?:const\b|restrict\b|G_GNUC_[A-Z_]+\b)?\s*)*)(\w+)?\s*((?:\[\S*\])*)\s*(?:G_GNUC_[A-Z_]+)?\s*[,\n]'

    # Parameters which are functions
    #                  $1 = modifiers, $2 = struct, $3 = type, $4 = pointer, $5 = const,
    #                  $6 = function pointer stars, $7 = name, $8 = parameters
    # All leading modifiers are captured as one group: a repeated capturing
    # group would only keep the last one ("const unsigned" -> "unsigned").
    func_param_regex = r'^((?:const\s+|G_CONST_RETURN\s+|G_GNUC_[A-Z_]+\s+|signed\s+|unsigned\s+)*)(struct\s+)?(\w+)\s*(\**)\s*(?:restrict\b)?\s*(const\s+)?\(\s*(\*[\s\*]*)\s*(\w+)\s*\)\s*\(([^)]*)\)\s*[,\n]'

    result = OrderedDict()

    param_num = 0
    while declaration:
        logging.debug('decl=[%s]', declaration)

        # skip whitespace and commas
        declaration, n = re.subn(r'^[\s,]+', '', declaration)
        if n:
            continue

        declaration, n = re.subn(r'^void\s*[,\n]', '', declaration)
        if n:
            if param_num != 0:
                logging.warning('void used as parameter %d in function %s', param_num, declaration)
            result['void'] = namefunc('<type>void</type>')
            param_num += 1
            continue

        # varargs, optionally named like in 'args...'
        declaration, n = re.subn(r'^\s*[_a-zA-Z0-9]*\.\.\.\s*[,\n]', '', declaration)
        if n:
            result['...'] = namefunc('...')
            param_num += 1
            continue

        m = re.match(param_regex, declaration)
        if m:
            # The regex is anchored, so the match is a prefix of the declaration.
            declaration = declaration[m.end():]

            pre = m.group(1) or ''
            ptype = m.group(2)
            ptr = m.group(3) or ''
            name = m.group(4) or ''
            array = m.group(5) or ''

            pre = re.sub(r'\s+', ' ', pre)
            ptype = re.sub(r'\s+', ' ', ptype)
            ptr = re.sub(r'\s+', ' ', ptr)
            ptr = re.sub(r'\s+$', '', ptr)
            # Separate a trailing 'const' or 'restrict' from the name.
            if ptr and not ptr.endswith('*'):
                ptr += ' '

            logging.debug('"%s" "%s" "%s" "%s" "%s"', pre, ptype, ptr, name, array)

            # For e.g. 'unsigned foo' the regex takes 'foo' as the type and
            # finds no name: the modifier is the type and 'foo' the name.
            signed_match = re.search(r'^((un)?signed .*)\s?', pre)
            if name == '' and signed_match:
                name = ptype
                ptype = signed_match.group(1)
                pre = ''

            if name == '':
                name = 'Param' + str(param_num + 1)

            logging.debug('"%s" "%s" "%s" "%s" "%s"', pre, ptype, ptr, name, array)

            xref = typefunc(ptype, '<type>%s</type>' % ptype)
            result[name] = namefunc('%s%s %s%s%s' % (pre, xref, ptr, name, array))
            param_num += 1
            continue

        # Try to match parameters which are functions
        m = re.match(func_param_regex, declaration)
        if m:
            declaration = declaration[m.end():]

            mod1 = m.group(1) or ''
            if m.group(2):
                mod1 += m.group(2)
            ptype = m.group(3)
            ptr1 = m.group(4)
            mod2 = m.group(5) or ''
            func_ptr = re.sub(r'\s+', ' ', m.group(6))
            name = m.group(7)
            func_params = m.group(8) or ''

            logging.debug('"%s" "%s" "%s" "%s" "%s"', mod1, ptype, mod2, func_ptr, name)

            xref = typefunc(ptype, '<type>%s</type>' % ptype)
            result[name] = namefunc('%s%s%s%s (%s%s) (%s)' % (mod1, xref, ptr1, mod2, func_ptr, name, func_params))
            param_num += 1
            continue

        logging.warning('Cannnot parse args for function in "%s"', declaration)
        break

    return result
def ParseMacroDeclaration(declaration, namefunc):
    """Parse a macro declaration.

    This function takes a macro declaration and breaks it into individual
    parameter declarations.

    Args:
        declaration (str): the declaration to parse
        namefunc (func): function to apply to name

    Returns:
        dict: map of (symbol, decl) pairs describing the macro, in parameter
              order; empty if the macro takes no parameters. Varargs
              parameters (also named ones like 'args...') are called '...'.
    """
    result = OrderedDict()

    logging.debug('decl=[%s]', declaration)

    m = re.search(r'^\s*#\s*define\s+\w+\(([^\)]*)\)', declaration)
    if m:
        # Join parameter lists that are wrapped over several lines. The
        # backslash of a line continuation is dropped together with the
        # newline, otherwise it would end up in the parameter name.
        params = re.sub(r'\\?\n', '', m.group(1))

        logging.debug('params=[%s]', params)

        for param in params.split(','):
            param = param.strip()

            # Allow varargs variations
            if param.endswith('...'):
                param = '...'

            if param != '':
                result[param] = namefunc(param)

    return result