bin/find-german-comments

   1 #!/usr/bin/env python
   2 ########################################################################
   3 #
   4 #  Copyright (c) 2010 Jonas Jensen, Miklos Vajna
   5 #
   6 #  Permission is hereby granted, free of charge, to any person
   7 #  obtaining a copy of this software and associated documentation
   8 #  files (the "Software"), to deal in the Software without
   9 #  restriction, including without limitation the rights to use,
  10 #  copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 #  copies of the Software, and to permit persons to whom the
  12 #  Software is furnished to do so, subject to the following
  13 #  conditions:
  14 #
  15 #  The above copyright notice and this permission notice shall be
  16 #  included in all copies or substantial portions of the Software.
  17 #
  18 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20 #  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21 #  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24 #  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25 #  OTHER DEALINGS IN THE SOFTWARE.
  26 #
  27 ########################################################################
  28
  29
  30 import sys, re, subprocess, os, optparse, string
  31
  32 class Parser:
  33     """
  34     This parser extracts comments from source files, tries to guess
  35     their language and then prints out the german ones.
  36     """
  37     def __init__(self):
  38         self.strip = string.punctuation + " \n"
  39         op = optparse.OptionParser()
  40         op.set_usage("%prog [options] <rootdir>\n\n" +
  41             "Searches for german comments in cxx/hxx source files inside a given root\n" +
  42             "directory recursively.")
  43         op.add_option("-f", "--filenames-only", action="store_true", dest="filenames_only", default=False,
  44             help="Only print the filenames of files containing German comments")
  45         op.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
  46             help="Turn on verbose mode (print progress to stderr)")
  47         op.add_option("-l", "--line-numbers", action="store_true", dest="line_numbers", default=False,
  48             help="Prints the filenames and line numbers only.")
  49         op.add_option("-t", "--threshold", action="store", dest="THRESHOLD", default=0,
  50             help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
  51         self.options, args = op.parse_args()
  52         try:
  53             dir = args[0]
  54         except IndexError:
  55             dir = "."
  56         self.check_source_files(dir)
  57
  58     def get_comments(self, filename):
  59         """
  60         Extracts the source code comments.
  61         """
  62         linenum = 0
  63         if self.options.verbose:
  64             sys.stderr.write("processing file '%s'...\n" % filename)
  65         sock = open(filename)
  66         # add an empty line to trigger the output of collected oneliner
  67         # comment group
  68         lines = sock.readlines() + ["\n"]
  69         sock.close()
  70
  71         in_comment = False
  72         buf = []
  73         count = 1
  74         for i in lines:
  75             if "//" in i and not in_comment:
  76                 # if we find a new //-style comment, then we
  77                 # just append it to a previous one if: there is
  78                 # only whitespace before the // mark that is
  79                 # necessary to make comments longer, giving
  80                 # more reliable output
  81                 if not len(re.sub("(.*)//.*", r"\1", i).strip(self.strip)):
  82                     s = re.sub(".*// ?", "", i).strip(self.strip)
  83                     if len(s):
  84                         buf.append(s)
  85                 else:
  86                     # otherwise it's an independent //-style comment in the next line
  87                     yield (count, "\n    ".join(buf))
  88                     buf = [re.sub(".*// ?", "", i.strip(self.strip))]
  89             elif "//" not in i and not in_comment and len(buf) > 0:
  90                 # first normal line after a // block
  91                 yield (count, "\n    ".join(buf))
  92                 buf = []
  93             elif "/*" in i and "*/" not in i and not in_comment:
  94                 # start of a real multiline comment
  95                 in_comment = True
  96                 linenum = count
  97                 s = re.sub(".*/\*+", "", i.strip(self.strip))
  98                 if len(s):
  99                     buf.append(s.strip(self.strip))
 100             elif in_comment and not "*/" in i:
 101                 # in multiline comment
 102                 s = re.sub("^( |\|)*\*?", "", i)
 103                 if len(s.strip(self.strip)):
 104                     buf.append(s.strip(self.strip))
 105             elif "*/" in i and in_comment:
 106                 # end of multiline comment
 107                 in_comment = False
 108                 s = re.sub(r"\*+/.*", "", i.strip(self.strip))
 109                 if len(s):
 110                     buf.append(s)
 111                 yield (count, "\n    ".join(buf))
 112                 buf = []
 113             elif "/*" in i and "*/" in i:
 114                 # c-style oneliner comment
 115                 yield (count, re.sub(".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
 116             count += 1
 117
 118     def get_lang(self, s):
 119         """ the output is 'german' or 'english' or 'german or english'. when
 120         unsure, just don't warn, there are strings where you just can't
 121         teremine the results reliably, like '#110680#' """
 122         cwd = os.getcwd()
 123         # change to our directory
 124         os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
 125         sock = subprocess.Popen(["text_cat/text_cat", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
 126         sock.stdin.write(s)
 127         sock.stdin.close()
 128         lang = sock.stdout.read().strip()
 129         sock.stdout.close()
 130         os.chdir(cwd)
 131         return lang
 132
 133     def is_german(self, s):
 134         """
 135         determines if a string is german or not
 136         """
 137         # for short strings we can't do reliable recognition, so skip
 138         # short strings and less than 4 words
 139         s = s.replace('\n', ' ')
 140         if len(s) < 32 or len(s.split()) < 4:
 141             return False
 142         return "german" == self.get_lang(s)
 143
 144     def check_file(self, path):
 145         """
 146         checks each comment in a file
 147         """
 148         def tab_calc (string):
 149             START = 40 #Default of 10 tabs
 150             if len(string) >= START:
 151                 return 1, 0
 152             diff = START - len(string)
 153             if diff % 4 is not 0:
 154                 padding = 1
 155             else:
 156                 padding = 0
 157             return (diff/4)+padding
 158
 159         if self.options.line_numbers:
 160             TABS = "\t"*10
 161             path_linenums = []
 162             for linenum, s in self.get_comments(path):
 163                 if self.is_german(s):
 164                     path_linenums.append(linenum)
 165             valid = len(path_linenums) > int(self.options.THRESHOLD)
 166             sys.stderr.write("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
 167             if valid:
 168                 if len(path) + (len(path_linenums)*4) > 75:
 169                     print "%s:\n" % path
 170                     while(path_linenums):
 171                         i = 0
 172                         numline = []
 173                         while i < 10:
 174                             try:
 175                                 numline.append(path_linenums[0])
 176                                 path_linenums.remove(path_linenums[0])
 177                             except IndexError:
 178                                 i = 10
 179                             i+=1
 180                         numline = [str(i) for i in numline]
 181                         print "%s%s" %(TABS, ",".join(numline))
 182                 else:
 183                     path_linenums = [str(i) for i in path_linenums]
 184                     print "%s:%s%s" % (path,"\t"*tab_calc(path),",".join(path_linenums))
 185
 186         elif not self.options.filenames_only:
 187             for linenum, s in self.get_comments(path):
 188                 if self.is_german(s):
 189                     print "%s:%s: %s" % (path, linenum, s)
 190         else:
 191             fnames = set([])
 192             for linenum, s in self.get_comments(path):
 193                 if self.is_german(s):
 194                     # Make sure we print each filename only once
 195                     fnames.add(path)
 196             # Print the filenames
 197             for f in fnames:
 198                 print f
 199
 200     def check_source_files(self, dir):
 201         """
 202         checks each _tracked_ file in a directory recursively
 203         """
 204         sock = os.popen(r"git ls-files '%s' |egrep '\.(c|h)xx$'" % dir)
 205         lines = sock.readlines()
 206         sock.close()
 207         for path in lines:
 208             self.check_file(path.strip())
 209
 210 try:
 211     Parser()
 212 except KeyboardInterrupt:
 213     print "Interrupted!"
 214     sys.exit(0)
 215
 216 # vim:set shiftwidth=4 softtabstop=4 expandtab: