2 ########################################################################
4 # Copyright (c) 2010 Jonas Jensen, Miklos Vajna
6 # Permission is hereby granted, free of charge, to any person
7 # obtaining a copy of this software and associated documentation
8 # files (the "Software"), to deal in the Software without
9 # restriction, including without limitation the rights to use,
10 # copy, modify, merge, publish, distribute, sublicense, and/or sell
11 # copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 # OTHER DEALINGS IN THE SOFTWARE.
27 ########################################################################
39 This parser extracts comments from source files, tries to guess
40 their language and then prints out the German ones.
43 self.strip = string.punctuation + " \n"
44 self.text_cat = self.start_text_cat()
45 parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
46 parser.add_argument("-f", "--filenames-only", action="store_true",
47 help="Only print the filenames of files containing German comments")
48 parser.add_argument("-v", "--verbose", action="store_true",
49 help="Turn on verbose mode (print only positives progress to stderr)")
50 parser.add_argument("-l", "--line-numbers", action="store_true",
51 help="Prints the filenames and line numbers only.")
52 parser.add_argument("-L", "--line-numbers-pos", action="store_true",
53 help="Prints the filenames and line numbers only (if positive).")
54 parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
55 help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
56 parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
57 self.args = parser.parse_args()
58 self.check_source_files(self.args.directory)
def get_comments(self, filename):
    """
    Extracts the source code comments.

    Generator over the C/C++ comments of ``filename``. Consecutive
    ``//`` lines that have nothing but whitespace/punctuation in front
    of the ``//`` are merged into one group, and a ``/* ... */`` block is
    reported as one comment.

    Yields ``(line_number, text)`` tuples. ``line_number`` is the 1-based
    line at which the comment was emitted: the first line after a ``//``
    group, the closing line of a multi-line block, or the line of a
    one-line ``/* */`` comment. ``text`` has surrounding punctuation and
    whitespace removed (``self.strip``).
    """
    if self.args.verbose:
        print("processing file '%s'...\n" % filename)

    # add an empty line to trigger the output of collected oneliner
    # comment group
    with open(filename) as sock:
        lines = sock.readlines() + ["\n"]

    in_comment = False
    buf = []
    for count, i in enumerate(lines, 1):
        if "//" in i and not in_comment:
            # if we find a new //-style comment, then we
            # just append it to a previous one if: there is
            # only whitespace before the // mark that is
            # necessary to make comments longer, giving
            # more reliable output
            if not len(re.sub(r"(.*)//.*", r"\1", i).strip(self.strip)):
                s = re.sub(r".*// ?", "", i).strip(self.strip)
                if len(s):
                    buf.append(s)
            else:
                # otherwise it's an independent //-style comment in the next line
                yield (count, "\n ".join(buf))
                buf = [re.sub(r".*// ?", "", i.strip(self.strip))]
        elif "//" not in i and not in_comment and len(buf) > 0:
            # first normal line after a // block
            yield (count, "\n ".join(buf))
            buf = []
        elif "/*" in i and "*/" not in i and not in_comment:
            # start of a real multiline comment
            in_comment = True
            s = re.sub(r".*/\*+", "", i.strip(self.strip))
            if len(s):
                buf.append(s.strip(self.strip))
        elif in_comment and "*/" not in i:
            # in multiline comment: drop the leading " * " / "|" decoration
            s = re.sub(r"^( |\|)*\*?", "", i)
            if len(s.strip(self.strip)):
                buf.append(s.strip(self.strip))
        elif "*/" in i and in_comment:
            # end of multiline comment
            in_comment = False
            s = re.sub(r"\*+/.*", "", i.strip(self.strip))
            if len(s):
                buf.append(s)
            yield (count, "\n ".join(buf))
            buf = []
        elif "/*" in i and "*/" in i:
            # c-style oneliner comment
            yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
def start_text_cat(self):
    """
    Starts the text_cat language guesser as a child process.

    text_cat and its language models are addressed with paths relative
    to the directory of this script (taken from sys.argv[0]), so the
    working directory is switched there for the launch and restored
    afterwards, even if the launch fails.

    Returns the subprocess.Popen object; its stdin and stdout are pipes
    opened in binary mode.
    """
    cwd = os.getcwd()
    # change to our directory
    os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
    try:
        return subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    finally:
        # never leave the process in the script directory: relative paths
        # given on the command line must keep their meaning
        os.chdir(cwd)
def get_lang(self, s):
    """ the output is 'german' or 'english' or 'german or english'. When
    unsure, just don't warn, there are strings where you just can't
    determine the results reliably, like '#110680#'

    ``s`` is sent as one UTF-8 encoded line to the text_cat process and
    one answer line is read back. The answer is returned as stripped
    ``bytes`` because the pipes of ``self.text_cat`` are binary. """

    # text and line terminator go out in a single write, followed by a
    # flush, so the child sees a complete line before we block on readline
    self.text_cat.stdin.write(s.encode('utf-8') + b"\n")
    self.text_cat.stdin.flush()
    return self.text_cat.stdout.readline().strip()
def is_german(self, s):
    """
    determines if a string is German or not

    Returns True only when the language guesser answers exactly
    b"german"; an ambiguous answer or a string that is too short to
    judge gives False.
    """
    # for short strings we can't do reliable recognition, so skip
    # short strings and less than 4 words
    s = s.replace('\n', ' ')
    if len(s) < 32 or len(s.split()) < 4:
        return False
    return self.get_lang(s) == b"german"
150 def check_file(self, path):
# Reports the German comments of one file. Visible here: the comments come
# from self.get_comments(path), self.is_german() selects the German ones,
# and self.args picks the output mode (line-number summary, full comment
# text, or filenames only).
# NOTE(review): this copy of the block has lost its indentation and several
# statements (loop headers, else branches, the initialisation of TABS,
# path_linenums, numline and padding), so the exact nesting cannot be
# confirmed from here.
152 checks each comment in a file
# Presumably the body of the local helper tab_calc(), which is called below
# to pad the path with tabs; its def line is not part of this copy.
155 START = 40 #Default of 10 tabs
156 if len(path) >= START:
158 diff = START - len(path)
# NOTE(review): `is not 0` tests object identity, not value. It only works
# because CPython caches small ints, and Python 3.8+ reports a SyntaxWarning
# for it. `diff % 4 != 0` is the intended comparison.
159 if diff % 4 is not 0:
# `/` is true division on Python 3, so this value is a float; the call site
# below wraps it in int() before multiplying "\t".
163 return (diff/4)+padding
# Line-number modes (-l / -L): collect the line numbers of all German
# comments first, then decide what to print.
165 if self.args.line_numbers or self.args.line_numbers_pos:
168 for linenum, s in self.get_comments(path):
169 if self.is_german(s):
170 path_linenums.append(linenum)
# `valid` is True when the file has more flagged comments than --threshold.
171 valid = len(path_linenums) > int(self.args.threshold)
172 if self.args.line_numbers:
173 print("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
175 if self.args.line_numbers_pos:
176 print("%s ... %s positives\n" % (path, str(len(path_linenums))))
# Width check: path length plus 4 characters per line number against 75
# columns. When it does not fit, the path is printed on its own line and the
# numbers follow in TABS-prefixed rows.
178 if len(path) + (len(path_linenums)*4) > 75:
179 print("%s:\n" % path)
# Moves the head of path_linenums into the current row; remove() deletes the
# first element equal to path_linenums[0].
185 numline.append(path_linenums[0])
186 path_linenums.remove(path_linenums[0])
190 numline = [str(i) for i in numline]
191 print("%s%s" % (TABS, ",".join(numline)))
# Short case: path, tab padding from tab_calc(), and all numbers on one line.
193 if self.args.line_numbers:
194 path_linenums = [str(i) for i in path_linenums]
195 print("%s:%s%s" % (path, "\t"*int(tab_calc(path)), ",".join(path_linenums)))
# Default mode: one "path:line: comment" line per German comment.
197 elif not self.args.filenames_only:
198 for linenum, s in self.get_comments(path):
199 if self.is_german(s):
200 print("%s:%s: %s" % (path, linenum, s))
# Remaining mode (--filenames-only): the statements that record and print
# the filename are not part of this copy.
203 for linenum, s in self.get_comments(path):
204 if self.is_german(s):
205 # Make sure we print each filename only once
207 # Print the filenames
def first_elem(self, path):
    """
    Returns the root directory in our repo of a given path, so we can check against the whitelist.

    For "sw/source/core/doc.cxx" the result is "sw". A path without any
    directory part gives "", and an absolute path gives its filesystem
    root.
    """
    last_elem = os.path.dirname(path)
    while True:
        next_elem = os.path.split(last_elem)[0]
        # Stop at the top-most component: either nothing is left in front
        # of it, or split() no longer shortens the path (filesystem root,
        # where the parent of "/" is "/" again).
        if not next_elem or next_elem == last_elem:
            return last_elem
        last_elem = next_elem
225 def check_source_files(self, directory):
# Drives the scan: obtains the tracked C/C++/Objective-C++ files from git,
# runs self.check_file() on each and prints how many files were checked.
# NOTE(review): this copy of the block has lost its indentation, most of the
# whitelist entries and several statements (the loop over `lines`, the else
# branches, the initialisation of num_checked), so the exact nesting cannot
# be confirmed from here.
227 checks each _tracked_ file in a directory recursively
230 # top-level project directory -> use whitelist.
# A .git/config below `directory` is taken as the sign that the scan starts
# at the top of the checkout.
232 if os.path.exists(directory + "/.git/config"):
235 # Change into the given dir, so "git ls-tree" does work.
# The file list is produced by a shell pipeline run through os.popen, so git
# and egrep must be on PATH; the pattern keeps .c/.cc/.cpp/.cxx/.h/.hxx/.mm.
238 sock = os.popen(r"git ls-tree -r HEAD --name-only |egrep '\.(c|cc|cpp|cxx|h|hxx|mm)$'")
239 lines = sock.readlines()
242 # Helps to speedup a global scan
# Maps a top-level directory name to a state, as used further down:
# 0 = scan its files, 1 = skip and announce the skip once, 2 = skip already
# announced.
243 directory_whitelist = {
263 "compilerplugins" : 1,
300 "libreofficekit" : 1,
301 "lingucomponent" : 1,
358 "winaccessibility" : 1,
369 print("Scanning all files globally:")
370 elif directory == '.':
371 print("Scanning all files in our current directory:")
373 print("Scanning all files in", directory + ":")
# Per file: the top-level directory of the path selects the whitelist entry.
378 baseDir = self.first_elem(path)
379 # If we have an globalscan use the whitelist.
# A top-level directory without a whitelist entry is reported on stderr.
381 if not baseDir in directory_whitelist:
382 sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
# NOTE(review): `is 0` / `is 1` test object identity, not value. They only
# work because CPython caches small ints, and Python 3.8+ reports a
# SyntaxWarning for them. `== 0` / `== 1` is the intended comparison.
384 elif directory_whitelist[baseDir] is 0:
# path still carries the newline from readlines(), hence the strip().
385 self.check_file(path.strip())
386 num_checked = num_checked + 1
387 elif directory_whitelist[baseDir] is 1:
388 sys.stderr.write("Skipping whitelisted directory %s\n" % baseDir)
# Switch to state 2 so the skip message is written only once per directory.
389 directory_whitelist[baseDir] = 2
# Presumably the branch for a scan that does not use the whitelist: every
# listed file is checked.
391 self.check_file(path.strip())
392 num_checked = num_checked + 1
394 print("Scanned %s files\n" % num_checked)
398 except KeyboardInterrupt:
399 print("Interrupted!")
402 # vim:set shiftwidth=4 softtabstop=4 expandtab: