Updated core
[LibreOffice.git] / bin / find-german-comments
blob1cc9d511edc9944145ce45b8364759f49097ec1f
1 #!/usr/bin/env python
2 ########################################################################
4 # Copyright (c) 2010 Jonas Jensen, Miklos Vajna
6 # Permission is hereby granted, free of charge, to any person
7 # obtaining a copy of this software and associated documentation
8 # files (the "Software"), to deal in the Software without
9 # restriction, including without limitation the rights to use,
10 # copy, modify, merge, publish, distribute, sublicense, and/or sell
11 # copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following
13 # conditions:
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 # OTHER DEALINGS IN THE SOFTWARE.
27 ########################################################################
30 import sys, re, subprocess, os, optparse, string
class Parser:
    """
    This parser extracts comments from source files, tries to guess
    their language and then prints out the German ones.

    Constructing an instance parses the command line and immediately
    checks every tracked .cxx/.hxx file below the requested directory.
    """

    # Separator placed between the individual lines of one collected comment.
    _JOINER = "\n "

    def __init__(self):
        """
        Parses the command line options and starts the scan.

        The first positional argument is the root directory; when it is
        missing the current directory (".") is used.
        """
        # characters trimmed from both ends of every extracted comment line
        self.strip = string.punctuation + " \n"
        op = optparse.OptionParser()
        op.set_usage("%prog [options] <rootdir>\n\n" +
            "Searches for german comments in cxx/hxx source files inside a given root\n" +
            "directory recursively.")
        op.add_option("-f", "--filenames-only", action="store_true", dest="filenames_only", default=False,
            help="Only print the filenames of files containing German comments")
        op.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
            help="Turn on verbose mode (print progress to stderr)")
        op.add_option("-l", "--line-numbers", action="store_true", dest="line_numbers", default=False,
            help="Prints the filenames and line numbers only.")
        # type="int" lets optparse reject a non-numeric threshold with a
        # usage error instead of a traceback later on
        op.add_option("-t", "--threshold", action="store", type="int", dest="THRESHOLD", default=0,
            help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
        self.options, args = op.parse_args()
        if args:
            dir = args[0]
        else:
            dir = "."
        self.check_source_files(dir)

    def get_comments(self, filename):
        """
        Extracts the source code comments.

        Yields (line number, comment text) tuples. Consecutive //-style
        comment lines are merged into one comment, the lines of a /* */
        block likewise. The line number is the line that was being read
        when the comment was emitted: the first line after a // block,
        the closing line of a multiline comment, or the line of a
        one-line /* */ comment.
        """
        if self.options.verbose:
            sys.stderr.write("processing file '%s'...\n" % filename)
        with open(filename) as source:
            # add an empty line to trigger the output of collected oneliner
            # comment group
            lines = source.readlines() + ["\n"]

        in_comment = False
        buf = []
        for count, line in enumerate(lines, 1):
            if "//" in line and not in_comment:
                # if we find a new //-style comment, then we
                # just append it to a previous one if: there is
                # only whitespace before the // mark that is
                # necessary to make comments longer, giving
                # more reliable output
                if not re.sub(r"(.*)//.*", r"\1", line).strip(self.strip):
                    s = re.sub(r".*// ?", "", line).strip(self.strip)
                    if s:
                        buf.append(s)
                else:
                    # otherwise it's an independent //-style comment in the next line
                    yield (count, self._JOINER.join(buf))
                    buf = [re.sub(r".*// ?", "", line.strip(self.strip))]
                continue

            if buf and not in_comment:
                # first line after a // block: emit the block, then keep
                # looking at this very line, it may itself start (or be)
                # a /* */ comment that must not get lost
                yield (count, self._JOINER.join(buf))
                buf = []

            if in_comment:
                if "*/" in line:
                    # end of multiline comment
                    in_comment = False
                    s = re.sub(r"\*+/.*", "", line.strip(self.strip))
                    if s:
                        buf.append(s)
                    yield (count, self._JOINER.join(buf))
                    buf = []
                else:
                    # in multiline comment: drop the leading " * " / "|" decoration
                    s = re.sub(r"^( |\|)*\*?", "", line).strip(self.strip)
                    if s:
                        buf.append(s)
            elif "/*" in line:
                if "*/" in line:
                    # c-style oneliner comment
                    yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", line).strip(self.strip))
                else:
                    # start of a real multiline comment
                    in_comment = True
                    s = re.sub(r".*/\*+", "", line.strip(self.strip))
                    if s:
                        buf.append(s.strip(self.strip))

    def get_lang(self, s):
        """ the output is 'german' or 'english' or 'german or english'. when
        unsure, just don't warn, there are strings where you just can't
        determine the results reliably, like '#110680#' """
        cwd = os.getcwd()
        # text_cat and its language models are addressed relative to the
        # directory this script lives in, so run it from there
        os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
        try:
            # universal_newlines gives text-mode pipes, so a str can be
            # written on both Python 2 and Python 3
            proc = subprocess.Popen(["text_cat/text_cat", "-d", "text_cat/LM"],
                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                    universal_newlines=True)
            # communicate() also waits for the child, so it is reaped
            out, _ = proc.communicate(s)
        finally:
            # always restore the caller's working directory
            os.chdir(cwd)
        return out.strip()

    def is_german(self, s):
        """
        determines if a string is german or not
        """
        # for short strings we can't do reliable recognition, so skip
        # short strings and less than 4 words
        s = s.replace('\n', ' ')
        if len(s) < 32 or len(s.split()) < 4:
            return False
        return "german" == self.get_lang(s)

    @staticmethod
    def _tab_count(text):
        """
        Returns the number of tabs that pad `text` up to column 40,
        counting 4 columns per tab (10 tabs for an empty string). Text of
        40 or more characters gets a single separating tab.
        """
        START = 40 #Default of 10 tabs
        if len(text) >= START:
            return 1
        # round up so that a partial tab stop still gets its own tab
        return (START - len(text) + 3) // 4

    def _print_line_numbers(self, path, linenums):
        """
        Prints the line numbers of the flagged comments of one file.

        A summary always goes to stderr; the list itself is printed only
        when there are more hits than the threshold option.
        """
        valid = len(linenums) > int(self.options.THRESHOLD)
        sys.stderr.write("%s ... %s positives -- %s\n" % (path, str(len(linenums)), str(valid)))
        if not valid:
            return
        numbers = [str(n) for n in linenums]
        if len(path) + (len(numbers) * 4) > 75:
            # too long for a single line: path first, then rows of at
            # most ten line numbers each
            print("%s:\n" % path)
            for start in range(0, len(numbers), 10):
                print("%s%s" % ("\t" * 10, ",".join(numbers[start:start + 10])))
        else:
            print("%s:%s%s" % (path, "\t" * self._tab_count(path), ",".join(numbers)))

    def check_file(self, path):
        """
        checks each comment in a file

        Depending on the options this prints the line numbers of the
        German comments, the comments themselves, or just the file name.
        """
        if self.options.line_numbers:
            flagged = [linenum for linenum, s in self.get_comments(path)
                       if self.is_german(s)]
            self._print_line_numbers(path, flagged)
        elif not self.options.filenames_only:
            for linenum, s in self.get_comments(path):
                if self.is_german(s):
                    print("%s:%s: %s" % (path, linenum, s))
        else:
            # the filename is printed only once, so stop at the first hit
            if any(self.is_german(s) for _, s in self.get_comments(path)):
                print(path)

    def check_source_files(self, dir):
        """
        checks each _tracked_ file in a directory recursively
        """
        try:
            # argument list instead of a shell string: directory names with
            # quotes or other shell metacharacters are passed on verbatim
            proc = subprocess.Popen(["git", "ls-files", "--", dir],
                                    stdout=subprocess.PIPE, universal_newlines=True)
        except OSError as err:
            sys.stderr.write("could not run git: %s\n" % err)
            return
        listing, _ = proc.communicate()
        wanted = re.compile(r"\.(c|h)xx$")
        for line in listing.splitlines():
            if wanted.search(line):
                self.check_file(line.strip())
# Run the tool only when executed as a script; the print call below uses
# the single-argument form so it behaves the same on Python 2 and Python 3.
if __name__ == "__main__":
    try:
        Parser()
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to stop a long scan: exit cleanly
        print("Interrupted!")
        sys.exit(0)
216 # vim:set shiftwidth=4 softtabstop=4 expandtab: