2 ########################################################################
4 # Copyright (c) 2010 Jonas Jensen, Miklos Vajna
6 # Permission is hereby granted, free of charge, to any person
7 # obtaining a copy of this software and associated documentation
8 # files (the "Software"), to deal in the Software without
9 # restriction, including without limitation the rights to use,
10 # copy, modify, merge, publish, distribute, sublicense, and/or sell
11 # copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 # OTHER DEALINGS IN THE SOFTWARE.
27 ########################################################################
30 import sys
, re
, subprocess
, os
, optparse
, string
34 This parser extracts comments from source files, tries to guess
35 their language and then prints out the German ones.
38 self
.strip
= string
.punctuation
+ " \n"
39 op
= optparse
.OptionParser()
40 op
.set_usage("%prog [options] <rootdir>\n\n" +
41 "Searches for german comments in cxx/hxx source files inside a given root\n" +
42 "directory recursively.")
43 op
.add_option("-f", "--filenames-only", action
="store_true", dest
="filenames_only", default
=False,
44 help="Only print the filenames of files containing German comments")
45 op
.add_option("-v", "--verbose", action
="store_true", dest
="verbose", default
=False,
46 help="Turn on verbose mode (print progress to stderr)")
47 op
.add_option("-l", "--line-numbers", action
="store_true", dest
="line_numbers", default
=False,
48 help="Prints the filenames and line numbers only.")
49 op
.add_option("-t", "--threshold", action
="store", dest
="THRESHOLD", default
=0,
50 help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
51 self
.options
, args
= op
.parse_args()
56 self
.check_source_files(dir)
58 def get_comments(self
, filename
):
60 Extracts the source code comments.
63 if self
.options
.verbose
:
64 sys
.stderr
.write("processing file '%s'...\n" % filename
)
66 # add an empty line to trigger the output of collected oneliner
68 lines
= sock
.readlines() + ["\n"]
75 if "//" in i
and not in_comment
:
76 # if we find a new //-style comment, then we
77 # just append it to the previous one if there is
78 # only whitespace before the // mark; merging is
79 # necessary to make comments longer, giving
80 # more reliable output
81 if not len(re
.sub("(.*)//.*", r
"\1", i
).strip(self
.strip
)):
82 s
= re
.sub(".*// ?", "", i
).strip(self
.strip
)
86 # otherwise it's an independent //-style comment in the next line
87 yield (count
, "\n ".join(buf
))
88 buf
= [re
.sub(".*// ?", "", i
.strip(self
.strip
))]
89 elif "//" not in i
and not in_comment
and len(buf
) > 0:
90 # first normal line after a // block
91 yield (count
, "\n ".join(buf
))
93 elif "/*" in i
and "*/" not in i
and not in_comment
:
94 # start of a real multiline comment
97 s
= re
.sub(".*/\*+", "", i
.strip(self
.strip
))
99 buf
.append(s
.strip(self
.strip
))
100 elif in_comment
and not "*/" in i
:
101 # in multiline comment
102 s
= re
.sub("^( |\|)*\*?", "", i
)
103 if len(s
.strip(self
.strip
)):
104 buf
.append(s
.strip(self
.strip
))
105 elif "*/" in i
and in_comment
:
106 # end of multiline comment
108 s
= re
.sub(r
"\*+/.*", "", i
.strip(self
.strip
))
111 yield (count
, "\n ".join(buf
))
113 elif "/*" in i
and "*/" in i
:
114 # c-style oneliner comment
115 yield (count
, re
.sub(".*/\*(.*)\*/.*", r
"\1", i
).strip(self
.strip
))
118 def get_lang(self
, s
):
119 """ the output is 'german' or 'english' or 'german or english'. when
120 unsure, just don't warn, there are strings where you just can't
121 determine the results reliably, like '#110680#' """
123 # change to our directory
124 os
.chdir(os
.path
.split(os
.path
.abspath(sys
.argv
[0]))[0])
125 sock
= subprocess
.Popen(["text_cat/text_cat", "-d", "text_cat/LM"], stdin
=subprocess
.PIPE
, stdout
=subprocess
.PIPE
)
128 lang
= sock
.stdout
.read().strip()
133 def is_german(self
, s
):
135 determines if a string is german or not
137 # for short strings we can't do reliable recognition, so skip
138 # strings shorter than 32 characters or with fewer than 4 words
139 s
= s
.replace('\n', ' ')
140 if len(s
) < 32 or len(s
.split()) < 4:
142 return "german" == self
.get_lang(s
)
144 def check_file(self
, path
):
146 checks each comment in a file
148 def tab_calc (string
):
149 START
= 40 #Default of 10 tabs
150 if len(string
) >= START
:
152 diff
= START
- len(string
)
153 if diff
% 4 is not 0:
157 return (diff
/4)+padding
159 if self
.options
.line_numbers
:
162 for linenum
, s
in self
.get_comments(path
):
163 if self
.is_german(s
):
164 path_linenums
.append(linenum
)
165 valid
= len(path_linenums
) > int(self
.options
.THRESHOLD
)
166 sys
.stderr
.write("%s ... %s positives -- %s\n" % (path
, str(len(path_linenums
)), str(valid
)))
168 if len(path
) + (len(path_linenums
)*4) > 75:
170 while(path_linenums
):
175 numline
.append(path_linenums
[0])
176 path_linenums
.remove(path_linenums
[0])
180 numline
= [str(i
) for i
in numline
]
181 print "%s%s" %(TABS
, ",".join(numline
))
183 path_linenums
= [str(i
) for i
in path_linenums
]
184 print "%s:%s%s" % (path
,"\t"*tab_calc(path
),",".join(path_linenums
))
186 elif not self
.options
.filenames_only
:
187 for linenum
, s
in self
.get_comments(path
):
188 if self
.is_german(s
):
189 print "%s:%s: %s" % (path
, linenum
, s
)
192 for linenum
, s
in self
.get_comments(path
):
193 if self
.is_german(s
):
194 # Make sure we print each filename only once
196 # Print the filenames
200 def check_source_files(self
, dir):
202 checks each _tracked_ file in a directory recursively
204 sock
= os
.popen(r
"git ls-files '%s' |egrep '\.(c|h)xx$'" % dir)
205 lines
= sock
.readlines()
208 self
.check_file(path
.strip())
212 except KeyboardInterrupt:
216 # vim:set shiftwidth=4 softtabstop=4 expandtab: