WIP
[evolve-layout.git] / textcheck.py
blob4fa9f4b4a9b760e8b51adbbaeba1f902defab387
1 #!/usr/bin/env python3
2 # encoding: utf-8
4 """Check how much a given text diverges from a 1gram, 2gram and 3gram frequency.
6 usage: ./textcheck.py <textfile to check> [--best-lines]
8 --best-lines: check each line and return the 10 most similar lines.
10 idea: allow selecting different 1gram, 2gram and 3gram files.
12 """
14 from sys import argv
# Print the usage text and stop before any work is done.
if "--help" in argv:
    print(__doc__)
    # exit() is only injected by the site module for interactive use and is
    # not guaranteed to exist; raising SystemExit works everywhere.
    raise SystemExit(0)
19 from math import log
def read_file(path):
    """Return the complete content of a text file as one string.

    The file is always decoded as UTF-8 instead of the locale's default
    encoding, because the checked data contains non-ASCII characters
    (for example '⇧').

    @param path: path of the file to read.
    @return: the decoded file content.

    >>> read_file("testfile")[:2]
    'ui'
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()
def read_file_lines(path):
    """Return the content of a text file as a list of lines.

    Every line keeps its trailing newline. The file is always decoded as
    UTF-8 instead of the locale's default encoding.

    @param path: path of the file to read.
    @return: list of the lines in the file.

    >>> read_file_lines("testfile")[0][:2]
    'ui'
    """
    with open(path, encoding="utf-8") as f:
        return f.readlines()
def letters_in_file(data):
    """Count how often every single character occurs in data.

    @param data: the text to analyze.
    @return: dict which maps each character to its number of occurrences.

    >>> sorted(letters_in_file("abaca").items())
    [('a', 3), ('b', 1), ('c', 1)]
    """
    letters = {}
    for letter in data:
        letters[letter] = letters.get(letter, 0) + 1
    return letters
def letters_in_file_precalculated(data):
    """Parse precalculated 1gram counts.

    Every usable line has the form "<count> <letter>", optionally with
    leading whitespace. Lines with fewer than two whitespace-separated
    fields are skipped.

    @param data: content of the 1gram file as one string.
    @return: dict which maps each letter to its count.

    >>> letters_in_file_precalculated("44034982 e\\n27012723 n") == {'e': 44034982, 'n': 27012723}
    True
    """
    lett = {}
    for line in data.splitlines():
        if not line.split()[1:]:
            continue
        # split only once, so the n-gram keeps any space it contains
        num, let = line.lstrip().split(" ", 1)
        lett[let] = int(num)
    return lett
def repeats_in_file(data):
    """Count how often every pair of adjacent characters (2gram) occurs in data.

    @param data: the text to analyze.
    @return: dict which maps each 2gram to its number of occurrences.

    >>> sorted(repeats_in_file("aaab").items())
    [('aa', 2), ('ab', 1)]
    """
    repeats = {}
    # pair every character with its successor
    for first, second in zip(data, data[1:]):
        rep = first + second
        repeats[rep] = repeats.get(rep, 0) + 1
    return repeats
def repeats_in_file_precalculated(data):
    """Parse precalculated 2gram counts.

    Every usable line has the form "<count> <2gram>", optionally with
    leading whitespace. Lines with fewer than two whitespace-separated
    fields and entries shorter than two characters are skipped.

    @param data: content of the 2gram file as one string.
    @return: dict which maps each 2gram to its count.

    >>> repeats_in_file_precalculated("10162743 en\\n10028050 er") == {'en': 10162743, 'er': 10028050}
    True
    """
    reps = {}
    for line in data.splitlines():
        if not line.split()[1:]:
            continue
        # split only once, so the n-gram keeps any space it contains
        num, rep = line.lstrip().split(" ", 1)
        if rep[1:]:
            reps[rep] = int(num)
    return reps
def trigrams_in_file(data):
    """Count how often every run of three adjacent characters (3gram) occurs in data.

    @param data: the text to analyze.
    @return: dict which maps each 3gram to its number of occurrences.

    >>> sorted(trigrams_in_file("abcabc").items())
    [('abc', 2), ('bca', 1), ('cab', 1)]
    """
    trigs = {}
    # combine every character with its two successors
    for first, second, third in zip(data, data[1:], data[2:]):
        trig = first + second + third
        trigs[trig] = trigs.get(trig, 0) + 1
    return trigs
def trigrams_in_file_precalculated(data):
    """Parse precalculated 3gram counts.

    CAREFUL: SLOW!

    Every usable line has the form "<count> <3gram>", optionally with
    leading whitespace. Lines with fewer than two whitespace-separated
    fields and entries shorter than two characters are skipped.

    @param data: content of the 3gram file as one string.
    @return: dict which maps each 3gram to its count.

    >>> trigrams_in_file_precalculated("5679632 en \\n2891983  de") == {'en ': 5679632, ' de': 2891983}
    True
    """
    trigs = {}
    for line in data.splitlines():
        if not line.split()[1:]:
            continue
        # split only once, so the n-gram keeps any space it contains
        num, trig = line.lstrip().split(" ", 1)
        # NOTE(review): this only requires two characters, exactly like the
        # 2gram parser - confirm whether three were intended here.
        if trig[1:]:
            trigs[trig] = int(num)
    return trigs
def normalize_occurrence_dict(d):
    """Turn absolute occurrence numbers into relative frequencies.

    ⇒ the values of the returned dict add up to 1.0

    @param d: dict which maps keys to occurrence numbers.
    @return: new dict with the same keys; every value is divided by the sum of all values.
    """
    total = sum(d.values())
    return {key: count/total for key, count in d.items()}
def occurrence_dict_difference(d1, d2):
    """Compute the absolute per-key difference of two occurrence dicts.

    A key which is missing in one of the dicts counts as 0 there.

    TODO: Evaluate which difference calculation would be best.

    @return: dict with all keys (in d1 or in d2) and the difference as value."""
    # every key of d1, compared against d2 where d2 knows the key
    diff = {
        key: abs(value - d2[key]) if key in d2 else abs(value)
        for key, value in d1.items()
    }
    # the keys which only exist in d2
    for key, value in d2.items():
        if key not in diff:
            diff[key] = abs(value)
    return diff
def check_dissimilarity(txt_1grams, txt_2grams, txt_3grams, ref_1grams, ref_2grams, ref_3grams):
    """Measure how far the n-gram frequencies of the txt are from the ref (-erence).

    All six occurrence dicts are normalized first. For each n-gram size the
    result is half the sum of the absolute frequency differences.

    @return: tuple (1gram dissimilarity, 2gram dissimilarity, 3gram dissimilarity)."""
    # normalize all dicts
    txt_normalized = [normalize_occurrence_dict(grams)
                      for grams in (txt_1grams, txt_2grams, txt_3grams)]
    ref_normalized = [normalize_occurrence_dict(grams)
                      for grams in (ref_1grams, ref_2grams, ref_3grams)]

    return tuple(
        0.5*sum(occurrence_dict_difference(txt, ref).values())
        for txt, ref in zip(txt_normalized, ref_normalized))
def _help():
    """Return the module docstring, which doubles as the usage text."""
    return __doc__
def cost(text, diff123):
    """Cost for a text with the three differences (1gram, 2gram, 3gram).

    @param text: the checked text; only its length is used.
    @param diff123: the three dissimilarity values of the text.
    @return: the sum of the differences, scaled by a factor which grows with the length of the text.
    """
    #: prefer shorter text: every doubling of the length adds 3 to the base factor of 100.
    # max(…, 1) keeps the logarithm defined for an empty text,
    # which then gets the same factor as a single character.
    length_factor = 100 + 3*log(max(len(text), 1), 2)
    return sum(diff123) * length_factor
def shorten(text, max_len=270):
    """shorten a line, breaking at a sentence-end, if possible, and otherwise at word-end.

    A break is only accepted if at least half of max_len remains. If the
    last sentence-end is too early, the last word-end is tried, and only
    if that is too early as well, the text is cut hard at max_len.

    @param text: the line to shorten.
    @param max_len: maximum length of the result.
    @return: the shortened text, at most max_len characters long.
    """
    end = ". "
    space = " "
    head = text[:max_len]
    min_len = max_len/2

    # prefer the last complete sentence; keep the period, drop the space
    shidx = head.rfind(end)
    if shidx != -1 and shidx + 1 >= min_len:
        return text[:shidx+1]

    # otherwise break before the last space
    shidx = head.rfind(space)
    if shidx != -1 and shidx >= min_len:
        return text[:shidx]

    return head
def run(textfile, best_lines=False, max_len=270):
    """Check a text file against the reference n-gram frequencies and print the result.

    The reference data is read from "1gramme.txt", "2gramme.txt" and
    "3gramme.txt" in the current working directory.

    @param textfile: path of the text file to check.
    @param best_lines: if True, check every line on its own and print the
        10 lines with the lowest cost; otherwise check the file as a whole.
    @param max_len: maximum length to which each line is shortened before
        it is checked (only used with best_lines).
    """
    # reference data
    reference1grams = letters_in_file_precalculated(read_file("1gramme.txt"))
    reference2grams = repeats_in_file_precalculated(read_file("2gramme.txt"))
    reference3grams = trigrams_in_file_precalculated(read_file("3gramme.txt"))

    if not best_lines:
        data = read_file(textfile)
        diss = check_dissimilarity(
            letters_in_file(data), repeats_in_file(data), trigrams_in_file(data),
            reference1grams, reference2grams, reference3grams)
        print(cost(data, diss), diss)
        return

    best_10 = [] # [(cost, (1, 2, 3), text), …]
    # Check every line, including the first one, which the previous
    # index-based loop never looked at.
    for line in read_file_lines(textfile):
        text = shorten(line, max_len=max_len)
        # a line needs at least 3 characters to contain a 3gram
        if not text[2:]:
            continue
        diss = check_dissimilarity(
            letters_in_file(text), repeats_in_file(text), trigrams_in_file(text),
            reference1grams, reference2grams, reference3grams)
        line_cost = cost(text, diss)
        # keep the line while the top 10 is not full or if it beats the worst entry
        if not best_10[9:] or line_cost < best_10[-1][0]:
            best_10.append((line_cost, diss, text))
            best_10.sort()
            best_10 = best_10[:10]
            print("\n### new top 10:", line_cost, diss, text, "\n")
        # NOTE(review): assumed to be the progress output for every checked
        # line, not only for new top 10 entries - confirm.
        print(line_cost, diss, text)
    print("\n### best 10 lines ###\n")
    # print the worst of the top 10 first, so the best line ends up last
    best_10.reverse()
    for s, x, t in best_10:
        print("### best:", s, x, t)
237 ### Self-Test
if __name__ == "__main__":
    import sys

    if "--test" in sys.argv:
        from doctest import testmod
        testmod()
        # sys.exit() instead of the exit() helper, which only the site module provides
        sys.exit()

    # Everything which is not a known option is a positional argument, so
    # the text file is also found when an option is given before it.
    KNOWN_OPTIONS = ("--best-lines", "--test", "--help")
    positional = [arg for arg in sys.argv[1:] if arg not in KNOWN_OPTIONS]
    if not positional:
        print(_help())
        sys.exit()

    LINES = "--best-lines" in sys.argv

    # text to check
    textfile = positional[0]

    run(textfile, best_lines=LINES)