textcheck.py

   1 #!/usr/bin/env python3
   2 # encoding: utf-8
   3
   4 """Check how much a given text diverges from a 1gram, 2gram and 3gram frequency.
   5
   6 usage: ./textcheck.py <textfile to check> [--best-lines]
   7
    8 --best-lines: check each line and return the 10 most similar lines.
   9
  10 idea: allow selecting different 1gram, 2gram and 3gram files.
  11
  12 """
  13
  14 from sys import argv
  15 if "--help" in argv:
  16     print(__doc__)
  17     exit()
  18
  19 from math import log
  20
  21 def read_file(path):
  22     """Get the data from a file.
  23
  24     >>> read_file("testfile")[:2]
  25     'ui'
  26     """
  27     with open(path, "r") as f: #, encoding="UTF-8") as f:
  28         data = f.read()
  29     return data
  30
  31 def read_file_lines(path):
  32     """Get the data from a file.
  33
  34     >>> read_file("testfile")[:2]
  35     'ui'
  36     """
  37     with open(path) as f: #, encoding="UTF-8") as f:
  38         data = f.readlines()
  39     return data
  40
  41 def letters_in_file(data):
  42     """Sort the repeats in a file by the number of occurrances.
  43
  44     >>> data = read_file("testfile")
  45     >>> letters_in_file(data)[:3]
  46     [(5, 'a'), (4, '\\n'), (2, '⇧')]
  47     """
  48     letters = {}
  49     for letter in data:
  50         if letter in letters:
  51             letters[letter] += 1
  52         else:
  53             letters[letter] = 1
  54     return letters
  55
  56 def letters_in_file_precalculated(data):
  57     """Get the repeats from a precalculated file.
  58
  59     >>> data = read_file("1gramme.txt")
  60     >>> letters_in_file_precalculated(data)[:2]
  61     [(44034982, 'e'), (27012723, 'n')]
  62     """
  63     letters = [line.lstrip().split(" ", 1) for line in data.splitlines() if line.split()[1:]]
  64     letters = [(int(num), let) for num, let in letters]
  65     lett = {l: num for num, l in letters}
  66     return lett
  67
  68 def repeats_in_file(data):
  69     """Sort the repeats in a file by the number of occurrances.
  70
  71     >>> data = read_file("testfile")
  72     >>> repeats_in_file(data)[:3]
  73     [(2, 'aa'), (2, 'a\\n'), (1, '⇧a')]
  74     """
  75     repeats = {}
  76     for i in range(len(data)-1):
  77         rep = data[i] + data[i+1]
  78         if rep in repeats:
  79             repeats[rep] += 1
  80         else:
  81             repeats[rep] = 1
  82     return repeats
  83
  84 def repeats_in_file_precalculated(data):
  85     """Get the repeats from a precalculated file.
  86
  87     >>> data = read_file("2gramme.txt")
  88     >>> repeats_in_file_precalculated(data)[:2]
  89     [(10162743, 'en'), (10028050, 'er')]
  90     """
  91     reps = [line.lstrip().split(" ", 1) for line in data.splitlines() if line.split()[1:]]
  92     reps = [(int(num), r) for num, r in reps if r[1:]]
  93     r = {r: num for num, r in reps}
  94     return r
  95
  96 def trigrams_in_file(data):
  97     """Sort the trigrams in a file by the number of occurrances.
  98
  99     >>> data = read_file("testfile")
 100     >>> trigrams_in_file(data)[:12]
 101     [(1, '⇧aa'), (1, '⇧aa'), (1, '⇧aa'), (1, '⇧aa'), (1, '⇗aa'), (1, '⇗aa'), (1, '⇗aa'), (1, '⇗aa'), (1, 'uia'), (1, 't⇧a'), (1, 't⇧a'), (1, 't⇗a')]
 102     """
 103     trigs = {}
 104     for i in range(len(data)-2):
 105         trig = data[i] + data[i+1] + data[i+2]
 106         if trig in trigs:
 107             trigs[trig] += 1
 108         else:
 109             trigs[trig] = 1
 110     return trigs
 111
 112 def trigrams_in_file_precalculated(data):
 113     """Get the repeats from a precalculated file.
 114
 115     CAREFUL: SLOW!
 116
 117     >>> data = read_file("3gramme.txt")
 118     >>> trigrams_in_file_precalculated(data)[:6]
 119     [(5679632, 'en '), (4417443, 'er '), (2891983, ' de'), (2303238, 'der'), (2273056, 'ie '), (2039537, 'ich')]
 120     """
 121     trigs = [line.lstrip().split(" ", 1) for line in data.splitlines() if line.split()[1:]]
 122     trigs = [(int(num), r) for num, r in trigs if r[1:]]
 123     t = {t: num for num, t in trigs}
 124     return t
 125
 126 def normalize_occurrence_dict(d):
 127     """normalize a dict with keys and assorted occurrence numbers.
 128
 129     ⇒ sum([d[t] for t in d]) == 1.0
 130     """
 131     _sum = sum([d[t] for t in d])
 132     d = {t: d[t]/_sum for t in d}
 133     return d
 134
 135 def occurrence_dict_difference(d1, d2):
 136     """Get the difference between two occurrence dicts.
 137
 138     TODO: Evaluate which difference calculation would be best.
 139
 140     @return: dict with all keys (in d1 or in d2) and the difference as value."""
 141     diff1 = {}
 142     # check d1
 143     for t in d1:
 144         if t in d2:
 145             diff1[t] = abs(d1[t] - d2[t])
 146         else:
 147             diff1[t] = abs(d1[t])
 148     # add all from d2 which are not in d1
 149     for t in d2:
 150         if not t in diff1:
 151             diff1[t] = abs(d2[t])
 152     return diff1
 153
 154 def check_dissimilarity(txt_1grams, txt_2grams, txt_3grams, ref_1grams, ref_2grams, ref_3grams):
 155     """check the similarity of the txt and the ref (-erence)."""
 156
 157     # normalize all dicts
 158     txt_1grams = normalize_occurrence_dict(txt_1grams)
 159     txt_2grams = normalize_occurrence_dict(txt_2grams)
 160     txt_3grams = normalize_occurrence_dict(txt_3grams)
 161     ref_1grams = normalize_occurrence_dict(ref_1grams)
 162     ref_2grams = normalize_occurrence_dict(ref_2grams)
 163     ref_3grams = normalize_occurrence_dict(ref_3grams)
 164
 165     d1 = occurrence_dict_difference(txt_1grams, ref_1grams)
 166     d2 = occurrence_dict_difference(txt_2grams, ref_2grams)
 167     d3 = occurrence_dict_difference(txt_3grams, ref_3grams)
 168
 169     return 0.5*sum(d1.values()), 0.5*sum(d2.values()), 0.5*sum(d3.values())
 170
 171 def _help():
 172     return __doc__
 173
 174 def cost(text, diff123):
 175     """Cost for a text with the three differences (1gram, 2gram, 3gram)."""
 176     #: prefer shorter text: 3% * log2. Double length means 1% more cost.
 177     length_factor = 100 + 3*log(len(text), 2)
 178     return sum(diff123) * length_factor
 179
 180 def shorten(text, max_len=270):
 181     """shorten a line, breaking at a sentence-end, if possible, and otherwise at word-end."""
 182     end = ". "
 183     space = " "
 184     shorted = text[:max_len]
 185     if end in text[:max_len]:
 186         shidx = text[:max_len].rindex(end)
 187         shorted = text[:shidx+1]
 188     elif space in text[:max_len]:
 189         shidx = text[:max_len].rindex(space)
 190         shorted = text[:shidx]
 191     if len(shorted) >= max_len/2:
 192         return shorted
 193     return text[:max_len]
 194
 195
 196 def run(textfile, best_lines=False, max_len=270):
 197     """test the file."""
 198     # reference data
 199     data = read_file("1gramme.txt")
 200     reference1grams = letters_in_file_precalculated(data)
 201     data = read_file("2gramme.txt")
 202     reference2grams = repeats_in_file_precalculated(data)
 203     data = read_file("3gramme.txt")
 204     reference3grams = trigrams_in_file_precalculated(data)
 205
 206     if best_lines:
 207         data = read_file_lines(textfile)
 208         best_10 = [] # [(sum, (1, 2, 3), text), …]
 209         while data[1:]:
 210             l = shorten(data[1], max_len=max_len)
 211             data = data[1:]
 212             if not l[2:]:
 213                 continue
 214             text1grams = letters_in_file(l)
 215             text2grams = repeats_in_file(l)
 216             text3grams = trigrams_in_file(l)
 217             diss = check_dissimilarity(text1grams, text2grams, text3grams, reference1grams, reference2grams, reference3grams)
 218             if not best_10[9:] or cost(l, diss) < best_10[-1][0]:
 219                 best_10.append((cost(l, diss), diss, l))
 220                 best_10.sort()
 221                 best_10 = best_10[:10]
 222                 print("\n### new top 10:", cost(l, diss), diss, l, "\n")
 223             print(cost(l, diss), diss, l)
 224         print("\n### best 10 lines ###\n")
 225         best_10.reverse()
 226         for s, x, t in best_10:
 227             print("### best:", s, x, t)
 228     else:
 229         data = read_file(textfile)
 230         text1grams = letters_in_file(data)
 231         text2grams = repeats_in_file(data)
 232         text3grams = trigrams_in_file(data)
 233         diss = check_dissimilarity(text1grams, text2grams, text3grams, reference1grams, reference2grams, reference3grams)
 234         print(cost(data, diss), diss)
 235
 236
 237 ### Self-Test
 238
 239 if __name__ == "__main__":
 240     from sys import argv
 241     if "--test" in argv:
 242         from doctest import testmod
 243         testmod()
 244         exit()
 245
 246     if not argv[1:]:
 247         print(_help())
 248         exit()
 249
 250     if "--best-lines" in argv:
 251         LINES = True
 252     else:
 253         LINES = False
 254
 255     # text to check
 256     textfile = argv[1]
 257
 258     run(textfile, best_lines=LINES)