git-cola-sequence-editor: move the implementation to a module
[git-cola.git] / cola / spellcheck.py
blob c221ac71796d5aca104b29de7a388d2b48923a54
1 from __future__ import division, absolute_import, unicode_literals
2 import collections
3 import os
5 from cola import core
7 __copyright__ = """
8 2012 Peter Norvig (http://norvig.com/spell-correct.html)
9 2013-2018 David Aguilar <davvid@gmail.com>
10 """
# Letters that edits1() substitutes and inserts when generating candidates.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def train(features, model):
    """Count every item of ``features`` in ``model`` and return ``model``.

    ``model`` is updated in place: the entry for each feature is increased
    by one per occurrence.  The mapping must be able to supply a value for
    a feature it has not seen yet (e.g. a ``defaultdict``).
    """
    for feature in features:
        model[feature] = model[feature] + 1
    return model
def edits1(word):
    """Return the set of strings that are one edit away from ``word``.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a letter from ``alphabet``, or
    inserting a letter from ``alphabet`` at any position.
    """
    results = set()
    for index in range(len(word) + 1):
        head, tail = word[:index], word[index:]
        if tail:
            # Drop the first character of the tail.
            results.add(head + tail[1:])
            # Replace the first character of the tail.
            for letter in alphabet:
                results.add(head + letter + tail[1:])
        if len(tail) > 1:
            # Swap the first two characters of the tail.
            results.add(head + tail[1] + tail[0] + tail[2:])
        # Insert a letter at the split point.
        for letter in alphabet:
            results.add(head + letter + tail)
    return results
def known_edits2(word, words):
    """Return the strings two edits away from ``word`` that are in ``words``."""
    matches = set()
    for first_edit in edits1(word):
        for second_edit in edits1(first_edit):
            if second_edit in words:
                matches.add(second_edit)
    return matches
def known(word, words):
    """Return the subset of candidates that are present in ``words``.

    Despite its name, ``word`` is an iterable of candidate strings; each
    candidate is kept only when ``candidate in words`` is true.
    """
    matches = set()
    for candidate in word:
        if candidate in words:
            matches.add(candidate)
    return matches
def suggest(word, words):
    """Return spelling candidates for ``word`` drawn from ``words``.

    The first non-empty result wins, tried in this order: the word itself
    when known, known strings one edit away, known strings two edits away.
    When none of those produce anything, a list holding just ``word`` is
    returned.
    """
    exact = known([word], words)
    if exact:
        return exact

    one_edit = known(edits1(word), words)
    if one_edit:
        return one_edit

    two_edits = known_edits2(word, words)
    if two_edits:
        return two_edits

    return [word]
def correct(word, words):
    """Return the candidate from suggest() that scores highest in ``words``.

    Candidates are ranked by ``words.get(candidate)``.
    """
    score = words.get
    return max(suggest(word, words), key=score)
class NorvigSpellCheck(object):
    """Spell checker driven by word counts trained from dictionary files.

    The dictionary files are not read at construction time; they are loaded
    on the first call to init(), suggest() or check().
    """

    def __init__(
        self,
        words='/usr/share/dict/words',
        cracklib='/usr/share/dict/cracklib-small',
        propernames='/usr/share/dict/propernames',
    ):
        """Record the dictionary file paths; no file is opened here."""
        self.dictwords = words
        self.cracklib = cracklib
        self.propernames = propernames
        # Missing keys default to 1, so a word's count is 2 once trained.
        self.words = collections.defaultdict(lambda: 1)
        self.extra_words = set()
        self.dictionary = None
        self.initialized = False

    def set_dictionary(self, dictionary):
        """Set the path of an additional dictionary file used by read().

        read() is only consumed by the first init() call, so a path set
        after initialization is not loaded.
        """
        self.dictionary = dictionary

    def init(self):
        """Train the word counts on first use; later calls do nothing."""
        if self.initialized:
            return
        self.initialized = True
        train(self.read(), self.words)
        train(self.extra_words, self.words)

    def add_word(self, word):
        """Add ``word`` to the set of extra words accepted by the checker.

        Words added before initialization are trained by init().  Words
        added afterwards are counted immediately, because init() never
        looks at ``extra_words`` again once it has run.
        """
        is_new = word not in self.extra_words
        self.extra_words.add(word)
        if is_new and self.initialized:
            # Same increment train() applies to each extra word.
            self.words[word] += 1

    def suggest(self, word):
        """Return spelling candidates for ``word`` from the trained words."""
        self.init()
        # Resolves to the module-level suggest() function, not this method.
        return suggest(word, self.words)

    def check(self, word):
        """Return True when ``word``, with '.' characters removed, is known."""
        self.init()
        return word.replace('.', '') in self.words

    def read(self):
        """Yield the words found in the available dictionary files.

        The cracklib file is preferred over the plain words file and only
        one of the two is read; each of its words is yielded as-is and
        again in title case.  The propernames file and the dictionary set
        through set_dictionary() are yielded as-is.  Paths that are unset
        or do not exist are skipped.
        """
        # Each entry is (path, title): title requests the title-cased copy.
        paths = []

        if self.cracklib and os.path.exists(self.cracklib):
            paths.append((self.cracklib, True))
        elif self.dictwords and os.path.exists(self.dictwords):
            paths.append((self.dictwords, True))

        if self.propernames and os.path.exists(self.propernames):
            paths.append((self.propernames, False))

        if self.dictionary and os.path.exists(self.dictionary):
            paths.append((self.dictionary, False))

        for (path, title) in paths:
            try:
                with open(path, 'r') as f:
                    for line in f:
                        word = core.decode(line.rstrip())
                        yield word
                        if title:
                            yield word.title()
            except IOError:
                # Best effort: a dictionary that cannot be read is skipped.
                pass