dag: use argparse.REMAINDER so that double-dash -- is retained
[git-cola.git] / cola / spellcheck.py
blob88e29f6f15c7399c720dd2f717983b7a2c56dc5b
1 import codecs
2 import collections
3 import os
5 from . import resources
7 __copyright__ = """
8 2012 Peter Norvig (http://norvig.com/spell-correct.html)
9 2013-2018 David Aguilar <davvid@gmail.com>
10 """
12 ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
15 def train(features, model):
16 for f in features:
17 model[f] += 1
18 return model
21 def edits1(word):
22 splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
23 deletes = [a + b[1:] for a, b in splits if b]
24 transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
25 replaces = [a + c + b[1:] for a, b in splits for c in ALPHABET if b]
26 inserts = [a + c + b for a, b in splits for c in ALPHABET]
27 return set(deletes + transposes + replaces + inserts)
30 def known_edits2(word, words):
31 return {e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in words}
34 def known(word, words):
35 return {w for w in word if w in words}
38 def suggest(word, words):
39 candidates = (
40 known([word], words)
41 or known(edits1(word), words)
42 or known_edits2(word, words)
43 or [word]
45 return candidates
48 def correct(word, words):
49 candidates = suggest(word, words)
50 return max(candidates, key=words.get)
53 class NorvigSpellCheck:
54 def __init__(
55 self,
56 words='dict/words',
57 propernames='dict/propernames',
59 data_dirs = resources.xdg_data_dirs()
60 self.dictwords = resources.find_first(words, data_dirs)
61 self.propernames = resources.find_first(propernames, data_dirs)
62 self.words = collections.defaultdict(lambda: 1)
63 self.extra_words = set()
64 self.dictionary = None
65 self.initialized = False
67 def set_dictionary(self, dictionary):
68 self.dictionary = dictionary
70 def init(self):
71 if self.initialized:
72 return
73 self.initialized = True
74 train(self.read(), self.words)
75 train(self.extra_words, self.words)
77 def add_word(self, word):
78 self.extra_words.add(word)
80 def suggest(self, word):
81 self.init()
82 return suggest(word, self.words)
84 def check(self, word):
85 self.init()
86 return word.replace('.', '') in self.words
88 def read(self):
89 """Read dictionary words"""
90 paths = []
92 words = self.dictwords
93 propernames = self.propernames
94 cfg_dictionary = self.dictionary
96 if words and os.path.exists(words):
97 paths.append((words, True))
99 if propernames and os.path.exists(propernames):
100 paths.append((propernames, False))
102 if cfg_dictionary and os.path.exists(cfg_dictionary):
103 paths.append((cfg_dictionary, False))
105 for path, title in paths:
106 try:
107 with codecs.open(
108 path, 'r', encoding='utf-8', errors='ignore'
109 ) as words_file:
110 for line in words_file:
111 word = line.rstrip()
112 yield word
113 if title:
114 yield word.title()
115 except OSError:
116 pass