2to3 (compiles, not tested)
[tag_parser.git] / src / mjacob / algorithms / generate_random.py
blob4c55b8966035339315e47143fc40ba112e3d522a
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 14, 2011
5 helper methods for generating random sentences from a context free grammar
7 this is used both for performance testing and for generating content which
8 can be used to check the accuracy of parses against existing models.
10 @author: mjacob
11 '''
12 import random
13 from nltk.grammar import Nonterminal
14 from nltk.parse.earleychart import EarleyChartParser
def generate_random_sentence(cfg_grammar, maxdepth=8):
    """
    given a ContextFreeGrammar cfg_grammar, and an optional maximum depth
    (maxdepth, default = 8), construct a string which is acceptable to the
    grammar.

    one production for the grammar's start symbol is picked at random (once),
    and is then expanded with random_rule until an attempt comes back without
    any truncated constituent. the tokens of the result are separated by
    single spaces.

    the maxdepth option specifies the maximum number of recursive applications
    of the grammar used to generate a string.

    warning: if maxdepth is too small, in its current implementation, this
    method may never terminate, because every attempt is rejected and retried.
    """
    start_rule = random.choice(tuple(cfg_grammar.productions(lhs=cfg_grammar.start())))
    while True:
        sent = " ".join(random_rule(start_rule, cfg_grammar, maxdepth))
        # a constituent that hit the depth limit is handed back by random_rule
        # as an empty string, so after joining it shows up as an empty token:
        # a leading space, a trailing space, or two spaces in a row. any of
        # those means the sentence is incomplete and must be regenerated.
        # (a completely empty sentence is returned as-is.)
        if sent and "" in sent.split(" "):
            continue
        return sent
def random_rule(rule, grammar, maxdepth=8):
    """
    randomly apply the given rule according to the given grammar.

    this is a generator over the right-hand side of rule, in order:
      * a terminal is yielded unchanged.
      * for a nonterminal, one production having that nonterminal as its
        left-hand side is chosen with random.choice and expanded recursively
        with maxdepth - 1; the pieces of that expansion are joined with
        single spaces and yielded as one string.

    if maxdepth is 0, stop the iteration without yielding anything. a
    nonterminal whose expansion is cut off this way therefore contributes an
    empty string to its parent.
    """
    if not maxdepth:
        # a plain return ends a generator cleanly; "raise StopIteration"
        # inside a generator body is converted to RuntimeError by PEP 479
        # (Python 3.7+).
        return
    for rhs in rule.rhs():
        if isinstance(rhs, Nonterminal):
            start_rule = random.choice(tuple(grammar.productions(lhs=rhs)))
            yield " ".join(random_rule(start_rule, grammar, maxdepth - 1))
        else:
            yield rhs
if __name__ == "__main__":
    import nltk

    grammar = nltk.data.load('grammars/spanish_grammars/spanish2.cfg')
    parser = EarleyChartParser(grammar)

    # parse one fixed sentence and report how many distinct trees come back
    sentence = "un mesa con el amiga sobre los hombre vio una hombre sobre las mesa con una mesa con las noticia sobre el hombre"
    trees = parser.nbest_parse(sentence.split(' '))
    distinct_trees = {str(tree) for tree in trees}
    print(len(distinct_trees))

    # NOTE: the script exits unconditionally here, so the random-sentence
    # loop below is never reached.
    exit(0)

    for _ in range(100):
        generated = generate_random_sentence(grammar)
        words = generated.split(' ')
        #if len(words) > 15:
        #    continue
        print(generated)
        print(len(parser.nbest_parse(words)))