# This Python file uses the following encoding: utf-8
"""
Created on May 14, 2011

Helper methods for generating random sentences from a context-free grammar.

This is used both for performance testing and for generating content which
can be used to check the accuracy of parses against existing models.
"""
import random

import nltk
from nltk.grammar import Nonterminal
from nltk.parse.earleychart import EarleyChartParser
def generate_random_sentence(cfg_grammar, maxdepth=8):
    """Return a random sentence that is acceptable to ``cfg_grammar``.

    A production for the grammar's start symbol is picked at random and
    expanded with ``random_rule``; the resulting pieces are joined with
    single spaces.  Candidates containing an empty token (a leading,
    trailing or doubled space, or an entirely empty string) are discarded
    and a fresh candidate is generated.

    :param cfg_grammar: grammar object providing ``start()`` and
        ``productions(lhs=...)``.
    :param maxdepth: maximum number of recursive applications of the
        grammar used to generate a string (default 8).
    :return: the generated sentence, tokens separated by single spaces.

    Warning: if ``maxdepth`` is too small, in its current implementation,
    this function may never terminate, because every candidate may be
    rejected.
    """
    # The start productions do not change between attempts; look them up once.
    start_productions = tuple(cfg_grammar.productions(lhs=cfg_grammar.start()))

    while True:
        start_rule = random.choice(start_productions)
        sent = " ".join(random_rule(start_rule, cfg_grammar, maxdepth))

        # Splitting on a single space exposes every empty piece as an empty
        # token, wherever it sits in the sentence ("".split(" ") == [""]).
        if all(sent.split(" ")):
            return sent
def random_rule(rule, grammar, maxdepth=8):
    """Randomly apply ``rule`` according to ``grammar``, yielding its pieces.

    Each symbol on the right-hand side of ``rule`` is visited in order.  A
    terminal is yielded as is.  A nonterminal is expanded by choosing one of
    its productions at random and recursing with ``maxdepth - 1``; the
    pieces of that expansion are joined with single spaces and yielded as
    one string.

    :param rule: production to expand; must provide ``rhs()``.
    :param grammar: grammar object providing ``productions(lhs=...)``.
    :param maxdepth: remaining recursion budget (default 8).  If it is 0
        (or negative) the iteration stops immediately and nothing is
        yielded, so the caller receives an empty string for that piece.
    :raises IndexError: from ``random.choice`` if a nonterminal has no
        productions in ``grammar``.
    """
    # A plain return ends the generator; raising StopIteration inside a
    # generator is turned into RuntimeError by PEP 479.
    if maxdepth <= 0:
        return

    for rhs in rule.rhs():
        # isinstance also accepts subclasses of Nonterminal.
        if isinstance(rhs, Nonterminal):
            start_rule = random.choice(tuple(grammar.productions(lhs=rhs)))
            yield " ".join(random_rule(start_rule, grammar, maxdepth - 1))
        else:
            yield rhs
if __name__ == "__main__":
    # Load the sample Spanish grammar and build an Earley chart parser for it.
    # NOTE(review): nbest_parse() is the NLTK 2 parser API; NLTK 3 replaced
    # it with parse() -- confirm which NLTK version this script targets.
    grammar = nltk.data.load('grammars/spanish_grammars/spanish2.cfg')
    parser = EarleyChartParser(grammar)

    # Parse a fixed sentence and print the number of distinct trees found.
    fixed_sentence = "un mesa con el amiga sobre los hombre vio una hombre sobre las mesa con una mesa con las noticia sobre el hombre"
    fixed_trees = parser.nbest_parse(fixed_sentence.split(' '))
    distinct_trees = {str(tree) for tree in fixed_trees}
    print(len(distinct_trees))

    # Generate one random sentence and print how many parses it receives.
    random_sentence = generate_random_sentence(grammar)
    random_tokens = random_sentence.split(' ')
    random_trees = parser.nbest_parse(random_tokens)
    print(len(random_trees))