src/mjacob/algorithms/generate_random.py

   1 # This Python file uses the following encoding: utf-8
   2 '''
   3 Created on May 14, 2011
   4
   5 helper methods for generating random sentences from a context free grammar
   6
   7 this is used both for performance testing and for generating content which
   8 can be used to check the accuracy of parses against existing models.
   9
  10 @author: mjacob
  11 '''
  12 import random
  13 from nltk.grammar import Nonterminal
  14 from nltk.parse.earleychart import EarleyChartParser
  15
  16 def generate_random_sentence(cfg_grammar, maxdepth=8):
  17     """
  18     given a ContextFreeGrammar cfg_grammar, and an optional maximum depth (maxdepth, default = 8)
  19     construct a string which is acceptable to the grammar.
  20
  21     the maxdepth option specifies the maximum number of recursive applications of the grammar
  22     used to generate a string.
  23
  24     warning: if maxdepth is too small, in its current implementation, this method may never terminate.
  25     """
  26     start_rule = random.choice(tuple(cfg_grammar.productions(lhs=cfg_grammar.start())))
  27     while True:
  28         sent = " ".join(random_rule(start_rule, cfg_grammar, maxdepth))
  29         if "  " in sent or sent.endswith(" "): continue
  30         return sent
  31
  32 def random_rule(rule, grammar, maxdepth=8):
  33     """
  34     randomly apply the given rule according to the given grammar.
  35
  36     if maxdepth is 0, stop the iteration.
  37     """
  38     if not maxdepth:
  39         raise StopIteration
  40     for rhs in rule.rhs():
  41         if type(rhs) is Nonterminal:
  42             start_rule = random.choice(tuple(grammar.productions(lhs=rhs)))
  43             yield " ".join(random_rule(start_rule, grammar, maxdepth-1))
  44         else:
  45             yield rhs
  46
  47 if __name__ == "__main__":
  48     import nltk
  49     g = nltk.data.load('grammars/spanish_grammars/spanish2.cfg')
  50     p = EarleyChartParser(g)
  51     s = "un mesa con el amiga sobre los hombre vio una hombre sobre las mesa con una mesa con las noticia sobre el hombre"
  52     parse = p.nbest_parse(s.split(' '))
  53     print(len(set([str(tree) for tree in parse])))
  54
  55     exit(0)
  56     for i in range(100):
  57         sent = generate_random_sentence(g)
  58         tokens = sent.split(' ')
  59         #if len(tokens) > 15:
  60         #    continue
  61         print(sent)
  62         print(len(p.nbest_parse(tokens)))