# 2to3-converted (compiles, not tested)
# Source: tag_parser.git / tests / performance / test_medium_grammar.py
# blob 2d50e2ee1c4397971e5e8b8f1b784acf473dbf89
# This Python file uses the following encoding: utf-8
'''
Created on Apr 29, 2011

compute some statistics of my parsers and some nltk parsers
using the toy grammar.

@author: mjacob
'''
# stdlib
import os

# third-party
import nltk
import yaml

# project-local
from ParsePerformanceTester import ParsePerformanceTester, MY_PARSER, NLTK_BEST
from mjacob.algorithms.generate_random import generate_random_sentence
from mjacob.nltk.grammar.TreeAdjoiningGrammar import TreeAdjoiningGrammar
# Medium-sized toy CFG used for the benchmark.  Left-recursive rules
# (NN, VP, AP, AdvP) and the S-recursion through CP make parse time grow
# quickly with sentence length.
g = '''S -> NP VP
PP -> P NP
NP -> Det NN
NN -> N | NN PP | AP NN | NN CP
VP -> V1 | V2 NP | VC CP | VP PP | AdvP VP
AP -> A | AP A
AdvP -> Adv | AdvP Adv
CP -> 'that' S | S
Det -> 'a' | 'the' | 'one' | 'no' | 'each'
N -> 'dog' | 'cat' | 'man' | 'hamster' | 'woman' | 'sherif' | 'pug' | 'bird' | 'demon' | 'spider' | 'horse' | 'donkey' | 'farmer' | 'mink' | 'marmot' | 'groundhog' | 'cuy' | 'rainbow'
V1 -> 'sat' | 'slept' | 'yelled' | 'died' | 'wept' | 'sashayed' | 'fell' | 'rose' | 'won' | 'lost'
V2 -> 'chased' | 'hit' | 'saw' | 'heard' | 'shot' | 'beat' | 'fed' | 'lifted' | 'befriended' | 'googled' | 'smelled' | 'sensed'
VC -> 'thought' | 'knew' | 'saw' | 'heard' | 'believed' | 'understood'
P -> 'on' | 'in' | 'under' | 'beside' | 'inside' | 'near' | 'below' | 'above' | 'before' | 'after' | 'through'
A -> 'red' | 'blue' | 'quick' | 'slow' | 'frusty' | 'insane' | 'smarmy' | 'slithy' | 'mimsy' | 'green' | 'yellow' | 'black' | 'white' | 'orange' | 'indigo' | 'violet' | 'purple' | 'colorless' | 'tasty'
Adv -> 'quickly' | 'slowly' | 'completely' | 'entirely' | 'vaguely'
'''

# NOTE(review): nltk.parse_cfg was removed in NLTK 3; on modern NLTK this
# would be nltk.CFG.fromstring(g).  Kept as-is to match the nltk version
# this project was written against.
grammar = nltk.parse_cfg(g)
# Source-code form of the grammar constructor; presumably evaluated by
# ParsePerformanceTester to rebuild the grammar -- verify against its API.
grammar_string = """nltk.parse_cfg('''%s ''')""" % (g)
# Wrap the CFG as a tree-adjoining grammar and print its size statistics
# so the benchmark output records which grammar was used.
gr = TreeAdjoiningGrammar(cfg=grammar)
print(gr)
print("nonterminals: %s" % (len(gr.nonterminals())))
print("terminals: %s" % (len(gr.terminals())))

# Table header: the two %s slots are the names (dict keys) of the two
# parsers under test in MY_PARSER.  Assumes MY_PARSER has at least two
# entries and a stable iteration order -- TODO confirm.
print(" tree depth sentence length NLTK %s ratio %s ratio2" % (list(MY_PARSER.items())[0][0], list(MY_PARSER.items())[1][0]))
# For each tree depth, time the best NLTK parser against the two parsers
# in MY_PARSER on the same 100 random sentences, and print one table row.
for tree_depth in range(4, 11):
    # Sentences are cached per depth so repeated runs compare like with like.
    sentence_file = 'medium_sentences_%s.yaml' % (tree_depth,)
    if os.path.exists(sentence_file):
        # safe_load is sufficient (and safe): the cache file only ever
        # contains lists of plain strings written by the else-branch below.
        with open(sentence_file) as f:
            sentences = yaml.safe_load(f)
    else:
        sentences = [generate_random_sentence(grammar, tree_depth).split(' ')
                     for _ in range(100)]
        with open(sentence_file, 'w') as f:
            yaml.dump(sentences, f)

    tester = ParsePerformanceTester(grammar_string,
                                    sentences)

    # NOTE(review): only the timing of the LAST parser in sorted(NLTK_BEST)
    # order survives this loop -- earlier nltk_time values are overwritten.
    # Preserved as-is in case NLTK_BEST deliberately holds a single entry.
    for parser, parser_import in sorted(NLTK_BEST.items()):
        nltk_time = 1000*tester.run(parser_import, method="parse")

    # Time the two parsers under test (milliseconds).
    parser, parser_import = list(MY_PARSER.items())[0]
    my_time1 = 1000*tester.run(parser_import, method="parse")
    parser, parser_import = list(MY_PARSER.items())[1]
    my_time2 = 1000*tester.run(parser_import, method="parse")

    print(" %2i %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f" % (tree_depth, tester.average_sentence_length(), nltk_time, my_time1, my_time1/nltk_time, my_time2, my_time2/nltk_time))