1 # This Python file uses the following encoding: utf-8
3 Created on Apr 29, 2011
5 Compute some statistics of my parsers and some NLTK parsers.
10 from ParsePerformanceTester
import ParsePerformanceTester
, MY_PARSER
, NLTK_BEST
12 from mjacob
.algorithms
.generate_random
import generate_random_sentence
15 from mjacob
.nltk
.grammar
.TreeAdjoiningGrammar
import TreeAdjoiningGrammar
20 NN -> N | NN PP | AP NN | NN CP
21 VP -> V1 | V2 NP | VC CP | VP PP | AdvP VP
23 AdvP -> Adv | AdvP Adv
25 Det -> 'a' | 'the' | 'one' | 'no' | 'each'
26 N -> 'dog' | 'cat' | 'man' | 'hamster' | 'woman' | 'sherif' | 'pug' | 'bird' | 'demon' | 'spider' | 'horse' | 'donkey' | 'farmer' | 'mink' | 'marmot' | 'groundhog' | 'cuy' | 'rainbow'
27 V1 -> 'sat' | 'slept' | 'yelled' | 'died' | 'wept' | 'sashayed' | 'fell' | 'rose' | 'won' | 'lost'
28 V2 -> 'chased' | 'hit' | 'saw' | 'heard' | 'shot' | 'beat' | 'fed' | 'lifted' | 'befriended' | 'googled' | 'smelled' | 'sensed'
29 VC -> 'thought' | 'knew' | 'saw' | 'heard' | 'believed' | 'understood'
30 P -> 'on' | 'in' | 'under' | 'beside' | 'inside' | 'near' | 'below' | 'above' | 'before' | 'after' | 'through'
31 A -> 'red' | 'blue' | 'quick' | 'slow' | 'frusty' | 'insane' | 'smarmy' | 'slithy' | 'mimsy' | 'green' | 'yellow' | 'black' | 'white' | 'orange' | 'indigo' | 'violet' | 'purple' | 'colorless' | 'tasty'
32 Adv -> 'quickly' | 'slowly' | 'completely' | 'entirely' | 'vaguely'
# Parse the CFG source text held in `g` into a grammar object.
grammar = nltk.parse_cfg(g)

# Python source text that rebuilds the same grammar from `g`; it is handed to
# ParsePerformanceTester further down instead of the grammar object itself.
grammar_string = """nltk.parse_cfg('''%s ''')""" % g

# Wrap the CFG in a TreeAdjoiningGrammar; it is only used below to report the
# number of nonterminals and terminals.
gr = TreeAdjoiningGrammar(cfg=grammar)
# Report the size of the grammar's symbol sets.
nonterminal_count = len(gr.nonterminals())
terminal_count = len(gr.terminals())
print("nonterminals: %s" % (nonterminal_count,))
print("terminals: %s" % (terminal_count,))

# Table header: the two %s slots are filled with the names (keys) of the first
# and second entries of MY_PARSER.
my_parser_entries = list(MY_PARSER.items())
first_parser_name = my_parser_entries[0][0]
second_parser_name = my_parser_entries[1][0]
print(" tree depth sentence length NLTK %s ratio %s ratio2" % (first_parser_name, second_parser_name))
# Benchmark loop: for each tree depth 4..10, load (or generate and save) a set
# of sentences, time the parsers on it, and print one table row.
45 for tree_depth
in range(4,11):
# Per-depth YAML file used to persist the sentence set between runs.
46 sentence_file
= 'medium_sentences_%s.yaml' % (tree_depth
,)
# Reuse previously saved sentences when the file already exists.
# NOTE(review): yaml.load is called without an explicit Loader and the handle
# returned by open() is never closed explicitly -- only load trusted files;
# consider yaml.safe_load inside a `with` block.
47 if os
.path
.exists(sentence_file
):
48 sentences
= yaml
.load(open(sentence_file
))
# Otherwise generate sentences from the grammar at this depth; each generated
# sentence is split on single spaces into a list of tokens.
# NOTE(review): the `else:` header and the tail of this list comprehension are
# not visible here -- confirm how many sentences are generated per depth.
50 sentences
= [generate_random_sentence(grammar
, tree_depth
).split(' ')
# Save the sentences so that later runs take the branch above.
# NOTE(review): the handle from open(..., 'w') is not closed explicitly.
52 yaml
.dump(sentences
, open(sentence_file
, 'w'))
# Build the tester from the grammar source string.
# NOTE(review): the remaining constructor arguments are not visible here.
54 tester
= ParsePerformanceTester(grammar_string
,
# Time the NLTK_BEST parser(s), visiting entries in sorted key order. The
# result of tester.run is scaled by 1000 (presumably seconds -> milliseconds;
# confirm the unit returned by tester.run).
# NOTE(review): nltk_time is reassigned on every iteration, and with the
# indentation lost it is unclear which of the statements below belong to this
# loop body -- verify against the original file.
57 for parser
, parser_import
in sorted(NLTK_BEST
.items()):
58 nltk_time
= 1000*tester
.run(parser_import
, method
="parse")
# Time the first entry of MY_PARSER.
59 parser
, parser_import
= list(MY_PARSER
.items())[0]
60 my_time1
= 1000*tester
.run(parser_import
, method
="parse")
# Time the second entry of MY_PARSER.
61 parser
, parser_import
= list(MY_PARSER
.items())[1]
62 my_time2
= 1000*tester
.run(parser_import
, method
="parse")
# Table row: depth, average sentence length, NLTK time, then for each of the
# two MY_PARSER entries its time followed by its ratio to the NLTK time.
# NOTE(review): raises ZeroDivisionError if nltk_time is 0.
64 print(" %2i %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f" % (tree_depth
, tester
.average_sentence_length(), nltk_time
, my_time1
, my_time1
/nltk_time
, my_time2
, my_time2
/nltk_time
))