2to3 (compiles, not tested)
[tag_parser.git] / tests / performance / test_large_grammar.py
blob5fdf6b15e4be2dedf53136ae314228832be582f0
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 17, 2011
5 compare the performance of various parsers using the ATIS grammar.
7 (extracted from a treebank of the DARPA ATIS3
8 http://www.informatics.sussex.ac.uk/research/groups/nlp/carroll/cfg-resources/
9 context free, *NOT* CNF (nodes w/ 10 children), 5517 productions, 549 non-terminals, 925 terminals
10 average sentence length: 11.408163265306122
12 Note that there are slight differences in the way that my parsers and the NLTK
13 parsers are invoked. Due to a design oversight, my parsers cannot feasibly
14 compute the parse trees of complex sentences with large grammars, so instead
15 I merely ask whether they recognize a string as valid. As the NLTK parsers
16 do not have a similar method, I cannot compare them directly.
18 In any event, my parsers cannot even recognize sentences from this grammar
19 in a reasonable length of time.
21 @author: mjacob
22 '''
23 import nltk
24 from ParsePerformanceTester import ParsePerformanceTester, NLTK_BEST, MY_PARSER
# --- Benchmark inputs --------------------------------------------------------
# The grammar is loaded once here; the tester itself is handed the equivalent
# loading expression as source text (grammar_string), not the loaded object.
grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
grammar_string = "nltk.data.load('grammars/large_grammars/atis.cfg')"

# Each extracted test case is a sequence whose first element is the sentence.
raw_data = nltk.data.load('grammars/large_grammars/atis_sentences.txt', format='raw')
test_cases = nltk.parse.util.extract_test_sentences(raw_data)
sentences = [case[0] for case in test_cases]

tester = ParsePerformanceTester(grammar_string, sentences)


def _entry_at(parsers, position):
    """Return the (name, parser_import) pair at *position* in *parsers*."""
    return list(parsers.items())[position]


# --- Timed runs --------------------------------------------------------------
# Every result is scaled by 1000 (presumably seconds -> milliseconds; confirm
# against ParsePerformanceTester.run).
_, nltk_import = _entry_at(NLTK_BEST, 0)
nltk_time = 1000*tester.run(nltk_import, method="parse")

_, first_import = _entry_at(MY_PARSER, 0)
my_time1 = 1000*tester.run(first_import, method="parse")

_, second_import = _entry_at(MY_PARSER, 1)
my_time2 = 1000*tester.run(second_import, method="parse")

# --- Report ------------------------------------------------------------------
# Header row names the two MY_PARSER entries; the data row pairs each of their
# timings with its ratio to the NLTK timing.
column_names = (_entry_at(MY_PARSER, 0)[0], _entry_at(MY_PARSER, 1)[0])
print(" sentence length NLTK %s ratio %s ratio2" % column_names)

row = (
    tester.average_sentence_length(),
    nltk_time,
    my_time1,
    my_time1/nltk_time,
    my_time2,
    my_time2/nltk_time,
)
print(" %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f" % row)