tests/performance/test_large_grammar.py

   1 # This Python file uses the following encoding: utf-8
   2 '''
   3 Created on May 17, 2011
   4
   5 compare the performance of various parsers using the ATIS grammar.
   6
The grammar was extracted from a treebank of the DARPA ATIS3; see
http://www.informatics.sussex.ac.uk/research/groups/nlp/carroll/cfg-resources/
It is context free but *NOT* in CNF (nodes w/ 10 children): 5517 productions, 549 non-terminals, 925 terminals
  10 average sentence length: 11.408163265306122
  11
Note that there are slight differences in the way that my parsers and the NLTK
parsers are invoked. Due to a design oversight, my parsers cannot feasibly
compute the parse trees of complex sentences with large grammars, so instead
I merely ask whether they recognize a string as valid. As the NLTK parsers
do not have a similar method, I cannot compare them directly.
  17
  18 In any event, my parsers cannot even recognize sentences from this grammar
  19 in a reasonable length of time.
  20
  21 @author: mjacob
  22 '''
  23 import nltk
  24 from ParsePerformanceTester import ParsePerformanceTester, NLTK_BEST, MY_PARSER
  25
  26 grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
  27 grammar_string = "nltk.data.load('grammars/large_grammars/atis.cfg')"
  28
  29 raw_data = nltk.data.load('grammars/large_grammars/atis_sentences.txt', format='raw')
  30 sentences = [x[0] for x in nltk.parse.util.extract_test_sentences(raw_data)]
  31
  32 tester = ParsePerformanceTester(grammar_string,
  33                                 sentences)
  34
  35 parser, parser_import = list(NLTK_BEST.items())[0]
  36 nltk_time = 1000*tester.run(parser_import, method="parse")
  37 parser, parser_import = list(MY_PARSER.items())[0]
  38 my_time1 = 1000*tester.run(parser_import, method="parse")
  39 parser, parser_import = list(MY_PARSER.items())[1]
  40 my_time2 = 1000*tester.run(parser_import, method="parse")
  41
  42 print("    sentence length  NLTK    %s ratio    %s ratio2" % (list(MY_PARSER.items())[0][0], list(MY_PARSER.items())[1][0]))
  43 print("    %5.2f            %5.2f    %5.2f    %5.2f    %5.2f    %5.2f" % (tester.average_sentence_length(), nltk_time, my_time1, my_time1/nltk_time, my_time2, my_time2/nltk_time))