eeb6a666aa9923eadb2a2593515f7ae5ae0ee56f
1 from nltk
.corpus
.reader
.util
import *
2 from nltk
.corpus
.reader
.bracket_parse
import BracketParseCorpusReader
4 class WSJDepCorpusReader(BracketParseCorpusReader
):
5 ''' Reader for the dependency parsed WSJ10. Will not include one-word
6 sentences, since these are not parsed (and thus not
7 POS-tagged!). All implemented foo_sents() functions should now be
8 of length 6268 since there are 38 one-word sentences. '''
9 def __init__(self
, root
):
10 BracketParseCorpusReader
.__init
__(self
,
11 "../corpus/wsjdep", # path to files
12 ['wsj.combined.10.dep']) # file-list or regexp
14 def _read_block(self
, stream
):
15 return read_regexp_block(stream
,
16 start_re
=r
'<sentence id=".+">')
    def _normalize(self, t):
        """Rewrite one XML <sentence> element into s-expression notation.

        The output is consumed by dep_parse(): the <text> content becomes
        a parenthesised word list, each <rel> becomes groups of
        ((HEADPOS ID WORD)(DEPPOS ID WORD)...), and the whole sentence is
        bracketed by '[' and ']'.
        """
        # convert XML to sexpr notation, more or less
        # NOTE(review): only sentence ids starting with "10" are matched
        # here, while _read_block accepts any id -- confirm the asymmetry
        # is intentional (presumably WSJ10-specific).
        t = re.sub(r'<sentence id="10.+">', r"[ ", t)

        # Strip the whitespace padding inside <text>, then replace the
        # whole element with a parenthesised copy of its content.
        t = re.sub(r"\s+<text>\s+(.*)\s+</text>", r"<text>\1</text>", t)
        t = re.sub(r"<text>((.|\n)*)</text>", r"(\1)\\n", t)

        # Dependency relations: drop the label, turn each <head>/<dep>
        # into a "(POS ID WORD)" triple (the head opens an extra paren
        # that the closing </rel> balances).
        t = re.sub(r'<rel label=".*?">', r'', t)
        t = re.sub(r'\s+<head id="(\d+?)" pos="(.+?)">(.+)</head>', r'((\2 \1 \3)', t)
        t = re.sub(r'\s+<dep id="(\d+?)" pos="(.+?)">(.+)</dep>', r'(\2 \1 \3)', t)
        t = re.sub(r'\s+</rel>', r')\\n', t)

        t = re.sub(r"\s*</sentence>", r"]", t)

        # \\n means "add an \n later", since we keep removing them
        t = re.sub(r"\\n", r"\n", t)
37 return dep_parse(self
._normalize
(t
))
40 tagonly
= self
._tagonly
(t
)
41 tagged_sent
= zip(self
._word
(t
), tagonly
)
45 PARENS
= re
.compile(r
'\(.+\)')
46 sentence
= PARENS
.findall(self
._normalize
(t
))[0]
47 WORD
= re
.compile(r
'([^\s()]+)')
48 words
= WORD
.findall(sentence
)
50 return [] # skip one-word sentences!
54 def _get_tagonly_sent(self
, parse
):
55 "Convert dependency parse into a sorted taglist"
60 for head
, dep
in parse
:
63 taglist
= list(tagset
)
64 taglist
.sort(lambda x
,y
: x
[1]-y
[1])
66 return [tag
for tag
,loc
in taglist
]
69 def _tags_and_parse(self
, t
):
70 parse
= dep_parse(self
._normalize
(t
))
71 return (self
._get
_tagonly
_sent
(parse
), parse
)
73 def _read_tags_and_parse_sent_block(self
, stream
):
74 tags_and_parse_sents
= [self
._tags
_and
_parse
(t
) for t
in self
._read
_block
(stream
)]
75 return [(tag
,parse
) for (tag
,parse
) in tags_and_parse_sents
if tag
and parse
]
77 def tagged_and_parsed_sents(self
, files
=None):
78 return concat([StreamBackedCorpusView(filename
,
79 self
._read
_tags
_and
_parse
_sent
_block
)
80 for filename
in self
.abspaths(files
)])
83 def _tagonly(self
, t
):
84 parse
= dep_parse(self
._normalize
(t
))
85 return self
._get
_tagonly
_sent
(parse
)
87 def _read_tagonly_sent_block(self
, stream
):
88 tagonly_sents
= [self
._tagonly
(t
) for t
in self
._read
_block
(stream
)]
89 return [tagonly_sent
for tagonly_sent
in tagonly_sents
if tagonly_sent
]
91 def tagonly_sents(self
, files
=None):
92 return concat([StreamBackedCorpusView(filename
,
93 self
._read
_tagonly
_sent
_block
)
94 for filename
in self
.abspaths(files
)])
99 "todo: add ROOT, which is implicitly the only non-dependent tagloc"
100 def read_tagloc(pos
):
101 match
= WORD
.match(s
, pos
+2)
105 match
= WORD
.match(s
, pos
)
106 loc
= int(match
.group(1))
109 match
= WORD
.match(s
, pos
) # skip the actual word
114 SPACE
= re
.compile(r
'\s*')
115 WORD
= re
.compile(r
'\s*([^\s\(\)]*)\s*')
116 RELSTART
= re
.compile(r
'\(\(')
118 # Skip any initial whitespace and actual sentence
119 match
= RELSTART
.search(s
, 0)
123 # eg. one word sentence, no dependency relation
127 head
, loc_h
= None, None
129 # Beginning of a sentence
131 pos
= SPACE
.match(s
, pos
+1).end()
134 pos
= SPACE
.match(s
, pos
+1).end()
135 if pos
!= len(s
): raise ValueError, "Trailing garbage following sentence"
137 # Beginning of a relation, head:
138 elif s
[pos
:pos
+2] == '((':
139 pos
, head
, loc_h
= read_tagloc(pos
)
141 elif s
[pos
:pos
+2] == ')(':
142 pos
, arg
, loc_a
= read_tagloc(pos
)
143 # Each head-arg relation gets its own pair in parse,
144 # although in xml we may have
145 # <rel><head/><dep/><dep/><dep/></rel>
146 parse
.add( ((head
,loc_h
),(arg
,loc_a
)) )
147 elif s
[pos
:pos
+2] == '))':
148 pos
= SPACE
.match(s
, pos
+2).end()
150 print "s: %s\ns[%d]=%s"%(s
,pos
,s
[pos
])
151 raise ValueError, 'unexpected token'
153 print "s: %s\ns[%d]=%s"%(s
,pos
,s
[pos
])
154 raise ValueError, 'mismatched parens (or something)'
158 "Return parse with ROOT added."
160 for (head
,loc_h
) in set([h
for h
,a
in parse
]):
161 if (head
,loc_h
) not in set([a
for h
,a
in parse
]):
163 raise ValueError, "Several possible roots in parse"
165 rooted
= (head
,loc_h
)
168 raise ValueError, "No root in parse!"
170 parse
.add( (('ROOT',-1), rooted
) )
173 if __name__
== "__main__":
174 print "WSJDepCorpusReader tests:"
175 reader
= WSJDepCorpusReader(None)
180 print "Tagged sentences:"
181 print reader
.tagged_sents()
183 parsedsents
= reader
.parsed_sents()
184 # print "Number of sentences: %d"%len(parsedsents) # takes a while
186 print "First parsed sentence:"
187 pprint
.pprint(parsedsents
[0])
189 tags_and_parses
= reader
.tagged_and_parsed_sents()
190 print "121st tagged and then parsed sentence:"
191 pprint
.pprint(tags_and_parses
[121])