loc_h_dmv reestimation conforms to formulas.pdf, but cnf_ version still differs somehow
[dmvccm.git] / tex / formulas.tex
blob740ee3efdfba0b86a04043c91996e1bec153ecc3
1 % Created 2008-06-13 Fri 17:05
2 \documentclass[11pt,a4paper]{article}
3 \usepackage[utf8]{inputenc}
4 \usepackage[T1]{fontenc}
5 \usepackage{hyperref}
6 \usepackage{natbib}
8 \usepackage{pslatex}
9 \usepackage{pdfsync}
10 \pdfoutput=1
12 \usepackage{qtree}
13 \usepackage{amsmath}
14 \usepackage{amssymb}
16 \usepackage{avm}
17 \avmfont{\sc}
18 \avmoptions{sorted,active}
19 \avmvalfont{\rm}
20 \avmsortfont{\scriptsize\it}
22 \usepackage{array} % for a tabular with math in it
24 \title{DMV formulas}
25 \author{Kevin Brubeck Unhammer}
26 \date{21 August 2008}
28 \begin{document}
30 \maketitle
32 This is an attempt at fleshing out the details of the inside-outside
33 algorithm \citep{lari-csl90} applied to the DMV model of
34 \citet{klein-thesis}.
36 \tableofcontents
38 \newcommand{\LOC}[1]{\textbf{#1}}
39 \newcommand{\GOR}[1]{\overrightarrow{#1}}
40 \newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
41 \newcommand{\SEAL}[1]{\overline{#1}}
42 \newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
43 \newcommand{\GOL}[1]{\overleftarrow{#1}}
44 \newcommand{\LN}[1]{\underleftarrow{#1}}
45 \newcommand{\RN}[1]{\underrightarrow{#1}}
46 \newcommand{\XI}{\lessdot}
47 \newcommand{\XJ}{\gtrdot}
48 \newcommand{\SMTR}[1]{\dddot{#1}}
49 \newcommand{\SDTR}[1]{\ddot{#1}}
51 \section{Note on notation}
52 $i, j, k$ are sentence positions (between words), where $i$ and $j$
53 are always the start and end, respectively, for what we're calculating
54 ($k$ is between $i$ and $j$ for $P_{INSIDE}$, to their right or left
55 for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $\LOC{w}$
56 is a word token (actually POS-token) of type $w$ at a certain sentence
57 location. If $\LOC{w}$ is between $i$ and $i+1$, $loc(\LOC{w})=i$
58 following \citet{klein-thesis}, meaning $i$ is adjacent to $\LOC{w}$
59 on the left, while $j=loc(\LOC{w})+1$ means that $j$ is adjacent to
60 $\LOC{w}$ on the right. To simplify, $loc_l(\LOC{w}):=loc(\LOC{w})$ and
61 $loc_r(\LOC{w}):=loc(\LOC{w})+1$. We write $\LOC{h}$ if this is a head
62 in the rule being used, $\LOC{a}$ if it is an attached argument.
64 Notational differences between the thesis \citep{klein-thesis} and the
65 paper \citep{km-dmv}:
67 \begin{tabular}{cc}
68 Paper: & Thesis: \\
69 $w$ & $\GOR{w}$ \\
70 $w\urcorner$ & $\RGOL{w}$ \\
71 $\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
72 \end{tabular}
74 I use $\SMTR{w}$ (or $\SDTR{w}$) to signify one of either $w, \GOR{w},
75 \RGOL{w}, \LGOR{w}, \GOL{w}$ or $\SEAL{w}$\footnote{This means that
76 $\SMTR{\LOC{w}}$ is the triplet of the actual POS-tag, its sentence
77 location as a token, and the ``level of seals''.}.
80 \section{Inside probabilities}
81 $P_{INSIDE}$ is defined in \citet[pp.~106-108]{klein-thesis}, the only
82 thing we need to add is that for right attachments,
83 $i \leq loc_l(w)<k \leq loc_l(\LOC{a})<j$ while for left attachments,
84 $i \leq loc_l(\LOC{a})<k \leq loc_l(w)<j$.
86 (For now, let
87 \[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \] since the DMV implementation
88 is not yet generalized to both directions.)
94 \subsection{Sentence probability}
96 $P_s$ is the sentence probability, based on
97 \citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
98 rest, we sum them explicitly in this definition:
99 \begin{align*}
100 P_s = \sum_{\LOC{w} \in s} P_{ROOT}(\LOC{w}) P_{INSIDE}(\SEAL{\LOC{w}}, 0, len(s))
101 \end{align*}
103 \section{Outside probabilities}
105 \begin{align*}
106 P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
107 1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
108 0.0 & \text{ otherwise}
109 \end{cases}
110 \end{align*}
112 For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ is attached as an argument
113 under something else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$). Adjacency is
114 thus calculated on the basis of $h$, the head of the rule. If $w$ is
115 attached from the left we have $i \leq loc_l(\LOC{w}) < j \leq loc_l(\LOC{h}) < k$, while
116 from the right we have $k \leq loc_l(\LOC{h}) < i \leq loc_l(\LOC{w}) < j$:
117 \begin{align*}
118 P_{OUTSIDE}&(\SEAL{\LOC{w}}, i, j) = \\
119 & P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
120 & [ \sum_{k > j} ~ \sum_{\LOC{h}:j\leq loc_l(\LOC{h})<k} \sum_{\SMTR{\LOC{h}} \in \{\RGOL{\LOC{h}},\GOL{\LOC{h}}\}} P_{STOP}(\neg stop|h, left, adj(j, \LOC{h})) P_{ATTACH}(w|h, left) \\
121 & \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(\SMTR{\LOC{h}}, i, k) P_{INSIDE}(\SMTR{\LOC{h}}, j, k) ] ~ + \\
122 & [ \sum_{k < i} ~ \sum_{\LOC{h}:k\leq loc_l(\LOC{h})<i} \sum_{\SMTR{\LOC{h}} \in \{\LGOR{\LOC{h}},\GOR{\LOC{h}}\}} P_{STOP}(\neg stop|h, right, adj(i, \LOC{h})) P_{ATTACH}(w|h, right) \\
123 & \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(\SMTR{\LOC{h}}, k, i) P_{OUTSIDE}(\SMTR{\LOC{h}}, k, j) ]
124 \end{align*}
126 For $\RGOL{w}$ we know it is either under a left stop rule or it is
127 the right daughter of a left attachment rule ($k \leq loc_l(\LOC{a}) <
128 i \leq loc_l(\LOC{w}) < j$), and these are adjacent if the start point
129 ($i$) equals $loc_l(\LOC{w})$:
130 \begin{align*}
131 P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
132 \LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
133 & [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
134 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\RGOL{\LOC{w}}, k, j) ]
135 \end{align*}
137 For $\GOR{w}$ we are either under a right stop or the left daughter of
138 a right attachment rule ($i \leq loc_l(\LOC{w}) < j \leq
139 loc_l(\LOC{a}) < k$), adjacent iff the end point ($j$) equals
140 $loc_r(\LOC{w})$:
141 \begin{align*}
142 P_{OUTSIDE}(\GOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
143 \LOC{w}))P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) ~ + \\
144 & [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
145 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\GOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
146 \end{align*}
148 $\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
149 of having a stop above, where we use $\LGOR{w}$:
150 \begin{align*}
151 P_{OUTSIDE}(\GOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
152 \LOC{w}))P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) ~ + \\
153 & [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
154 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\GOL{\LOC{w}}, k, j) ]
155 \end{align*}
157 $\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
158 of having a stop above, where we use $\SEAL{w}$:
159 \begin{align*}
160 P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
161 \LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
162 & [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
163 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\LGOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
164 \end{align*}
167 \section{Reestimating the rules} % todo
169 \subsection{$c$ and $w$ (helper formulas used below)}
170 % todo: left out rule-probability! wops (also, make P_{fooside}
171 % sentence-specific)
173 $c_s(\SMTR{\LOC{w}} : i, j)$ is ``the expected fraction of parses of $s$ with a
174 node labeled $\SMTR{w}$ extending from position $i$ to position $j$''
175 \citep[p.~88]{klein-thesis}, here defined to equal $v_{q}$ of
176 \citet[p.~41]{lari-csl90}:
177 \begin{align*}
178 c_s(\SMTR{\LOC{w}} : i, j) = P_{INSIDE_s}(\SMTR{\LOC{w}}, i, j) P_{OUTSIDE_s}(\SMTR{\LOC{w}}, i, j) / P_s
179 \end{align*}
181 $w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $\SMTR{h}$ and $dir$:
182 \begin{align*}
183 w_s(\SEAL{a} & : \SMTR{\LOC{h}}, left, i, j) = \\
184 & 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:i\leq loc_l(\LOC{a})<k}
185 & P_{STOP}(\neg stop|h, left, adj(k, \SMTR{\LOC{h}})) P_{CHOOSE}(a|h, left) \\
186 & & P_{INSIDE_s}(\SEAL{\LOC{a}}, i, k) P_{INSIDE_s}(\SMTR{\LOC{h}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
187 \end{align*}
188 \begin{align*}
189 w_s(\SEAL{a} & : \SMTR{\LOC{h}}, right, i, j) = \\
190 & 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<j}
191 & P_{STOP}(\neg stop|h, right, adj(k, \SMTR{\LOC{h}})) P_{CHOOSE}(a|h, right) \\
192 & & P_{INSIDE_s}(\SMTR{\LOC{h}}, i, k) P_{INSIDE_s}(\SEAL{\LOC{a}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
193 \end{align*}
195 Let $\hat{P}$ be the new STOP/CHOOSE-probabilities (since the old $P$
196 are used in $P_{INSIDE}$ and $P_{OUTSIDE}$).
198 \subsection{Attachment reestimation}
200 $\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(\LOC{h})$
201 since we want trees with at least one attachment:
202 \begin{align*}
203 \hat{a} (a | \SMTR{h}, left) = \frac
204 { \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, left, i, j) }
205 { \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
206 \end{align*}
208 Here $j>loc_r(\LOC{h})$ since we want at least one attachment:
209 \begin{align*}
210 \hat{a} (a | \SMTR{h}, right) = \frac
211 { \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, right, i, j) }
212 { \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
213 \end{align*}
215 For the first/lowest attachments, $w_s$ and $c_s$ have zero probability
216 where $i<loc_l(\LOC{h})$ (for $\GOR{h}$) or $j>loc_r(\LOC{h})$ (for $\GOL{h}$);
217 this is implicit in $P_{INSIDE}$.
221 \begin{align*}
222 \hat{P}_{CHOOSE} (a | h, left) = \frac
223 { \hat{a} (a | \GOL{h}, left) }
224 { \sum_{w} \hat{a} (w | \GOL{h}, left) }
226 \frac
227 { \hat{a} (a | \RGOL{h}, left) }
228 { \sum_{w} \hat{a} (w | \RGOL{h}, left) }
229 \end{align*}
230 \begin{align*}
231 \hat{P}_{CHOOSE} (a | h, right) = \frac
232 { \hat{a} (a | \GOR{h},right) }
233 { \sum_{w} \hat{a} (w | \GOR{h},right) }
235 \frac
236 { \hat{a} (a | \LGOR{h},right) }
237 { \sum_{w} \hat{a} (w | \LGOR{h},right) }
238 \end{align*}
240 \subsection{Stop reestimation}
241 The following is based on \citet[p.~88]{klein-thesis}. For the
242 non-adjacent rules, $i<loc_l(\LOC{h})$ on the left and $j>loc_r(\LOC{h})$ on the
243 right, while for the adjacent rules these are equal (respectively).
245 To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
246 \begin{align*}
247 \hat{d}(\SMTR{h},\SDTR{h},\XI,\XJ) = \frac
248 { \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
249 { \sum_{s \in S} \sum_{\SDTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SDTR{\LOC{h}} : i, j) }
250 \end{align*}
252 Then these are our reestimated stop probabilities:
253 \begin{align*}
254 \hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
255 \hat{d}(\SEAL{h}, \RGOL{h},<,\geq) +
256 \hat{d}(\LGOR{h}, \GOL{h},<,=)
257 \end{align*}
259 \begin{align*}
260 \hat{P}_{STOP} (STOP|h, left, adj) =
261 \hat{d}(\SEAL{h}, \RGOL{h},=,\geq) +
262 \hat{d}(\LGOR{h}, \GOL{h},=,=)
263 \end{align*}
265 \begin{align*}
266 \hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
267 \hat{d}(\RGOL{h}, \GOR{h},=,>) +
268 \hat{d}(\SEAL{h}, \LGOR{h},\leq,>)
269 \end{align*}
271 \begin{align*}
272 \hat{P}_{STOP} (STOP|h, right, adj) =
273 \hat{d}(\RGOL{h}, \GOR{h},=,=) +
274 \hat{d}(\SEAL{h}, \LGOR{h},\leq,=)
275 \end{align*}
278 \subsection{Root reestimation} % todo grok: no \sum_{i} \sum_{j} ?
279 \begin{align*}
280 \hat{P}_{ROOT} (h) = \frac
281 {\sum_{s\in S} P_{ROOT}(h) P_{INSIDE_s}(\SEAL{h}, 0, len(s)) }
282 {\sum_{s\in S} P_s}
283 \end{align*}
288 \section{Alternate CNF rules}
289 Since the IO algorithm as described in \citet{lari-csl90} is made for
290 pure CNF style rules, we have an alternate grammar (figure \ref{cnf})
291 for running parallel tests where we don't have to sum over the
292 different $loc(h)$ in IO. This is
293 not yet generalized to include left-first attachment.
295 \begin{figure}[htp]
296 \centering
297 \begin{tabular} % four left-aligned math tabs, one vertical line
298 { >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
299 \multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
300 \hline{}
302 \RN{\GOR{h}} \rightarrow& \GOR{h} &\SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
303 &&&\\
304 \RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} &\SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
305 &&&\\
306 \RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
307 &&&\\
308 \RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
309 &&&\\
310 \LN{\RGOL{h}} \rightarrow& \SEAL{a} &\RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
311 &&&\\
312 \LN{\RGOL{h}} \rightarrow& \SEAL{a} &\LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
313 &&&\\
314 \SEAL{h} \rightarrow& STOP &\RGOL{h} &P_{STOP}(stop|h, left, adj) \\
315 &&&\\
316 \SEAL{h} \rightarrow& STOP &\LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
317 \end{tabular}
318 \caption{Alternate CFG rules (where a child node has an arrow below,
319 we use non-adjacent probabilities), defined for all words/POS-tags
320 $h$.}\label{cnf}
321 \end{figure}
323 The inside probabilities are the same as those given in
324 \citet{lari-csl90}, with the following exceptions:
326 When calculating $P_{INSIDE}(\SMTR{h}, i, j)$ and summing through possible
327 rules which rewrite $\SMTR{h}$, if a rule is of the form $\SMTR{h} \rightarrow
328 STOP ~ \SDTR{h}$ or $\SMTR{h} \rightarrow \SDTR{h} ~ STOP$, we add $P_{RULE}\cdot
329 P_{INSIDE}(\SDTR{h}, i, j)$ (that is, rewrite for the same sentence range);
330 and, as a consequence of these unary rules: for ``terminal rules''
331 ($P_{ORDER}$) to be applicable, not only must $i = j-1$, but also the
332 left-hand side symbol of the rule must be of the form $\GOR{h}$.
334 Similarly, the outside probabilities are the same as those for pure
335 CNF rules, with the exception that we add the unary rewrite
336 probabilities
337 \begin{align*}
338 \sum_{\SMTR{h}} [&P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow \SDTR{h} ~ STOP) \\
339 + &P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow STOP ~ \SDTR{h})]
340 \end{align*}
341 to $P_{OUTSIDE}(\SDTR{h},i,j)$ (e.g.\ $f(s,t,i)$).
343 The reestimation just needs to be expanded to allow unary stop rules,
344 this is similar to the formula for binary rules in
345 \citet[p.~41]{lari-csl90}:
346 \begin{align*}
347 \hat{P}_{RULE}(\SMTR{h} \rightarrow \SDTR{h}) = \frac
348 { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(\SMTR{h}\rightarrow \SDTR{h})P_{INSIDE}(\SDTR{h}, i, j)P_{OUTSIDE}(\SMTR{h}, i, j) }
349 { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} c_s(\SMTR{h}, i, j) }
350 \end{align*}
351 (the denominator is unchanged, but the numerator sums $w_s$ defined
352 for unary rules; $\SMTR{h}\rightarrow \SDTR{h}$ is meant to include both left and
353 right stop rules).
355 % todo: add ROOT rule to CNF grammar?
358 \bibliography{./statistical.bib}
359 \bibliographystyle{plainnat}
361 \end{document}