% found a bug in w-formula, 1/P_s should be outside sums, starting to think we should...
% [dmvccm.git] / tex / formulas.tex
% blob 20607c9332366fd6751f438e73a2b8264d6c7ffe
1 % Created 2008-06-13 Fri 17:05
2 \documentclass[11pt,a4paper]{article}
3 \usepackage[utf8]{inputenc}
4 \usepackage[T1]{fontenc}
5 \usepackage{hyperref}
6 \usepackage{natbib}
8 \usepackage{pslatex}
9 \usepackage{pdfsync}
10 \pdfoutput=1
12 \usepackage{qtree}
13 \usepackage{amsmath}
14 \usepackage{amssymb}
16 \usepackage{avm}
17 \avmfont{\sc}
18 \avmoptions{sorted,active}
19 \avmvalfont{\rm}
20 \avmsortfont{\scriptsize\it}
22 \usepackage{array} % for a tabular with math in it
24 \title{DMV formulas}
25 \author{Kevin Brubeck Unhammer}
26 \date{2 July 2008}
28 \begin{document}
30 \maketitle
31 \tableofcontents
33 \newcommand{\GOR}[1]{\overrightarrow{#1}}
34 \newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
35 \newcommand{\SEAL}[1]{\overline{#1}}
36 \newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
37 \newcommand{\GOL}[1]{\overleftarrow{#1}}
38 \newcommand{\LN}[1]{\underleftarrow{#1}}
39 \newcommand{\RN}[1]{\underrightarrow{#1}}
41 \section{Note on notation}
42 $i, j, k$ are sentence positions (between words), where $i$ and $j$
43 are always the start and end, respectively, for what we're calculating
44 ($k$ is between $i$ and $j$ for $P_{INSIDE}$, to their right or left
45 for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $w$ is a
46 word at a certain sentence location. If $w$ is between $i$ and $i+1$,
47 $loc(w)=i$ following \citet{klein-thesis}, meaning $i$ is adjacent to
48 $w$ on the left, while $j=loc(w)+1$ means that $j$ is adjacent to $w$
49 on the right. To simplify, $loc_l(w):=loc(w)$ and
50 $loc_r(w):=loc(w)+1$. We write $h$ if this is a
51 head in the rule being used, $a$ if it is an attached argument.
53 Notational differences between the thesis \citep{klein-thesis} and the
54 paper \citep{km-dmv}:
56 \begin{tabular}{cc}
57 Paper: & Thesis: \\
58 $w$ & $\GOR{w}$ \\
59 $w\urcorner$ & $\RGOL{w}$ \\
60 $\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
61 \end{tabular}
63 $x$ can mean $w, \GOR{w}, \RGOL{w}$ or $\SEAL{w}$\footnote{This means
64 that in the implementation, each such $x$ is represented by the
65 triplet of the actual POS-tag, its sentence location, and the
66 ``level of seals''.}.
69 \section{Inside probabilities}
70 $P_{INSIDE}$ is defined in \citet[pp.~106-108]{klein-thesis}, the only
71 thing we need to add is that for right attachments,
72 $i \leq loc_l(w)<k \leq loc_l(a)<j$ while for left attachments,
73 $i \leq loc_l(a)<k \leq loc_l(w)<j$.
75 (For now, let
76 \[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \] since the DMV implementation
77 is not yet generalized to both directions.)
83 \subsection{Sentence probability}
85 $P_s$ is the sentence probability, based on
86 \citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
87 rest, we sum them explicitly in this definition:
88 \begin{align*}
89 P_s = \sum_{w \in s} P_{ROOT}(w) P_{INSIDE}(\SEAL{w}, 0, len(s))
90 \end{align*}
92 \section{Outside probabilities}
94 \begin{align*}
95 P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
96 1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
97 0.0 & \text{ otherwise}
98 \end{cases}
99 \end{align*}
101 For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ has been attached under something
102 else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$). Adjacency is
103 thus calculated on the basis of $h$, the head of the rule. If we are
104 attached to from the left we have $i \leq loc_l(w) < j \leq loc_l(h) < k$, while
105 from the right we have $k \leq loc_l(h) < i \leq loc_l(w) < j$:
106 \begin{align*}
107 P_{OUTSIDE}&(\SEAL{w}, i, j) = \\
108 & P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
109 & [ \sum_{k > j} ~ \sum_{h:j\leq loc_l(h)<k} \sum_{x \in \{\RGOL{h},\GOL{h}\}} P_{STOP}(\neg stop|h, left, adj(j, h)) P_{ATTACH}(w|h, left) \\
110 & \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(x, i, k) P_{INSIDE}(x, j, k) ] ~ + \\
111 & [ \sum_{k < i} ~ \sum_{h:k\leq loc_l(h)<i} \sum_{x \in \{\LGOR{h},\GOR{h}\}} P_{STOP}(\neg stop|h, right, adj(i, h)) P_{ATTACH}(w|h, right) \\
112 & \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(x, k, i) P_{OUTSIDE}(x, k, j) ]
113 \end{align*}
115 For $\RGOL{w}$ we know it is either under a left stop rule or it is
116 the right daughter of a left attachment rule ($k \leq loc_l(a) < i \leq loc_l(w)
117 < j$), and these are adjacent if the start point ($i$) equals
118 $loc_l(w)$:
119 \begin{align*}
120 P_{OUTSIDE}(\RGOL{w}, i, j) = & P_{STOP}(stop|w, left, adj(i,
121 w))P_{OUTSIDE}(\SEAL{w}, i, j) ~ + \\
122 & [ \sum_{k < i} ~ \sum_{a:k\leq loc_l(a)<i} P_{STOP}(\neg stop|w, left, adj(i, w)) P_{ATTACH}(a|w, left) \\
123 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{a}, k, i) P_{OUTSIDE}(\RGOL{w}, k, j) ]
124 \end{align*}
126 For $\GOR{w}$ we are either under a right stop or the left daughter of
127 a right attachment rule ($i \leq loc_l(w) < j \leq loc_l(a) < k$), adjacent iff
128 the end point ($j$) equals $loc_r(w)$:
129 \begin{align*}
130 P_{OUTSIDE}(\GOR{w}, i, j) = & P_{STOP}(stop|w, right, adj(j,
131 w))P_{OUTSIDE}(\RGOL{w}, i, j) ~ + \\
132 & [ \sum_{k > j} ~ \sum_{a:j\leq loc_l(a)<k} P_{STOP}(\neg stop|w, right, adj(j, w)) P_{ATTACH}(a|w, right) \\
133 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\GOR{w}, i, k) P_{INSIDE}(\SEAL{a}, j, k) ]
134 \end{align*}
136 $\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
137 of having a stop above, where we use $\LGOR{w}$:
138 \begin{align*}
139 P_{OUTSIDE}(\GOL{w}, i, j) = & P_{STOP}(stop|w, left, adj(i,
140 w))P_{OUTSIDE}(\LGOR{w}, i, j) ~ + \\
141 & [ \sum_{k < i} ~ \sum_{a:k\leq loc_l(a)<i} P_{STOP}(\neg stop|w, left, adj(i, w)) P_{ATTACH}(a|w, left) \\
142 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{a}, k, i) P_{OUTSIDE}(\GOL{w}, k, j) ]
143 \end{align*}
145 $\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
146 of having a stop above, where we use $\SEAL{w}$:
147 \begin{align*}
148 P_{OUTSIDE}(\LGOR{w}, i, j) = & P_{STOP}(stop|w, right, adj(j,
149 w))P_{OUTSIDE}(\SEAL{w}, i, j) ~ + \\
150 & [ \sum_{k > j} ~ \sum_{a:j\leq loc_l(a)<k} P_{STOP}(\neg stop|w, right, adj(j, w)) P_{ATTACH}(a|w, right) \\
151 & ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\LGOR{w}, i, k) P_{INSIDE}(\SEAL{a}, j, k) ]
152 \end{align*}
155 \section{Reestimating the rules} % todo
157 \subsection{$c$ and $w$ (helper formulas used below)}
158 % todo: left out rule-probability! wops (also, make P_{fooside}
159 % sentence-specific)
161 $c_s(x : i, j)$ is ``the expected fraction of parses of $s$ with a
162 node labeled $x$ extending from position $i$ to position $j$''
163 \citep[p.~88]{klein-thesis}, here defined to equal $v_{q}$ of
164 \citet[p.~41]{lari-csl90}:
165 \begin{align*}
166 c_s(x : i, j) = P_{INSIDE_s}(x, i, j) P_{OUTSIDE_s}(x, i, j) / P_s
167 \end{align*}
169 $w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $x$
170 \footnote{$x$ here includes a ``level of seals'', to look up the
171 $P_{STOP}$ and $P_{CHOOSE}$ formulas we of course only need the POS
172 of $x$.} and $dir$:
173 \begin{align*}
174 w_s(\SEAL{a} & : x, left, i, j) = \\
175 & 1/P_s \sum_{k:i<k<j} ~ \sum_{a:i\leq loc_l(a)<k}
176 & P_{STOP}(\neg stop|x, left, adj(k, x)) P_{CHOOSE}(a|x, left) \\
177 & & P_{INSIDE_s}(\SEAL{a}, i, k) P_{INSIDE_s}(x, k, j) P_{OUTSIDE_s}(x, i, j)
178 \end{align*}
179 \begin{align*}
180 w_s(\SEAL{a} & : x, right, i, j) = \\
181 & 1/P_s \sum_{k:i<k<j} ~ \sum_{a:k\leq loc_l(a)<j}
182 & P_{STOP}(\neg stop|x, right, adj(k, x)) P_{CHOOSE}(a|x, right) \\
183 & & P_{INSIDE_s}(x, i, k) P_{INSIDE_s}(\SEAL{a}, k, j) P_{OUTSIDE_s}(x, i, j)
184 \end{align*}
186 Let $\hat{P}$ be the new STOP/CHOOSE-probabilities (since the old $P$
187 are used in $P_{INSIDE}$ and $P_{OUTSIDE}$).
189 \subsection{Attachment reestimation}
191 $\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(h)$
192 since we want trees with at least one attachment:
193 \begin{align*}
194 \hat{a} (a | x, left) = \frac
195 { \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j\geq loc_r(h)} w_s(\SEAL{a} : x, left, i, j) }
196 { \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j\geq loc_r(h)} c_s(x : i, j) }
197 \end{align*}
199 Here $j>loc_r(h)$ since we want at least one attachment:
200 \begin{align*}
201 \hat{a} (a | x, right) = \frac
202 { \sum_{s \in S} \sum_{h \in s} \sum_{i\leq loc_l(h)} \sum_{j>loc_r(h)} w_s(\SEAL{a} : x, right, i, j) }
203 { \sum_{s \in S} \sum_{h \in s} \sum_{i\leq loc_l(h)} \sum_{j>loc_r(h)} c_s(x : i, j) }
204 \end{align*}
206 For the first/lowest attachments, $w$ and $c$ have zero probability
207 where $i<loc_l(h)$ (for $\GOR{h}$) or $j>loc_r(h)$ (for $\GOL{h}$),
208 this is implicit in $P_{INSIDE}$.
212 \begin{align*}
213 \hat{P}_{CHOOSE} (a | h, left) = \frac
214 { \hat{a} (a | \GOL{h}, left) }
215 { \sum_{w} \hat{a} (w | \GOL{h}, left) }
217 \frac
218 { \hat{a} (a | \RGOL{h}, left) }
219 { \sum_{w} \hat{a} (w | \RGOL{h}, left) }
220 \end{align*}
221 \begin{align*}
222 \hat{P}_{CHOOSE} (a | h, right) = \frac
223 { \hat{a} (a | \GOR{h},right) }
224 { \sum_{w} \hat{a} (w | \GOR{h},right) }
226 \frac
227 { \hat{a} (a | \LGOR{h},right) }
228 { \sum_{w} \hat{a} (w | \LGOR{h},right) }
229 \end{align*}
231 \subsection{Stop reestimation}
232 The following is based on \citet[p.~88]{klein-thesis}. For the
233 non-adjacent rules, $i<loc_l(h)$ on the left and $j>loc_r(h)$ on the
234 right, while for the adjacent rules these are equal (respectively).
236 To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
237 \begin{align*}
238 \hat{d}(x,left,non\text{-}adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j\geq loc_r(h)} c_s(x : i, j) }
239 \end{align*}
240 \begin{align*}
241 \hat{d}(x,left, adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{j\geq loc_r(h)} c_s(x : loc_l(h), j) }
242 \end{align*}
243 \begin{align*}
244 \hat{d}(x,right,non\text{-}adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i\leq loc_l(h)} \sum_{j> loc_r(h)} c_s(x : i, j) }
245 \end{align*}
246 \begin{align*}
247 \hat{d}(x,right, adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i\leq loc_l(h)} c_s(x : i, loc_r(h)) }
248 \end{align*}
250 \begin{align*}
251 \hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
252 \frac
253 { \hat{d}(\SEAL{h},left,non\text{-}adj) }
254 { \hat{d}(\RGOL{h},left,non\text{-}adj) }
256 \frac
257 { \hat{d}(\LGOR{h},left,non\text{-}adj) }
258 { \hat{d}(\GOL{h},left,non\text{-}adj) }
259 \end{align*}
261 \begin{align*}
262 \hat{P}_{STOP} (STOP|h, left, adj) =
263 \frac
264 { \hat{d}(\SEAL{h},left,adj) }
265 { \hat{d}(\RGOL{h},left,adj) }
267 \frac
268 { \hat{d}(\LGOR{h},left,adj) }
269 { \hat{d}(\GOL{h},left,adj) }
270 \end{align*}
272 \begin{align*}
273 \hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
274 \frac
275 { \hat{d}(\RGOL{h},right,non\text{-}adj) }
276 { \hat{d}(\GOR{h},right,non\text{-}adj) }
278 \frac
279 { \hat{d}(\SEAL{h},right,non\text{-}adj) }
280 { \hat{d}(\LGOR{h},right,non\text{-}adj) }
281 \end{align*}
283 \begin{align*}
284 \hat{P}_{STOP} (STOP|h, right, adj) =
285 \frac
286 { \hat{d}(\RGOL{h},right,adj) }
287 { \hat{d}(\GOR{h},right,adj) }
289 \frac
290 { \hat{d}(\SEAL{h},right,adj) }
291 { \hat{d}(\LGOR{h},right,adj) }
292 \end{align*}
295 \subsection{Root reestimation} % todo grok: no \sum_{i} \sum_{j} ?
296 \begin{align*}
297 \hat{P}_{ROOT} (h) = \frac
298 {\sum_{s\in S} P_{ROOT}(h) P_{INSIDE_s}(\SEAL{h}, 0, len(s)) }
299 {\sum_{s\in S} P_s}
300 \end{align*}
305 \section{Alternate CNF rules}
306 Since the IO algorithm as described in \citet{lari-csl90} is made for
307 pure CNF style rules, we have an alternate grammar (figure \ref{cnf})
308 for running parallel tests where we don't have to sum over the
309 different $loc(h)$ in IO. This is
310 not yet generalized to include left-first attachment.
312 \begin{figure}[htp]
313 \centering
314 \begin{tabular} % four left-aligned math tabs, one vertical line
315 { >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
316 \multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
317 \hline{}
319 \RN{\GOR{h}} \rightarrow& \GOR{h} &\SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
320 &&&\\
321 \RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} &\SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
322 &&&\\
323 \RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
324 &&&\\
325 \RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
326 &&&\\
327 \LN{\RGOL{h}} \rightarrow& \SEAL{a} &\RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
328 &&&\\
329 \LN{\RGOL{h}} \rightarrow& \SEAL{a} &\LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
330 &&&\\
331 \SEAL{h} \rightarrow& STOP &\RGOL{h} &P_{STOP}(stop|h, left, adj) \\
332 &&&\\
333 \SEAL{h} \rightarrow& STOP &\LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
334 \end{tabular}
335 \caption{Alternate CFG rules (where a child node has an arrow below,
336 we use non-adjacent probabilities), defined for all words/POS-tags
337 $h$.}\label{cnf}
338 \end{figure}
340 The inside probabilities are the same as those given in
341 \citet{lari-csl90}, with the following exceptions:
343 When calculating $P_{INSIDE}(x, i, j)$ and summing over possible
344 rules which rewrite $x$, if a rule is of the form $x \rightarrow
345 STOP ~ x'$ or $x \rightarrow x' ~ STOP$, we add $P_{RULE}\cdot
346 P_{INSIDE}(x', i, j)$ (that is, rewrite for the same sentence range);
347 and, as a consequence of these unary rules: for ``terminal rules''
348 ($P_{ORDER}$) to be applicable, not only must $i = j-1$, but also the
349 left-hand side symbol of the rule must be of the form $\GOR{h}$.
351 Similarly, the outside probabilities are the same as those for pure
352 CNF rules, with the exception that we add the unary rewrite
353 probabilities
354 \begin{align*}
355 \sum_{x'} [&P_{OUTSIDE}(x,i,j)\cdot P_{RULE}(x' \rightarrow x ~ STOP) \\
356 + &P_{OUTSIDE}(x,i,j)\cdot P_{RULE}(x' \rightarrow STOP ~ x)]
357 \end{align*}
358 to $P_{OUTSIDE}(x,i,j)$ (e.g.\ $f(s,t,i)$).
360 The reestimation just needs to be expanded to allow unary stop rules,
361 this is similar to the formula for binary rules in
362 \citet[p.~41]{lari-csl90}:
363 \begin{align*}
364 \hat{P}_{RULE}(x' \rightarrow x) = \frac
365 { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(x'\rightarrow x)P_{INSIDE}(x, i, j)P_{OUTSIDE}(x', i, j) }
366 { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} c_s(x', i, j) }
367 \end{align*}
368 (the denominator is unchanged, but the numerator sums $w_s$ defined
369 for unary rules; $x'\rightarrow x$ is meant to include both left and
370 right stop rules).
372 % todo: add ROOT rule to CNF grammar?
375 \bibliography{./statistical.bib}
376 \bibliographystyle{plainnat}
378 \end{document}