% Created 2008-06-13 Fri 17:05

% todo: fix stop and attachment formulas so they divide before summing
\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{natbib}

\usepackage{pslatex}
\usepackage{pdfsync}
\pdfoutput=1

\usepackage{qtree}
\usepackage{amsmath}
\usepackage{amssymb}

\usepackage{avm}
\avmfont{\sc}
\avmoptions{sorted,active}
\avmvalfont{\rm}
\avmsortfont{\scriptsize\it}

\usepackage{array} % for a tabular with math in it

\title{DMV formulas}
\author{Kevin Brubeck Unhammer}
\date{21 August 2008}
\begin{document}

\maketitle

This is an attempt at fleshing out the details of the inside-outside
algorithm \citep{lari-csl90} applied to the DMV model of
\citet{klein-thesis}.

\tableofcontents
\newcommand{\LOC}[1]{\textbf{#1}}
\newcommand{\GOR}[1]{\overrightarrow{#1}}
\newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
\newcommand{\SEAL}[1]{\overline{#1}}
\newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
\newcommand{\GOL}[1]{\overleftarrow{#1}}
\newcommand{\LN}[1]{\underleftarrow{#1}}
\newcommand{\RN}[1]{\underrightarrow{#1}}
\newcommand{\XI}{\lessdot}
\newcommand{\XJ}{\gtrdot}
\newcommand{\SMTR}[1]{\dddot{#1}}
\newcommand{\SDTR}[1]{\ddot{#1}}
\section{Note on notation}
$i, j, k$ are sentence positions (between words), where $i$ and $j$
are always the start and end, respectively, for what we're calculating
($k$ is between $i$ and $j$ for $P_{INSIDE}$, to their right or left
for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $\LOC{w}$
is a word token (actually POS-token) of type $w$ at a certain sentence
location. If $\LOC{w}$ is between $i$ and $i+1$, $loc(\LOC{w})=i$
following \citet{klein-thesis}, meaning $i$ is adjacent to $\LOC{w}$
on the left, while $j=loc(\LOC{w})+1$ means that $j$ is adjacent to
$\LOC{w}$ on the right. To simplify, $loc_l(\LOC{w}):=loc(\LOC{w})$ and
$loc_r(\LOC{w}):=loc(\LOC{w})+1$. We write $\LOC{h}$ if this is a head
in the rule being used, $\LOC{a}$ if it is an attached argument.

Notational differences between the thesis \citep{klein-thesis} and the
paper \citep{km-dmv}:

\begin{tabular}{cc}
Paper: & Thesis: \\
$w$ & $\GOR{w}$ \\
$w\urcorner$ & $\RGOL{w}$ \\
$\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
\end{tabular}

I use $\SMTR{w}$ (or $\SDTR{w}$) to signify one of either $w, \GOR{w},
\RGOL{w}, \LGOR{w}, \GOL{w}$ or $\SEAL{w}$\footnote{This means that
$\SMTR{\LOC{w}}$ is the triplet of the actual POS-tag, its sentence
location as a token, and the ``level of seals''.}.
\section{Inside probabilities}
$P_{INSIDE}$ is defined in \citet[pp.~106-108]{klein-thesis}; the only
thing we need to add is that for right attachments,
$i \leq loc_l(\LOC{w})<k \leq loc_l(\LOC{a})<j$, while for left attachments,
$i \leq loc_l(\LOC{a})<k \leq loc_l(\LOC{w})<j$.

(For now, let
\[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \] since the DMV implementation
is not yet generalized to both directions.)
\subsection{Sentence probability}

$P_s$ is the sentence probability, based on
\citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
rest, we sum them explicitly in this definition:
\begin{align*}
P_s = \sum_{\LOC{w} \in s} P_{ROOT}(\LOC{w}) P_{INSIDE}(\SEAL{\LOC{w}}, 0, len(s))
\end{align*}
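
For concreteness, here is a minimal Python sketch of this sum. It is
not the actual implementation: the \texttt{inside} and
\texttt{p\_root} functions and the \texttt{('SEAL', w, loc)} node
encoding are hypothetical stand-ins for whatever interface the code
uses.

\begin{verbatim}
# Hypothetical interface: inside(node, i, j) returns P_INSIDE for that
# node over the span i..j; p_root(w) returns P_ROOT(w); s is a list of
# POS-tags, with positions 0..len(s) between the words.
def sentence_probability(s, inside, p_root):
    # P_s = sum over tokens w of P_ROOT(w) * P_INSIDE(seal(w), 0, len(s))
    return sum(p_root(w) * inside(('SEAL', w, loc), 0, len(s))
               for loc, w in enumerate(s))
\end{verbatim}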
\section{Outside probabilities}

\begin{align*}
P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
0.0 & \text{ otherwise}
\end{cases}
\end{align*}
For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ is attached under something
else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$). Adjacency is
thus calculated on the basis of $h$, the head of the rule. If $\LOC{w}$ is
attached from the left we have $i \leq loc_l(\LOC{w}) < j \leq loc_l(\LOC{h}) < k$, while
from the right we have $k \leq loc_l(\LOC{h}) < i \leq loc_l(\LOC{w}) < j$:
\begin{align*}
P_{OUTSIDE}&(\SEAL{\LOC{w}}, i, j) = \\
& P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
& [ \sum_{k > j} ~ \sum_{\LOC{h}:j\leq loc_l(\LOC{h})<k} \sum_{\SMTR{\LOC{h}} \in \{\RGOL{\LOC{h}},\GOL{\LOC{h}}\}} P_{STOP}(\neg stop|h, left, adj(j, \LOC{h})) P_{ATTACH}(w|h, left) \\
& \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(\SMTR{\LOC{h}}, i, k) P_{INSIDE}(\SMTR{\LOC{h}}, j, k) ] ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{h}:k\leq loc_l(\LOC{h})<i} \sum_{\SMTR{\LOC{h}} \in \{\LGOR{\LOC{h}},\GOR{\LOC{h}}\}} P_{STOP}(\neg stop|h, right, adj(i, \LOC{h})) P_{ATTACH}(w|h, right) \\
& \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(\SMTR{\LOC{h}}, k, i) P_{OUTSIDE}(\SMTR{\LOC{h}}, k, j) ]
\end{align*}
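
Under the same hypothetical conventions as the sketch above, now also
assuming model functions \texttt{p\_stop(is\_stop, h, dir, adj)} and
\texttt{p\_attach(a, h, dir)} plus a memoized \texttt{outside} chart
function, the recursion looks roughly like this:

\begin{verbatim}
def outside_seal(w, i, j, s, inside, outside, p_root, p_stop, p_attach):
    """P_OUTSIDE(seal(w), i, j): the root term, plus being attached as
    a left or right argument of some head h outside the span i..j."""
    total = p_root(w) * outside('ROOT', i, j)
    # Left attachment: the head h lies to the right, spanning j..k.
    for k in range(j + 1, len(s) + 1):
        for loc_h in range(j, k):          # j <= loc_l(h) < k
            h = s[loc_h]
            adj = (j == loc_h)             # assumed adj(j, h)
            for shape in ('RGOL', 'GOL'):
                node = (shape, h, loc_h)
                total += (p_stop(False, h, 'left', adj)
                          * p_attach(w, h, 'left')
                          * outside(node, i, k) * inside(node, j, k))
    # Right attachment: the head h lies to the left, spanning k..i.
    for k in range(0, i):
        for loc_h in range(k, i):          # k <= loc_l(h) < i
            h = s[loc_h]
            adj = (i == loc_h + 1)         # assumed adj(i, h)
            for shape in ('LGOR', 'GOR'):
                node = (shape, h, loc_h)
                total += (p_stop(False, h, 'right', adj)
                          * p_attach(w, h, 'right')
                          * inside(node, k, i) * outside(node, k, j))
    return total
\end{verbatim}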
For $\RGOL{w}$ we know it is either under a left stop rule or it is
the right daughter of a left attachment rule ($k \leq loc_l(\LOC{a}) <
i \leq loc_l(\LOC{w}) < j$), and these are adjacent if the start point
($i$) equals $loc_l(\LOC{w})$:
\begin{align*}
P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
\LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\RGOL{\LOC{w}}, k, j) ]
\end{align*}
For $\GOR{w}$ we are either under a right stop or the left daughter of
a right attachment rule ($i \leq loc_l(\LOC{w}) < j \leq
loc_l(\LOC{a}) < k$), adjacent iff the end point ($j$) equals
$loc_r(\LOC{w})$:
\begin{align*}
P_{OUTSIDE}(\GOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
\LOC{w}))P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\GOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
\end{align*}
$\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
of having a stop above, where we use $\LGOR{w}$:
\begin{align*}
P_{OUTSIDE}(\GOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
\LOC{w}))P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\GOL{\LOC{w}}, k, j) ]
\end{align*}

$\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
of having a stop above, where we use $\SEAL{w}$:
\begin{align*}
P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
\LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\LGOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
\end{align*}
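
A sketch of the $\RGOL{w}$ case, with the same hypothetical helpers as
above; the $\GOR{w}$, $\GOL{w}$ and $\LGOR{w}$ cases mirror it, with
the direction and the stop parent swapped exactly as in the formulas:

\begin{verbatim}
def outside_rgol(w, loc_w, i, j, s, inside, outside, p_stop, p_attach):
    """P_OUTSIDE(rgol(w), i, j): either stopped on the left (sealed
    above), or the right daughter of a further left attachment."""
    adj = (i == loc_w)  # adjacent iff nothing is yet attached on the left
    total = p_stop(True, w, 'left', adj) * outside(('SEAL', w, loc_w), i, j)
    for k in range(0, i):
        for loc_a in range(k, i):          # k <= loc_l(a) < i
            a = s[loc_a]
            total += (p_stop(False, w, 'left', adj)
                      * p_attach(a, w, 'left')
                      * inside(('SEAL', a, loc_a), k, i)
                      * outside(('RGOL', w, loc_w), k, j))
    return total
\end{verbatim}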
\section{Reestimating the rules} % todo

\subsection{$c$ and $w$ (helper formulas used below)}
% todo: left out rule-probability! whoops (also, make P_{fooside}
% sentence-specific)

$c_s(\SMTR{\LOC{w}} : i, j)$ is ``the expected fraction of parses of
$s$ with a node labeled $\SMTR{w}$ extending from position $i$ to
position $j$'' \citep[p.~88]{klein-thesis}, here defined to equal
$v_{q}$ of \citet[p.~41]{lari-csl90}\footnote{In terms of regular EM,
this is the count of trees ($f_{T_q}(x)$ in
\citet[p.~46]{prescher-em}) in which the node extended from $i$ to
$j$.}:
\begin{align*}
c_s(\SMTR{\LOC{w}} : i, j) = P_{INSIDE_s}(\SMTR{\LOC{w}}, i, j) P_{OUTSIDE_s}(\SMTR{\LOC{w}}, i, j) / P_s
\end{align*}
$w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $\SMTR{h}$ and $dir$:
\begin{align*}
w_s(\SEAL{a} & : \SMTR{\LOC{h}}, left, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:i\leq loc_l(\LOC{a})<k}
& P_{STOP}(\neg stop|h, left, adj(k, \LOC{h})) P_{CHOOSE}(a|h, left) \\
& & P_{INSIDE_s}(\SEAL{\LOC{a}}, i, k) P_{INSIDE_s}(\SMTR{\LOC{h}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
\end{align*}
\begin{align*}
w_s(\SEAL{a} & : \SMTR{\LOC{h}}, right, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<j}
& P_{STOP}(\neg stop|h, right, adj(k, \LOC{h})) P_{CHOOSE}(a|h, right) \\
& & P_{INSIDE_s}(\SMTR{\LOC{h}}, i, k) P_{INSIDE_s}(\SEAL{\LOC{a}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
\end{align*}
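
In the same sketch style (hypothetical per-sentence chart functions
\texttt{inside\_s}/\texttt{outside\_s}, old-parameter model functions
\texttt{p\_stop}/\texttt{p\_choose}), $c_s$ and the left-direction
$w_s$ come out as:

\begin{verbatim}
def c(node, i, j, inside_s, outside_s, P_s):
    """Expected fraction of parses of s with `node` spanning i..j."""
    return inside_s(node, i, j) * outside_s(node, i, j) / P_s

def w_left(a, h_node, i, j, s, inside_s, outside_s, P_s, p_stop, p_choose):
    """w_s(seal(a) : h_node, left, i, j); the right case mirrors this."""
    shape, h, loc_h = h_node
    total = 0.0
    for k in range(i + 1, j):              # i < k < j
        adj = (k == loc_h)                 # assumed adj(k, h)
        for loc_a in range(i, k):          # i <= loc_l(a) < k
            if s[loc_a] != a:              # only tokens of type a
                continue
            total += (p_stop(False, h, 'left', adj) * p_choose(a, h, 'left')
                      * inside_s(('SEAL', a, loc_a), i, k)
                      * inside_s(h_node, k, j)
                      * outside_s(h_node, i, j))
    return total / P_s
\end{verbatim}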
Let $\hat{P}$ be the new STOP/CHOOSE-probabilities (since the old $P$
are used in $P_{INSIDE}$ and $P_{OUTSIDE}$).
\subsection{Attachment reestimation}

$\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(\LOC{h})$
since we want trees with at least one attachment:
\begin{align*}
\hat{a} (a | \SMTR{h}, left) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, left, i, j) }
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
\end{align*}

Here $j>loc_r(\LOC{h})$ since we want at least one attachment:
\begin{align*}
\hat{a} (a | \SMTR{h}, right) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, right, i, j) }
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
\end{align*}
For the first/lowest attachments, $w_s$ and $c_s$ have zero probability
where $i<loc_l(\LOC{h})$ (for $\GOR{h}$) or $j>loc_r(\LOC{h})$ (for $\GOL{h}$);
this is implicit in $P_{INSIDE}$.

\begin{align*}
\hat{P}_{CHOOSE} (a | h, left) =
\hat{a} (a | \GOL{h}, left)
+ \hat{a} (a | \RGOL{h}, left)
\end{align*}
\begin{align*}
\hat{P}_{CHOOSE} (a | h, right) =
\hat{a} (a | \GOR{h},right)
+ \hat{a} (a | \LGOR{h},right)
\end{align*}
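
Continuing the sketch, with \texttt{charts} an assumed list of
per-sentence \texttt{(inside\_s, outside\_s, P\_s)} triples computed
under the old parameters, and \texttt{c}/\texttt{w\_left} as above:

\begin{verbatim}
def reestimate_attach_left(a, h, shape, corpus, charts, p_stop, p_choose):
    """hat-a(a | shape(h), left), the ratio defined above."""
    num = den = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        for loc_h, tag in enumerate(s):
            if tag != h:
                continue
            node = (shape, h, loc_h)
            for i in range(loc_h):                      # i < loc_l(h)
                for j in range(loc_h + 1, len(s) + 1):  # j >= loc_r(h)
                    num += w_left(a, node, i, j, s, inside_s, outside_s,
                                  P_s, p_stop, p_choose)
                    den += c(node, i, j, inside_s, outside_s, P_s)
    return num / den if den else 0.0

# hat-P_CHOOSE(a | h, left) then sums the two left-taking head shapes:
#   reestimate_attach_left(a, h, 'GOL', corpus, charts, p_stop, p_choose)
# + reestimate_attach_left(a, h, 'RGOL', corpus, charts, p_stop, p_choose)
\end{verbatim}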
\subsection{Stop reestimation}
The following is based on \citet[p.~88]{klein-thesis}. For the
non-adjacent rules, $i<loc_l(\LOC{h})$ on the left and $j>loc_r(\LOC{h})$ on the
right, while for the adjacent rules these are equal (respectively).

To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
\begin{align*}
\hat{d}(\SMTR{h},\SDTR{h},\XI,\XJ) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
{ \sum_{s \in S} \sum_{\SDTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SDTR{\LOC{h}} : i, j) }
\end{align*}
Then these are our reestimated stop probabilities:
\begin{align*}
\hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
\hat{d}(\SEAL{h}, \RGOL{h},<,\geq) +
\hat{d}(\LGOR{h}, \GOL{h},<,=)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, left, adj) =
\hat{d}(\SEAL{h}, \RGOL{h},=,\geq) +
\hat{d}(\LGOR{h}, \GOL{h},=,=)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
\hat{d}(\RGOL{h}, \GOR{h},=,>) +
\hat{d}(\SEAL{h}, \LGOR{h},\leq,>)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, right, adj) =
\hat{d}(\RGOL{h}, \GOR{h},=,=) +
\hat{d}(\SEAL{h}, \LGOR{h},\leq,=)
\end{align*}
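
A sketch of $\hat{d}$, with the $\XI$/$\XJ$ comparison arguments
passed as Python \texttt{operator} functions and \texttt{c} and
\texttt{charts} as before:

\begin{verbatim}
import operator

def d_hat(num_shape, den_shape, cmp_i, cmp_j, h, corpus, charts):
    """hat-d: expected counts of num_shape(h) spans over those of
    den_shape(h) spans, restricted by cmp_i(i, loc_l) and cmp_j(j, loc_r)."""
    num = den = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        for loc_h, tag in enumerate(s):
            if tag != h:
                continue
            for i in range(len(s)):
                for j in range(i + 1, len(s) + 1):
                    if cmp_i(i, loc_h) and cmp_j(j, loc_h + 1):
                        num += c((num_shape, h, loc_h), i, j,
                                 inside_s, outside_s, P_s)
                        den += c((den_shape, h, loc_h), i, j,
                                 inside_s, outside_s, P_s)
    return num / den if den else 0.0

# e.g. hat-P_STOP(STOP | h, left, adj):
#   d_hat('SEAL', 'RGOL', operator.eq, operator.ge, h, corpus, charts)
# + d_hat('LGOR', 'GOL',  operator.eq, operator.eq, h, corpus, charts)
\end{verbatim}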
\subsection{Root reestimation}
Following \citet[p.~46]{prescher-em}, to find the reestimated
probability of a PCFG rule, we first find the new treebank frequencies
$f_{T_P}(tree)=P(tree)/P_s$, then for a rule $X' \rightarrow X$ we
divide the new frequencies of the trees which use this rule by
those of the trees containing the node $X'$. $ROOT$ appears once per
tree, meaning we divide by $1$ per sentence\footnote{Assuming each
tree has frequency $1$.}, so $\hat{P}_{ROOT}(h)=\sum_{tree:ROOT
\rightarrow \SEAL{h} \text{ used in } tree} f_{T_P}(tree)=\sum_{tree:ROOT
\rightarrow \SEAL{h} \text{ used in } tree} P(tree)/P_s$, which turns into:

\begin{align*}
\hat{P}_{ROOT} (h) = \frac
{\sum_{s\in S} 1 / P_s \cdot \sum_{\LOC{h}\in s} P_{ROOT}(\LOC{h}) P_{INSIDE_s}(\SEAL{h}, 0, len(s))}
{\sum_{s\in S} 1}
\end{align*}
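
As a sketch, with the same assumed \texttt{charts} and old
\texttt{p\_root}:

\begin{verbatim}
def reestimate_root(h, corpus, charts, p_root):
    """hat-P_ROOT(h): expected relative frequency of ROOT -> seal(h)."""
    num = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        num += sum(p_root(h) * inside_s(('SEAL', h, loc), 0, len(s))
                   for loc, tag in enumerate(s) if tag == h) / P_s
    return num / len(corpus)  # denominator: ROOT once per sentence
\end{verbatim}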
% todo: the following just didn't turn out right, but it ought to be
% possible to use this to check the CNF-like unary attachments...

% The root attachment is just a unary PCFG rule $ROOT \rightarrow
% \SEAL{h}$. Modifying binary rule reestimation from
% \citet[p.~41]{lari-csl90} to unary rules gives:

% \begin{align*}
% \hat{P}_{RULE}(ROOT \rightarrow \SEAL{h}) = \frac
% { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(ROOT \rightarrow \SEAL{h})P_{INSIDE_s}(\SEAL{h}, i, j)P_{OUTSIDE_s}(ROOT, i, j) }
% { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} c_s(\SEAL{h}, i, j) }
% \end{align*}

% Since $P_{OUTSIDE}(ROOT, i, j)$ is $1$ if $i=0$ and $j=len(s)$, and $0$ otherwise, this reduces into:

% \begin{align*}
% \hat{P}_{RULE}(ROOT \rightarrow \SEAL{h}) = \frac
% { \sum_{s\in S} 1/P_s \cdot P_{RULE}(ROOT \rightarrow \SEAL{h})P_{INSIDE_s}(\SEAL{h}, 0, len(s)) }
% { \sum_{s\in S} 1/P_s \cdot P_{INSIDE_s}(\SEAL{h}, 0, len(s)) }
% \end{align*}
\section{Alternate CNF-like rules}
Since the IO algorithm as described in \citet{lari-csl90} is made for
rules in CNF, we have an alternate grammar (figure \ref{cnf-like}) for
running parallel tests where we don't have to sum over the different
$loc(h)$ in IO. This is not yet generalized to include left-first
attachment. It is also not quite CNF, since it includes some unary
rewrite rules.
\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\hline{}
\RN{\GOR{h}} \rightarrow& \GOR{h} &\SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
&&&\\
\RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} &\SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
&&&\\
\RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
&&&\\
\RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} &\RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
&&&\\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} &\LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
&&&\\
\SEAL{h} \rightarrow& STOP &\RGOL{h} &P_{STOP}(stop|h, left, adj) \\
&&&\\
\SEAL{h} \rightarrow& STOP &\LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
\end{tabular}
\caption{Alternate CFG rules (where a child node has an arrow below,
we use non-adjacent probabilities), defined for all words/POS-tags
$h$.}\label{cnf-like}
\end{figure}
The inside probabilities are the same as those given in
\citet{lari-csl90}, with the following exceptions:

When calculating $P_{INSIDE}(\SMTR{h}, i, j)$ and summing through possible
rules which rewrite $\SMTR{h}$, if a rule is of the form $\SMTR{h} \rightarrow
STOP ~ \SDTR{h}$ or $\SMTR{h} \rightarrow \SDTR{h} ~ STOP$, we add $P_{RULE}\cdot
P_{INSIDE}(\SDTR{h}, i, j)$ (that is, rewrite for the same sentence range);
and, as a consequence of these unary rules: for ``terminal rules''
($P_{ORDER}$) to be applicable, not only must $i = j-1$, but also the
left-hand side symbol of the rule must be of the form $\GOR{h}$.
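
A sketch of this modified inside computation; the \texttt{grammar}
object with \texttt{unary\_rules}, \texttt{binary\_rules} and
\texttt{p\_order} methods is hypothetical, and nodes here are just
\texttt{(shape, tag)} pairs since this grammar does not track $loc(h)$:

\begin{verbatim}
def inside_cnf_like(node, i, j, s, grammar, inside):
    shape, h = node
    total = 0.0
    # Terminal rules apply only to single words AND only to GOR nodes:
    if j == i + 1 and shape == 'GOR' and s[i] == h:
        total += grammar.p_order(h)
    # Unary stop rules rewrite for the same sentence range i..j:
    for prob, child in grammar.unary_rules(node):
        total += prob * inside(child, i, j)
    # Binary rules split the range as usual:
    for prob, left, right in grammar.binary_rules(node):
        for k in range(i + 1, j):
            total += prob * inside(left, i, k) * inside(right, k, j)
    return total
\end{verbatim}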
Similarly, the outside probabilities are the same as those for pure
CNF rules, with the exception that we add the unary rewrite
probabilities
\begin{align*}
\sum_{\SMTR{h}} [&P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow \SDTR{h} ~ STOP) \\
+ &P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow STOP ~ \SDTR{h})]
\end{align*}
to $P_{OUTSIDE}(\SDTR{h},i,j)$ (e.g.\ $f(s,t,i)$ in \citet{lari-csl90}).
The reestimation just needs to be expanded to allow unary stop rules;
this is similar to the formula for binary rules in
\citet[p.~41]{lari-csl90}:
\begin{align*}
\hat{P}_{RULE}(\SMTR{h} \rightarrow \SDTR{h}) = \frac
{ \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(\SMTR{h}\rightarrow \SDTR{h})P_{INSIDE}(\SDTR{h}, i, j)P_{OUTSIDE}(\SMTR{h}, i, j) }
{ \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} c_s(\SMTR{h}, i, j) }
\end{align*}
(the denominator is unchanged, but the numerator sums $w_s$ defined
for unary rules; $\SMTR{h}\rightarrow \SDTR{h}$ is meant to include both left and
right stop rules).
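
Sketched with the same conventions (nodes as \texttt{(shape, tag)}
pairs, \texttt{p\_rule} the old rule probability, \texttt{charts} as
before):

\begin{verbatim}
def reestimate_unary(parent, child, corpus, charts, p_rule):
    """hat-P_RULE(parent -> child) for the unary stop rules."""
    num = den = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        for i in range(len(s)):
            for j in range(i + 1, len(s) + 1):
                num += (p_rule(parent, child) * inside_s(child, i, j)
                        * outside_s(parent, i, j)) / P_s
                den += (inside_s(parent, i, j)
                        * outside_s(parent, i, j)) / P_s  # = c_s(parent)
    return num / den if den else 0.0
\end{verbatim}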
% todo: add ROOT rule to CNF grammar?

\section{Pure CNF rules}
Alternatively, one may use the ``pure CNF'' grammar of figure
\ref{cnf-T} and figure \ref{cnf-NT}, below (not yet
implemented). Using the harmonic distribution to initialize this will
still yield a mass-deficient grammar, but at least this allows the
regular IO algorithm to be run.
\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($b[i,m]$ in \citet{lari-csl90})}\\
\hline{}
h \rightarrow& \text{``h''}& &1 \\
&&&\\
\SEAL{h} \rightarrow& \text{``h''}& &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, adj) \\
&&&\\
ROOT \rightarrow& \text{``h''}& &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, adj) \cdot P_{ROOT}(h) \\
\end{tabular}
\caption{Terminals of pure CNF rules, defined for all POS-tags
$h$.}\label{cnf-T}
\end{figure}
\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\hline{}
\SEAL{h} \rightarrow& h &\SEAL{a} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\[3.5pt]
\SEAL{h} \rightarrow& h &\GOR{.h} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \\[3.5pt]
\GOR{.h} \rightarrow& \SEAL{a} &\SEAL{b} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\
&&&\cdot P_{ATTACH}(b|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\[3.5pt]
\GOR{.h} \rightarrow& \SEAL{a} &\GOR{h} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\[3.5pt]
\GOR{h} \rightarrow& \SEAL{a} &\SEAL{b} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(b|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\[3.5pt]
\GOR{h} \rightarrow& \SEAL{a} &\GOR{h} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\[3.5pt]
\SEAL{h} \rightarrow& \SEAL{a} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\SEAL{h} \rightarrow& \GOL{h.} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \\[3.5pt]
\GOL{h.} \rightarrow& \SEAL{b} &\SEAL{a} &P_{ATTACH}(b|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\GOL{h.} \rightarrow& \GOL{h} &\SEAL{a} &P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\GOL{h} \rightarrow& \SEAL{a} &\SEAL{b} &P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(b|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\[3.5pt]
\GOL{h} \rightarrow& \GOL{h} &\SEAL{a} &P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\[3.5pt]
\SEAL{h} \rightarrow& \GOL{h.} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\[3.5pt]
\SEAL{h} \rightarrow& \SEAL{a} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\SEAL{\GOR{h}} \rightarrow& h &\GOR{.h} &P_{STOP}(stop|h, right, non\text{-}adj) \\[3.5pt]
\SEAL{\GOR{h}} \rightarrow& h &\SEAL{a} &P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\[3.5pt]
ROOT \rightarrow& h &\SEAL{a} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& h &\GOR{.h} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \SEAL{a} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \GOL{h.} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \GOL{h.} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \SEAL{a} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \cdot P_{ROOT}(h) \\[3.5pt]
\end{tabular}
\caption{Non-terminals of pure CNF rules, defined for all POS-tags
$h$, $a$ and $b$.}\label{cnf-NT}
\end{figure}
\section{Initialization}
\citet{klein-thesis} describes DMV initialization using a ``harmonic
distribution'' for the initial probabilities, where the probability of
one word heading another is higher if they appear closer to one
another.

There are several ways this could be implemented. We initialized
attachment probabilities with the following formula:

\begin{align*}
P_{ATTACH}(a|h,right) = \frac
{\sum_{s \in S}\sum_{\LOC{h} \in s} \sum_{\LOC{a} \in s:loc(\LOC{a})>loc(\LOC{h})} 1/(loc(\LOC{a})-loc(\LOC{h})) + C_A}
{\sum_{s \in S}\sum_{\LOC{h} \in s} \sum_{\LOC{w} \in s:loc(\LOC{w})>loc(\LOC{h})} 1/(loc(\LOC{w})-loc(\LOC{h})) + C_A}
\end{align*}
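
A sketch of this initializer, reading $C_A$ as added once to the
numerator and once to the denominator (an assumption; the formula as
written leaves its scope open):

\begin{verbatim}
from collections import defaultdict

def init_attach_right(corpus, C_A):
    """Harmonic initialization of P_ATTACH(a | h, right): pairs that
    appear closer together get more of the initial mass."""
    num = defaultdict(float)  # (h, a) -> summed inverse distances
    den = defaultdict(float)  # h -> summed inverse distances to any w
    for s in corpus:
        for loc_h, h in enumerate(s):
            for loc_a in range(loc_h + 1, len(s)):
                num[(h, s[loc_a])] += 1.0 / (loc_a - loc_h)
                den[h] += 1.0 / (loc_a - loc_h)
    return lambda a, h: (num[(h, a)] + C_A) / (den[h] + C_A)
\end{verbatim}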
The probability of stopping adjacently (left or right) was increased
whenever a word occurred at a (left or right) sentence
border\footnote{For non-adjacent stopping we checked for occurrence at
the second(-to-last) position.}:

\begin{align*}
f(stop:\LOC{h},left,adj)=\begin{cases}
C_S \text{, if } loc(\LOC{h}) = 0,\\
0 \text{, otherwise}
\end{cases}
\end{align*}

\begin{align*}
P_{STOP}(stop|h,left,adj) = \frac
{C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)}
{C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} C_S+C_N}
\end{align*}
Tweaking the initialization constants $C_A, C_M, C_S$ and $C_N$
allowed us to easily try different initial distributions.
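
Sketched for the adjacent left case; the other three cases change only
the border test and direction, per the footnote above:

\begin{verbatim}
from collections import defaultdict

def init_stop_left_adj(corpus, C_M, C_S, C_N):
    """Initial P_STOP(stop | h, left, adj), from the two formulas above."""
    f_sum = defaultdict(float)  # h -> summed f(stop: h-token, left, adj)
    n_tok = defaultdict(int)    # h -> token count of h in the corpus
    for s in corpus:
        for loc_h, h in enumerate(s):
            if loc_h == 0:      # h occurs at the left sentence border
                f_sum[h] += C_S
            n_tok[h] += 1
    return lambda h: (C_M + f_sum[h]) / (C_M + n_tok[h] * (C_S + C_N))
\end{verbatim}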
\bibliography{./statistical.bib}
\bibliographystyle{plainnat}

\end{document}