% Created 2008-06-13 Fri 17:05

% todo: fix stop and attachment formulas so they divide before summing
\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{natbib}

\usepackage{pslatex}
\usepackage{pdfsync}
\pdfoutput=1

\usepackage{qtree}
\usepackage{amsmath}
\usepackage{amssymb}

\usepackage{avm}
\avmfont{\sc}
\avmoptions{sorted,active}
\avmvalfont{\rm}
\avmsortfont{\scriptsize\it}

\usepackage{array} % for a tabular with math in it

\title{DMV formulas}
\author{Kevin Brubeck Unhammer}
\date{21 August 2008}
\begin{document}

\maketitle

This is an attempt at fleshing out the details of the inside-outside
algorithm \citep{lari-csl90} applied to the DMV model of
\citet{klein-thesis}.

\tableofcontents
\newcommand{\LOC}[1]{\textbf{#1}}
\newcommand{\GOR}[1]{\overrightarrow{#1}}
\newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
\newcommand{\SEAL}[1]{\overline{#1}}
\newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
\newcommand{\GOL}[1]{\overleftarrow{#1}}
\newcommand{\LN}[1]{\underleftarrow{#1}}
\newcommand{\RN}[1]{\underrightarrow{#1}}
\newcommand{\XI}{\lessdot}
\newcommand{\XJ}{\gtrdot}
\newcommand{\SMTR}[1]{\dddot{#1}}
\newcommand{\SDTR}[1]{\ddot{#1}}
\section{Note on notation}
$i, j, k$ are sentence positions (between words), where $i$ and $j$
are always the start and end, respectively, for what we're calculating
($k$ is between $i$ and $j$ for $P_{INSIDE}$, to their right or left
for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $\LOC{w}$
is a word token (actually POS-token) of type $w$ at a certain sentence
location. If $\LOC{w}$ is between $i$ and $i+1$, $loc(\LOC{w})=i$
following \citet{klein-thesis}, meaning $i$ is adjacent to $\LOC{w}$
on the left, while $j=loc(\LOC{w})+1$ means that $j$ is adjacent to
$\LOC{w}$ on the right. To simplify, $loc_l(\LOC{w}):=loc(\LOC{w})$ and
$loc_r(\LOC{w}):=loc(\LOC{w})+1$. We write $\LOC{h}$ if this is a head
in the rule being used, $\LOC{a}$ if it is an attached argument.

Notational differences between the thesis \citep{klein-thesis} and the
paper \citep{km-dmv}:

\begin{tabular}{cc}
Paper: & Thesis: \\
$w$ & $\GOR{w}$ \\
$w\urcorner$ & $\RGOL{w}$ \\
$\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
\end{tabular}

I use $\SMTR{w}$ (or $\SDTR{w}$) to signify one of either $w, \GOR{w},
\RGOL{w}, \LGOR{w}, \GOL{w}$ or $\SEAL{w}$\footnote{This means that
$\SMTR{\LOC{w}}$ is the triplet of the actual POS-tag, its sentence
location as a token, and the ``level of seals''.}.
\section{Inside probabilities}
$P_{INSIDE}$ is defined in \citet[pp.~106-108]{klein-thesis}; the only
thing we need to add is that for right attachments,
$i \leq loc_l(\LOC{w})<k \leq loc_l(\LOC{a})<j$, while for left attachments,
$i \leq loc_l(\LOC{a})<k \leq loc_l(\LOC{w})<j$.

(For now, let
\[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \] since the DMV implementation
is not yet generalized to both directions.)
\subsection{Sentence probability}

$P_s$ is the sentence probability, based on
\citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
rest, we sum them explicitly in this definition:
\begin{align*}
P_s = \sum_{\LOC{w} \in s} P_{ROOT}(\LOC{w}) P_{INSIDE}(\SEAL{\LOC{w}}, 0, len(s))
\end{align*}
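
For concreteness, here is a minimal Python sketch of this sum. It is
not the actual implementation: the \texttt{inside} and
\texttt{p\_root} functions and the \texttt{('SEAL', w, loc)} node
encoding are hypothetical stand-ins for whatever interface the code
uses.

\begin{verbatim}
# Hypothetical interface: inside(node, i, j) returns P_INSIDE for that
# node over the span i..j; p_root(w) returns P_ROOT(w); s is a list of
# POS-tags, with positions 0..len(s) between the words.
def sentence_probability(s, inside, p_root):
    # P_s = sum over tokens w of P_ROOT(w) * P_INSIDE(seal(w), 0, len(s))
    return sum(p_root(w) * inside(('SEAL', w, loc), 0, len(s))
               for loc, w in enumerate(s))
\end{verbatim}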
\section{Outside probabilities}

\begin{align*}
P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
0.0 & \text{ otherwise}
\end{cases}
\end{align*}
For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ is attached under something
else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$). Adjacency is
thus calculated on the basis of $h$, the head of the rule. If $\LOC{w}$ is
attached from the left we have $i \leq loc_l(\LOC{w}) < j \leq loc_l(\LOC{h}) < k$, while
from the right we have $k \leq loc_l(\LOC{h}) < i \leq loc_l(\LOC{w}) < j$:
\begin{align*}
P_{OUTSIDE}&(\SEAL{\LOC{w}}, i, j) = \\
& P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
& [ \sum_{k > j} ~ \sum_{\LOC{h}:j\leq loc_l(\LOC{h})<k} \sum_{\SMTR{\LOC{h}} \in \{\RGOL{\LOC{h}},\GOL{\LOC{h}}\}} P_{STOP}(\neg stop|h, left, adj(j, \LOC{h})) P_{ATTACH}(w|h, left) \\
& \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(\SMTR{\LOC{h}}, i, k) P_{INSIDE}(\SMTR{\LOC{h}}, j, k) ] ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{h}:k\leq loc_l(\LOC{h})<i} \sum_{\SMTR{\LOC{h}} \in \{\LGOR{\LOC{h}},\GOR{\LOC{h}}\}} P_{STOP}(\neg stop|h, right, adj(i, \LOC{h})) P_{ATTACH}(w|h, right) \\
& \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(\SMTR{\LOC{h}}, k, i) P_{OUTSIDE}(\SMTR{\LOC{h}}, k, j) ]
\end{align*}
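
Under the same hypothetical conventions as the sketch above, now also
assuming model functions \texttt{p\_stop(is\_stop, h, dir, adj)} and
\texttt{p\_attach(a, h, dir)} plus a memoized \texttt{outside} chart
function, the recursion looks roughly like this:

\begin{verbatim}
def outside_seal(w, i, j, s, inside, outside, p_root, p_stop, p_attach):
    """P_OUTSIDE(seal(w), i, j): the root term, plus being attached as
    a left or right argument of some head h outside the span i..j."""
    total = p_root(w) * outside('ROOT', i, j)
    # Left attachment: the head h lies to the right, spanning j..k.
    for k in range(j + 1, len(s) + 1):
        for loc_h in range(j, k):          # j <= loc_l(h) < k
            h = s[loc_h]
            adj = (j == loc_h)             # assumed adj(j, h)
            for shape in ('RGOL', 'GOL'):
                node = (shape, h, loc_h)
                total += (p_stop(False, h, 'left', adj)
                          * p_attach(w, h, 'left')
                          * outside(node, i, k) * inside(node, j, k))
    # Right attachment: the head h lies to the left, spanning k..i.
    for k in range(0, i):
        for loc_h in range(k, i):          # k <= loc_l(h) < i
            h = s[loc_h]
            adj = (i == loc_h + 1)         # assumed adj(i, h)
            for shape in ('LGOR', 'GOR'):
                node = (shape, h, loc_h)
                total += (p_stop(False, h, 'right', adj)
                          * p_attach(w, h, 'right')
                          * inside(node, k, i) * outside(node, k, j))
    return total
\end{verbatim}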
For $\RGOL{w}$ we know it is either under a left stop rule or it is
the right daughter of a left attachment rule ($k \leq loc_l(\LOC{a}) <
i \leq loc_l(\LOC{w}) < j$), and these are adjacent if the start point
($i$) equals $loc_l(\LOC{w})$:
\begin{align*}
P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
\LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\RGOL{\LOC{w}}, k, j) ]
\end{align*}
For $\GOR{w}$ we are either under a right stop or the left daughter of
a right attachment rule ($i \leq loc_l(\LOC{w}) < j \leq
loc_l(\LOC{a}) < k$), adjacent iff the end point ($j$) equals
$loc_r(\LOC{w})$:
\begin{align*}
P_{OUTSIDE}(\GOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
\LOC{w}))P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\GOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
\end{align*}
$\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
of having a stop above, where we use $\LGOR{w}$:
\begin{align*}
P_{OUTSIDE}(\GOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
\LOC{w}))P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\GOL{\LOC{w}}, k, j) ]
\end{align*}

$\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
of having a stop above, where we use $\SEAL{w}$:
\begin{align*}
P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
\LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\LGOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
\end{align*}
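
A sketch of the $\RGOL{w}$ case, with the same hypothetical helpers as
above; the $\GOR{w}$, $\GOL{w}$ and $\LGOR{w}$ cases mirror it, with
the direction and the stop parent swapped exactly as in the formulas:

\begin{verbatim}
def outside_rgol(w, loc_w, i, j, s, inside, outside, p_stop, p_attach):
    """P_OUTSIDE(rgol(w), i, j): either stopped on the left (sealed
    above), or the right daughter of a further left attachment."""
    adj = (i == loc_w)  # adjacent iff nothing is yet attached on the left
    total = p_stop(True, w, 'left', adj) * outside(('SEAL', w, loc_w), i, j)
    for k in range(0, i):
        for loc_a in range(k, i):          # k <= loc_l(a) < i
            a = s[loc_a]
            total += (p_stop(False, w, 'left', adj)
                      * p_attach(a, w, 'left')
                      * inside(('SEAL', a, loc_a), k, i)
                      * outside(('RGOL', w, loc_w), k, j))
    return total
\end{verbatim}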
\section{Reestimating the rules} % todo

\subsection{$c$ and $w$ (helper formulas used below)}
% todo: left out rule-probability! whoops (also, make P_{fooside}
% sentence-specific)

$c_s(\SMTR{\LOC{w}} : i, j)$ is ``the expected fraction of parses of
$s$ with a node labeled $\SMTR{w}$ extending from position $i$ to
position $j$'' \citep[p.~88]{klein-thesis}, here defined to equal
$v_{q}$ of \citet[p.~41]{lari-csl90}\footnote{In terms of regular EM,
this is the count of trees ($f_{T_q}(x)$ in
\citet[p.~46]{prescher-em}) in which the node extended from $i$ to
$j$.}:
\begin{align*}
c_s(\SMTR{\LOC{w}} : i, j) = P_{INSIDE_s}(\SMTR{\LOC{w}}, i, j) P_{OUTSIDE_s}(\SMTR{\LOC{w}}, i, j) / P_s
\end{align*}
$w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $\SMTR{h}$ and $dir$:
\begin{align*}
w_s(\SEAL{a} & : \SMTR{\LOC{h}}, left, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:i\leq loc_l(\LOC{a})<k}
& P_{STOP}(\neg stop|h, left, adj(k, \LOC{h})) P_{CHOOSE}(a|h, left) \\
& & P_{INSIDE_s}(\SEAL{\LOC{a}}, i, k) P_{INSIDE_s}(\SMTR{\LOC{h}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
\end{align*}
\begin{align*}
w_s(\SEAL{a} & : \SMTR{\LOC{h}}, right, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<j}
& P_{STOP}(\neg stop|h, right, adj(k, \LOC{h})) P_{CHOOSE}(a|h, right) \\
& & P_{INSIDE_s}(\SMTR{\LOC{h}}, i, k) P_{INSIDE_s}(\SEAL{\LOC{a}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
\end{align*}
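
In the same sketch style (hypothetical per-sentence chart functions
\texttt{inside\_s}/\texttt{outside\_s}, old-parameter model functions
\texttt{p\_stop}/\texttt{p\_choose}), $c_s$ and the left-direction
$w_s$ come out as:

\begin{verbatim}
def c(node, i, j, inside_s, outside_s, P_s):
    """Expected fraction of parses of s with `node` spanning i..j."""
    return inside_s(node, i, j) * outside_s(node, i, j) / P_s

def w_left(a, h_node, i, j, s, inside_s, outside_s, P_s, p_stop, p_choose):
    """w_s(seal(a) : h_node, left, i, j); the right case mirrors this."""
    shape, h, loc_h = h_node
    total = 0.0
    for k in range(i + 1, j):              # i < k < j
        adj = (k == loc_h)                 # assumed adj(k, h)
        for loc_a in range(i, k):          # i <= loc_l(a) < k
            if s[loc_a] != a:              # only tokens of type a
                continue
            total += (p_stop(False, h, 'left', adj) * p_choose(a, h, 'left')
                      * inside_s(('SEAL', a, loc_a), i, k)
                      * inside_s(h_node, k, j)
                      * outside_s(h_node, i, j))
    return total / P_s
\end{verbatim}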
Let $\hat{P}$ be the new STOP/CHOOSE-probabilities (since the old $P$
are used in $P_{INSIDE}$ and $P_{OUTSIDE}$).
\subsection{Attachment reestimation}

$\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(\LOC{h})$
since we want trees with at least one attachment:
\begin{align*}
\hat{a} (a | \SMTR{h}, left) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, left, i, j) }
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
\end{align*}

Here $j>loc_r(\LOC{h})$ since we want at least one attachment:
\begin{align*}
\hat{a} (a | \SMTR{h}, right) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, right, i, j) }
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
\end{align*}
For the first/lowest attachments, $w_s$ and $c_s$ have zero probability
where $i<loc_l(\LOC{h})$ (for $\GOR{h}$) or $j>loc_r(\LOC{h})$ (for $\GOL{h}$);
this is implicit in $P_{INSIDE}$.

\begin{align*}
\hat{P}_{CHOOSE} (a | h, left) =
\hat{a} (a | \GOL{h}, left)
+ \hat{a} (a | \RGOL{h}, left)
\end{align*}
\begin{align*}
\hat{P}_{CHOOSE} (a | h, right) =
\hat{a} (a | \GOR{h},right)
+ \hat{a} (a | \LGOR{h},right)
\end{align*}
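
Continuing the sketch, with \texttt{charts} an assumed list of
per-sentence \texttt{(inside\_s, outside\_s, P\_s)} triples computed
under the old parameters, and \texttt{c}/\texttt{w\_left} as above:

\begin{verbatim}
def reestimate_attach_left(a, h, shape, corpus, charts, p_stop, p_choose):
    """hat-a(a | shape(h), left), the ratio defined above."""
    num = den = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        for loc_h, tag in enumerate(s):
            if tag != h:
                continue
            node = (shape, h, loc_h)
            for i in range(loc_h):                      # i < loc_l(h)
                for j in range(loc_h + 1, len(s) + 1):  # j >= loc_r(h)
                    num += w_left(a, node, i, j, s, inside_s, outside_s,
                                  P_s, p_stop, p_choose)
                    den += c(node, i, j, inside_s, outside_s, P_s)
    return num / den if den else 0.0

# hat-P_CHOOSE(a | h, left) then sums the two left-taking head shapes:
#   reestimate_attach_left(a, h, 'GOL', corpus, charts, p_stop, p_choose)
# + reestimate_attach_left(a, h, 'RGOL', corpus, charts, p_stop, p_choose)
\end{verbatim}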
\subsection{Stop reestimation}
The following is based on \citet[p.~88]{klein-thesis}. For the
non-adjacent rules, $i<loc_l(\LOC{h})$ on the left and $j>loc_r(\LOC{h})$ on the
right, while for the adjacent rules these are equal (respectively).

To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
\begin{align*}
\hat{d}(\SMTR{h},\SDTR{h},\XI,\XJ) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
{ \sum_{s \in S} \sum_{\SDTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SDTR{\LOC{h}} : i, j) }
\end{align*}
Then these are our reestimated stop probabilities:
\begin{align*}
\hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
\hat{d}(\SEAL{h}, \RGOL{h},<,\geq) +
\hat{d}(\LGOR{h}, \GOL{h},<,=)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, left, adj) =
\hat{d}(\SEAL{h}, \RGOL{h},=,\geq) +
\hat{d}(\LGOR{h}, \GOL{h},=,=)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
\hat{d}(\RGOL{h}, \GOR{h},=,>) +
\hat{d}(\SEAL{h}, \LGOR{h},\leq,>)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, right, adj) =
\hat{d}(\RGOL{h}, \GOR{h},=,=) +
\hat{d}(\SEAL{h}, \LGOR{h},\leq,=)
\end{align*}
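
A sketch of $\hat{d}$, with the $\XI$/$\XJ$ comparison arguments
passed as Python \texttt{operator} functions and \texttt{c} and
\texttt{charts} as before:

\begin{verbatim}
import operator

def d_hat(num_shape, den_shape, cmp_i, cmp_j, h, corpus, charts):
    """hat-d: expected counts of num_shape(h) spans over those of
    den_shape(h) spans, restricted by cmp_i(i, loc_l) and cmp_j(j, loc_r)."""
    num = den = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        for loc_h, tag in enumerate(s):
            if tag != h:
                continue
            for i in range(len(s)):
                for j in range(i + 1, len(s) + 1):
                    if cmp_i(i, loc_h) and cmp_j(j, loc_h + 1):
                        num += c((num_shape, h, loc_h), i, j,
                                 inside_s, outside_s, P_s)
                        den += c((den_shape, h, loc_h), i, j,
                                 inside_s, outside_s, P_s)
    return num / den if den else 0.0

# e.g. hat-P_STOP(STOP | h, left, adj):
#   d_hat('SEAL', 'RGOL', operator.eq, operator.ge, h, corpus, charts)
# + d_hat('LGOR', 'GOL',  operator.eq, operator.eq, h, corpus, charts)
\end{verbatim}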
\subsection{Root reestimation}
Following \citet[p.~46]{prescher-em}, to find the reestimated
probability of a PCFG rule, we first find the new treebank frequencies
$f_{T_P}(tree)=P(tree)/P_s$, then for a rule $X' \rightarrow X$ we
divide the new frequencies of the trees which use this rule by
those of the trees containing the node $X'$. $ROOT$ appears once per
tree, meaning we divide by $1$ per sentence\footnote{Assuming each
tree has frequency $1$.}, so $\hat{P}_{ROOT}(h)=\sum_{tree:ROOT
\rightarrow \SEAL{h} \text{ used in } tree} f_{T_P}(tree)=\sum_{tree:ROOT
\rightarrow \SEAL{h} \text{ used in } tree} P(tree)/P_s$, which turns into:

\begin{align*}
\hat{P}_{ROOT} (h) = \frac
{\sum_{s\in S} 1 / P_s \cdot \sum_{\LOC{h}\in s} P_{ROOT}(\LOC{h}) P_{INSIDE_s}(\SEAL{h}, 0, len(s))}
{\sum_{s\in S} 1}
\end{align*}
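
As a sketch, with the same assumed \texttt{charts} and old
\texttt{p\_root}:

\begin{verbatim}
def reestimate_root(h, corpus, charts, p_root):
    """hat-P_ROOT(h): expected relative frequency of ROOT -> seal(h)."""
    num = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        num += sum(p_root(h) * inside_s(('SEAL', h, loc), 0, len(s))
                   for loc, tag in enumerate(s) if tag == h) / P_s
    return num / len(corpus)  # denominator: ROOT once per sentence
\end{verbatim}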
% todo: the following just didn't turn out right, but it ought to be
% possible to use this to check the CNF-like unary attachments...

% The root attachment is just a unary PCFG rule $ROOT \rightarrow
% \SEAL{h}$. Modifying binary rule reestimation from
% \citet[p.~41]{lari-csl90} to unary rules gives:

% \begin{align*}
% \hat{P}_{RULE}(ROOT \rightarrow \SEAL{h}) = \frac
% { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(ROOT \rightarrow \SEAL{h})P_{INSIDE_s}(\SEAL{h}, i, j)P_{OUTSIDE_s}(ROOT, i, j) }
% { \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} c_s(\SEAL{h}, i, j) }
% \end{align*}

% Since $P_{OUTSIDE}(ROOT, i, j)$ is $1$ if $i=0$ and $j=len(s)$, and $0$ otherwise, this reduces into:

% \begin{align*}
% \hat{P}_{RULE}(ROOT \rightarrow \SEAL{h}) = \frac
% { \sum_{s\in S} 1/P_s \cdot P_{RULE}(ROOT \rightarrow \SEAL{h})P_{INSIDE_s}(\SEAL{h}, 0, len(s)) }
% { \sum_{s\in S} 1/P_s \cdot P_{INSIDE_s}(\SEAL{h}, 0, len(s)) }
% \end{align*}
\section{Alternate CNF-like rules}
Since the IO algorithm as described in \citet{lari-csl90} is made for
rules in CNF, we have an alternate grammar (figure \ref{cnf-like}) for
running parallel tests where we don't have to sum over the different
$loc(h)$ in IO. This is not yet generalized to include left-first
attachment. It is also not quite CNF, since it includes some unary
rewrite rules.
\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\hline{}
\RN{\GOR{h}} \rightarrow& \GOR{h} &\SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
&&&\\
\RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} &\SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
&&&\\
\RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
&&&\\
\RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} &\RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
&&&\\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} &\LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
&&&\\
\SEAL{h} \rightarrow& STOP &\RGOL{h} &P_{STOP}(stop|h, left, adj) \\
&&&\\
\SEAL{h} \rightarrow& STOP &\LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
\end{tabular}
\caption{Alternate CFG rules (where a child node has an arrow below,
we use non-adjacent probabilities), defined for all words/POS-tags
$h$.}\label{cnf-like}
\end{figure}
The inside probabilities are the same as those given in
\citet{lari-csl90}, with the following exceptions:

When calculating $P_{INSIDE}(\SMTR{h}, i, j)$ and summing through possible
rules which rewrite $\SMTR{h}$, if a rule is of the form $\SMTR{h} \rightarrow
STOP ~ \SDTR{h}$ or $\SMTR{h} \rightarrow \SDTR{h} ~ STOP$, we add $P_{RULE}\cdot
P_{INSIDE}(\SDTR{h}, i, j)$ (that is, rewrite for the same sentence range);
and, as a consequence of these unary rules: for ``terminal rules''
($P_{ORDER}$) to be applicable, not only must $i = j-1$, but also the
left-hand side symbol of the rule must be of the form $\GOR{h}$.
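
A sketch of this modified inside computation; the \texttt{grammar}
object with \texttt{unary\_rules}, \texttt{binary\_rules} and
\texttt{p\_order} methods is hypothetical, and nodes here are just
\texttt{(shape, tag)} pairs since this grammar does not track $loc(h)$:

\begin{verbatim}
def inside_cnf_like(node, i, j, s, grammar, inside):
    shape, h = node
    total = 0.0
    # Terminal rules apply only to single words AND only to GOR nodes:
    if j == i + 1 and shape == 'GOR' and s[i] == h:
        total += grammar.p_order(h)
    # Unary stop rules rewrite for the same sentence range i..j:
    for prob, child in grammar.unary_rules(node):
        total += prob * inside(child, i, j)
    # Binary rules split the range as usual:
    for prob, left, right in grammar.binary_rules(node):
        for k in range(i + 1, j):
            total += prob * inside(left, i, k) * inside(right, k, j)
    return total
\end{verbatim}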
Similarly, the outside probabilities are the same as those for pure
CNF rules, with the exception that we add the unary rewrite
probabilities
\begin{align*}
\sum_{\SMTR{h}} [&P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow \SDTR{h} ~ STOP) \\
+ &P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow STOP ~ \SDTR{h})]
\end{align*}
to $P_{OUTSIDE}(\SDTR{h},i,j)$ (e.g.\ $f(s,t,i)$ in \citet{lari-csl90}).
The reestimation just needs to be expanded to allow unary stop rules;
this is similar to the formula for binary rules in
\citet[p.~41]{lari-csl90}:
\begin{align*}
\hat{P}_{RULE}(\SMTR{h} \rightarrow \SDTR{h}) = \frac
{ \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(\SMTR{h}\rightarrow \SDTR{h})P_{INSIDE}(\SDTR{h}, i, j)P_{OUTSIDE}(\SMTR{h}, i, j) }
{ \sum_{s\in S} \sum_{i<len(s)} \sum_{j>i} c_s(\SMTR{h}, i, j) }
\end{align*}
(the denominator is unchanged, but the numerator sums $w_s$ defined
for unary rules; $\SMTR{h}\rightarrow \SDTR{h}$ is meant to include both left and
right stop rules).
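
Sketched with the same conventions (nodes as \texttt{(shape, tag)}
pairs, \texttt{p\_rule} the old rule probability, \texttt{charts} as
before):

\begin{verbatim}
def reestimate_unary(parent, child, corpus, charts, p_rule):
    """hat-P_RULE(parent -> child) for the unary stop rules."""
    num = den = 0.0
    for s, (inside_s, outside_s, P_s) in zip(corpus, charts):
        for i in range(len(s)):
            for j in range(i + 1, len(s) + 1):
                num += (p_rule(parent, child) * inside_s(child, i, j)
                        * outside_s(parent, i, j)) / P_s
                den += (inside_s(parent, i, j)
                        * outside_s(parent, i, j)) / P_s  # = c_s(parent)
    return num / den if den else 0.0
\end{verbatim}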
% todo: add ROOT rule to CNF grammar?

\section{Pure CNF rules}
Alternatively, one may use the ``pure CNF'' grammar of figure
\ref{cnf-T} and figure \ref{cnf-NT}, below (not yet
implemented). Using the harmonic distribution to initialize this will
still yield a mass-deficient grammar, but at least this allows the
regular IO algorithm to be run.
\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($b[i,m]$ in \citet{lari-csl90})}\\
\hline{}
h \rightarrow& \text{``h''}& &1 \\
&&&\\
\SEAL{h} \rightarrow& \text{``h''}& &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, adj) \\
&&&\\
ROOT \rightarrow& \text{``h''}& &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, adj) \cdot P_{ROOT}(h) \\
\end{tabular}
\caption{Terminals of pure CNF rules, defined for all POS-tags
$h$.}\label{cnf-T}
\end{figure}
\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\hline{}
\SEAL{h} \rightarrow& h &\SEAL{a} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\[3.5pt]
\SEAL{h} \rightarrow& h &\GOR{.h} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \\[3.5pt]
\GOR{.h} \rightarrow& \SEAL{a} &\SEAL{b} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\
&&&\cdot P_{ATTACH}(b|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\[3.5pt]
\GOR{.h} \rightarrow& \SEAL{a} &\GOR{h} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\[3.5pt]
\GOR{h} \rightarrow& \SEAL{a} &\SEAL{b} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(b|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\[3.5pt]
\GOR{h} \rightarrow& \SEAL{a} &\GOR{h} &P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, non\text{-}adj) \\[3.5pt]
\SEAL{h} \rightarrow& \SEAL{a} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\SEAL{h} \rightarrow& \GOL{h.} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \\[3.5pt]
\GOL{h.} \rightarrow& \SEAL{b} &\SEAL{a} &P_{ATTACH}(b|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\GOL{h.} \rightarrow& \GOL{h} &\SEAL{a} &P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\GOL{h} \rightarrow& \SEAL{a} &\SEAL{b} &P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(b|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\[3.5pt]
\GOL{h} \rightarrow& \GOL{h} &\SEAL{a} &P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, non\text{-}adj) \\[3.5pt]
\SEAL{h} \rightarrow& \GOL{h.} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\[3.5pt]
\SEAL{h} \rightarrow& \SEAL{a} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \\[3.5pt]
\SEAL{\GOR{h}} \rightarrow& h &\GOR{.h} &P_{STOP}(stop|h, right, non\text{-}adj) \\[3.5pt]
\SEAL{\GOR{h}} \rightarrow& h &\SEAL{a} &P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \\[3.5pt]
ROOT \rightarrow& h &\SEAL{a} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, right)\cdot P_{STOP}(\neg stop|h, right, adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& h &\GOR{.h} &P_{STOP}(stop|h, left, adj) \cdot P_{STOP}(stop|h, right, non\text{-}adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \SEAL{a} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \GOL{h.} &h &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{STOP}(stop|h, right, adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \GOL{h.} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \cdot P_{ROOT}(h) \\[3.5pt]
ROOT \rightarrow& \SEAL{a} &\SEAL{\GOR{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
&&&\cdot P_{ATTACH}(a|h, left)\cdot P_{STOP}(\neg stop|h, left, adj) \cdot P_{ROOT}(h) \\[3.5pt]
\end{tabular}
\caption{Non-terminals of pure CNF rules, defined for all POS-tags
$h$, $a$ and $b$.}\label{cnf-NT}
\end{figure}
\section{Initialization}
\citet{klein-thesis} describes DMV initialization using a ``harmonic
distribution'' for the initial probabilities, where the probability of
one word heading another is higher if they appear closer to one
another.

There are several ways this could be implemented. We initialized
attachment probabilities with the following formula:

\begin{align*}
P_{ATTACH}(a|h,right) = \frac
{\sum_{s \in S}\sum_{\LOC{h} \in s} \sum_{\LOC{a} \in s:loc(\LOC{a})>loc(\LOC{h})} 1/(loc(\LOC{a})-loc(\LOC{h})) + C_A}
{\sum_{s \in S}\sum_{\LOC{h} \in s} \sum_{\LOC{w} \in s:loc(\LOC{w})>loc(\LOC{h})} 1/(loc(\LOC{w})-loc(\LOC{h})) + C_A}
\end{align*}
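
A sketch of this initializer, reading $C_A$ as added once to the
numerator and once to the denominator (an assumption; the formula as
written leaves its scope open):

\begin{verbatim}
from collections import defaultdict

def init_attach_right(corpus, C_A):
    """Harmonic initialization of P_ATTACH(a | h, right): pairs that
    appear closer together get more of the initial mass."""
    num = defaultdict(float)  # (h, a) -> summed inverse distances
    den = defaultdict(float)  # h -> summed inverse distances to any w
    for s in corpus:
        for loc_h, h in enumerate(s):
            for loc_a in range(loc_h + 1, len(s)):
                num[(h, s[loc_a])] += 1.0 / (loc_a - loc_h)
                den[h] += 1.0 / (loc_a - loc_h)
    return lambda a, h: (num[(h, a)] + C_A) / (den[h] + C_A)
\end{verbatim}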
The probability of stopping adjacently (left or right) was increased
whenever a word occurred at a (left or right) sentence
border\footnote{For non-adjacent stopping we checked for occurrence at
the second(-to-last) position.}:

\begin{align*}
f(stop:\LOC{h},left,adj)=\begin{cases}
C_S \text{, if } loc(\LOC{h}) = 0,\\
0 \text{, otherwise}
\end{cases}
\end{align*}

\begin{align*}
P_{STOP}(stop|h,left,adj) = \frac
{C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)}
{C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} C_S+C_N}
\end{align*}
Tweaking the initialization constants $C_A, C_M, C_S$ and $C_N$
allowed us to easily try different initial distributions.
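
Sketched for the adjacent left case; the other three cases change only
the border test and direction, per the footnote above:

\begin{verbatim}
from collections import defaultdict

def init_stop_left_adj(corpus, C_M, C_S, C_N):
    """Initial P_STOP(stop | h, left, adj), from the two formulas above."""
    f_sum = defaultdict(float)  # h -> summed f(stop: h-token, left, adj)
    n_tok = defaultdict(int)    # h -> token count of h in the corpus
    for s in corpus:
        for loc_h, h in enumerate(s):
            if loc_h == 0:      # h occurs at the left sentence border
                f_sum[h] += C_S
            n_tok[h] += 1
    return lambda h: (C_M + f_sum[h]) / (C_M + n_tok[h] * (C_S + C_N))
\end{verbatim}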
\bibliography{./statistical.bib}
\bibliographystyle{plainnat}

\end{document}