% Created 2008-06-13 Fri 17:05

\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{natbib}

\usepackage{pslatex}
\usepackage{pdfsync}
\pdfoutput=1

\usepackage{qtree}
\usepackage{amsmath}
\usepackage{amssymb}

\usepackage{avm}
\avmfont{\sc}
\avmoptions{sorted,active}
\avmvalfont{\rm}
\avmsortfont{\scriptsize\it}

\usepackage{array} % for a tabular with math in it

\title{The DMV and CCM models}
\author{Emily Morgan \& Kevin Brubeck Unhammer}
\date{15 September 2008}

\begin{document}

\maketitle

\tableofcontents

\section{Introduction}
\section{The Constituent Context Model}
\subsection{Results}
\section{A Dependency Model with Valence}
This is an attempt at fleshing out the details of the inside-outside
algorithm \citep{lari-csl90} applied to the DMV model of
\citet{klein-thesis}.

\newcommand{\LOC}[1]{\textbf{#1}}
\newcommand{\GOR}[1]{\overrightarrow{#1}}
\newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
\newcommand{\SEAL}[1]{\overline{#1}}
\newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
\newcommand{\GOL}[1]{\overleftarrow{#1}}
\newcommand{\LN}[1]{\underleftarrow{#1}}
\newcommand{\RN}[1]{\underrightarrow{#1}}
\newcommand{\XI}{\lessdot}
\newcommand{\XJ}{\gtrdot}
\newcommand{\SMTR}[1]{\dddot{#1}}
\newcommand{\SDTR}[1]{\ddot{#1}}

\subsection{Note on notation}
$i, j, k$ are sentence positions (between words), where $i$ and $j$
are always the start and end, respectively, of what we are calculating
($k$ lies between $i$ and $j$ for $P_{INSIDE}$, to their right or left
for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $\LOC{w}$
is a word token (actually a POS token) of type $w$ at a certain
sentence location. If $\LOC{w}$ is between $i$ and $i+1$, then
$loc(\LOC{w})=i$ following \citet{klein-thesis}, meaning $i$ is
adjacent to $\LOC{w}$ on the left, while $j=loc(\LOC{w})+1$ means that
$j$ is adjacent to $\LOC{w}$ on the right. To simplify,
$loc_l(\LOC{w}):=loc(\LOC{w})$ and $loc_r(\LOC{w}):=loc(\LOC{w})+1$.
We write $\LOC{h}$ if this is a head in the rule being used, and
$\LOC{a}$ if it is an attached argument.

There are some notational differences between \citet{klein-thesis} and
\citet{km-dmv}:

\begin{tabular}{cc}
Paper: & Thesis: \\
$w$ & $\GOR{w}$ \\
$w\urcorner$ & $\RGOL{w}$ \\
$\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
\end{tabular}

We use $\SMTR{w}$ (or $\SDTR{w}$) to signify one of either $w, \GOR{w},
\RGOL{w}, \LGOR{w}, \GOL{w}$ or $\SEAL{w}$\footnote{This means that
$\SMTR{\LOC{w}}$ is the triplet of the actual POS-tag, its sentence
location as a token, and the ``level of seals''.}.

\subsection{Inside probabilities}
$P_{INSIDE}$ is defined in \citet[pp.~106-108]{klein-thesis}; the only
thing we need to add is that for right attachments,
$i \leq loc_l(\LOC{w})<k \leq loc_l(\LOC{a})<j$, while for left attachments,
$i \leq loc_l(\LOC{a})<k \leq loc_l(\LOC{w})<j$.

(For now, let
\[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \]
since the DMV implementation is not yet generalized to both
directions.)

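As a minimal illustration of these index constraints, here is a Python
sketch of the right-attachment contribution to $P_{INSIDE}$. The
dict-based charts and parameter tables (\texttt{inside},
\texttt{p\_nostop} for $P_{STOP}(\neg stop|\ldots)$,
\texttt{p\_attach}) are hypothetical names for this report, not our
actual implementation; assume missing chart entries default to $0.0$
(e.g.\ \texttt{collections.defaultdict(float)}).

\begin{verbatim}
def inside_right_attach(inside, p_nostop, p_attach, sent, h, loc_h, i, j):
    """Right-attachment part of P_INSIDE for head h at position loc_h:
    sum over splits k and arguments a with i <= loc_h < k <= loc_a < j."""
    total = 0.0
    for k in range(loc_h + 1, j):          # split point, right of the head
        adj = (k == loc_h + 1)             # adjacent iff k == loc_r(h)
        for loc_a in range(k, j):          # argument token within (k, j)
            a = sent[loc_a]
            total += (p_nostop[h, 'right', adj]            # P_STOP(~stop|h,right,adj)
                      * p_attach[a, h, 'right']            # P_ATTACH(a|h,right)
                      * inside[('gor', h, loc_h), i, k]    # right-collecting head
                      * inside[('seal', a, loc_a), k, j])  # sealed argument
    return total
\end{verbatim}
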
\subsubsection{Sentence probability}

$P_s$ is the sentence probability, based on
\citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
rest, we sum them explicitly in this definition:
\begin{align*}
P_s = \sum_{\LOC{w} \in s} P_{ROOT}(\LOC{w}) P_{INSIDE}(\SEAL{\LOC{w}}, 0, len(s))
\end{align*}

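A sketch of this sum, with the same hypothetical chart layout as
above:

\begin{verbatim}
def sentence_prob(inside, p_root, sent):
    """P_s: sum over tokens w of P_ROOT(w) * P_INSIDE(sealed w, 0, len(s))."""
    n = len(sent)
    return sum(p_root[sent[loc]] * inside[('seal', sent[loc], loc), 0, n]
               for loc in range(n))
\end{verbatim}
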
\subsection{Outside probabilities}

\begin{align*}
P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
0.0 & \text{ otherwise}
\end{cases}
\end{align*}

For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ has been attached under
something else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$).
Adjacency is thus calculated on the basis of $h$, the head of the
rule. If $w$ is attached as a left argument (the head is to its
right), we have $i \leq loc_l(\LOC{w}) < j \leq loc_l(\LOC{h}) < k$, while as a right
argument we have $k \leq loc_l(\LOC{h}) < i \leq loc_l(\LOC{w}) < j$:
\begin{align*}
P_{OUTSIDE}&(\SEAL{\LOC{w}}, i, j) = \\
& P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
& [ \sum_{k > j} ~ \sum_{\LOC{h}:j\leq loc_l(\LOC{h})<k} \sum_{\SMTR{\LOC{h}} \in \{\RGOL{\LOC{h}},\GOL{\LOC{h}}\}} P_{STOP}(\neg stop|h, left, adj(j, \LOC{h})) P_{ATTACH}(w|h, left) \\
& \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(\SMTR{\LOC{h}}, i, k) P_{INSIDE}(\SMTR{\LOC{h}}, j, k) ] ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{h}:k\leq loc_l(\LOC{h})<i} \sum_{\SMTR{\LOC{h}} \in \{\LGOR{\LOC{h}},\GOR{\LOC{h}}\}} P_{STOP}(\neg stop|h, right, adj(i, \LOC{h})) P_{ATTACH}(w|h, right) \\
& \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(\SMTR{\LOC{h}}, k, i) P_{OUTSIDE}(\SMTR{\LOC{h}}, k, j) ]
\end{align*}

For $\RGOL{w}$ we know it is either under a left stop rule or it is
the right daughter of a left attachment rule ($k \leq loc_l(\LOC{a}) <
i \leq loc_l(\LOC{w}) < j$), and these are adjacent if the start point
($i$) equals $loc_l(\LOC{w})$:
\begin{align*}
P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
\LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
& \qquad\qquad\qquad P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\RGOL{\LOC{w}}, k, j) ]
\end{align*}

For $\GOR{w}$ we are either under a right stop or the left daughter of
a right attachment rule ($i \leq loc_l(\LOC{w}) < j \leq
loc_l(\LOC{a}) < k$), adjacent iff the end point ($j$) equals
$loc_r(\LOC{w})$:
\begin{align*}
P_{OUTSIDE}(\GOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
\LOC{w}))P_{OUTSIDE}(\RGOL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
& \qquad\qquad\qquad P_{OUTSIDE}(\GOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
\end{align*}

$\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
of having a stop above, where we use $\LGOR{w}$:
\begin{align*}
P_{OUTSIDE}(\GOL{\LOC{w}}, i, j) = & P_{STOP}(stop|w, left, adj(i,
\LOC{w}))P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<i} P_{STOP}(\neg stop|w, left, adj(i, \LOC{w})) P_{ATTACH}(a|w, left) \\
& \qquad\qquad\qquad P_{INSIDE}(\SEAL{\LOC{a}}, k, i) P_{OUTSIDE}(\GOL{\LOC{w}}, k, j) ]
\end{align*}

$\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
of having a stop above, where we use $\SEAL{w}$:
\begin{align*}
P_{OUTSIDE}(\LGOR{\LOC{w}}, i, j) = & P_{STOP}(stop|w, right, adj(j,
\LOC{w}))P_{OUTSIDE}(\SEAL{\LOC{w}}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{\LOC{a}:j\leq loc_l(\LOC{a})<k} P_{STOP}(\neg stop|w, right, adj(j, \LOC{w})) P_{ATTACH}(a|w, right) \\
& \qquad\qquad\qquad P_{OUTSIDE}(\LGOR{\LOC{w}}, i, k) P_{INSIDE}(\SEAL{\LOC{a}}, j, k) ]
\end{align*}

\subsection{Reestimating the rules}
% TODO: fix stop and attachment formulas so they divide before summing

\subsubsection{$c$ and $w$ (helper formulas used below)}
$c_s(\SMTR{\LOC{w}} : i, j)$ is ``the expected fraction of parses of
$s$ with a node labeled $\SMTR{w}$ extending from position $i$ to
position $j$'' \citep[p.~88]{klein-thesis}, here defined to equal
$v_{q}$ of \citet[p.~41]{lari-csl90}\footnote{In terms of regular EM,
this is the count of trees ($f_{T_q}(x)$ in
\citet[p.~46]{prescher-em}) in which the node extended from $i$ to
$j$.}:
\begin{align*}
c_s(\SMTR{\LOC{w}} : i, j) = P_{INSIDE_s}(\SMTR{\LOC{w}}, i, j) P_{OUTSIDE_s}(\SMTR{\LOC{w}}, i, j) / P_s
\end{align*}

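In code this is a one-liner; a sketch with the hypothetical
per-sentence chart dicts used above (\texttt{.get} so that missing
spans count as zero):

\begin{verbatim}
def c(inside, outside, p_s, node, i, j):
    """Expected fraction of parses of s with `node` spanning (i, j);
    node is a (seal-level, word, location) triple."""
    return (inside.get((node, i, j), 0.0)
            * outside.get((node, i, j), 0.0) / p_s)
\end{verbatim}
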
$w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $\SMTR{h}$ and $dir$:
\begin{align*}
w_s(\SEAL{a} & : \SMTR{\LOC{h}}, left, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:i\leq loc_l(\LOC{a})<k}
& P_{STOP}(\neg stop|h, left, adj(k, \LOC{h})) P_{CHOOSE}(a|h, left) \\
& & P_{INSIDE_s}(\SEAL{\LOC{a}}, i, k) P_{INSIDE_s}(\SMTR{\LOC{h}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
\end{align*}
\begin{align*}
w_s(\SEAL{a} & : \SMTR{\LOC{h}}, right, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{\LOC{a}:k\leq loc_l(\LOC{a})<j}
& P_{STOP}(\neg stop|h, right, adj(k, \LOC{h})) P_{CHOOSE}(a|h, right) \\
& & P_{INSIDE_s}(\SMTR{\LOC{h}}, i, k) P_{INSIDE_s}(\SEAL{\LOC{a}}, k, j) P_{OUTSIDE_s}(\SMTR{\LOC{h}}, i, j)
\end{align*}

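A sketch of the right-direction case, reusing the hypothetical tables
from the sketches above (\texttt{p\_choose} here stands for
$P_{CHOOSE}$):

\begin{verbatim}
def w_right(inside, outside, p_nostop, p_choose, p_s, sent, a, h_node, i, j):
    """w_s(sealed a : h_node, right, i, j); h_node = (seal-level, h, loc_h)."""
    _, h, loc_h = h_node
    total = 0.0
    for k in range(i + 1, j):                    # i < k < j
        adj = (k == loc_h + 1)                   # adj(k, h)
        for loc_a in range(k, j):                # k <= loc_l(a) < j
            if sent[loc_a] != a:                 # only tokens of type a
                continue
            total += (p_nostop[h, 'right', adj]
                      * p_choose[a, h, 'right']
                      * inside.get((h_node, i, k), 0.0)
                      * inside.get((('seal', a, loc_a), k, j), 0.0)
                      * outside.get((h_node, i, j), 0.0))
    return total / p_s
\end{verbatim}
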
Let $\hat{P}$ denote the reestimated STOP/CHOOSE probabilities (the
old $P$ are still used in $P_{INSIDE}$ and $P_{OUTSIDE}$).

\subsubsection{Attachment reestimation}

$\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(\LOC{h})$
since we want trees with at least one attachment:
\begin{align*}
\hat{a} (a | \SMTR{h}, left) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, left, i, j) }
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i<loc_l(\LOC{h})} \sum_{j\geq loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
\end{align*}

Here $j>loc_r(\LOC{h})$ since we want at least one attachment:
\begin{align*}
\hat{a} (a | \SMTR{h}, right) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} w_s(\SEAL{a} : \SMTR{\LOC{h}}, right, i, j) }
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i\leq loc_l(\LOC{h})} \sum_{j>loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
\end{align*}

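Under the same assumptions, the right-direction reestimation can be
sketched as follows, where \texttt{charts} is a hypothetical iterable
of per-sentence $(s, P_{INSIDE_s}, P_{OUTSIDE_s}, P_s)$ tuples and
\texttt{w\_right}/\texttt{c} are the sketches above:

\begin{verbatim}
def a_hat_right(charts, p_nostop, p_choose, a, seal):
    """hat-a(a | h at the given seal level, right); j > loc_r(h)
    leaves room for at least one right attachment."""
    num = den = 0.0
    for sent, inside, outside, p_s in charts:
        for loc_h, h in enumerate(sent):
            node = (seal, h, loc_h)
            for i in range(loc_h + 1):                     # i <= loc_l(h)
                for j in range(loc_h + 2, len(sent) + 1):  # j > loc_r(h)
                    num += w_right(inside, outside, p_nostop, p_choose,
                                   p_s, sent, a, node, i, j)
                    den += c(inside, outside, p_s, node, i, j)
    return num / den
\end{verbatim}
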
For the first/lowest attachments, $w_s$ and $c_s$ have zero probability
where $i<loc_l(\LOC{h})$ (for $\GOR{h}$) or $j>loc_r(\LOC{h})$ (for
$\GOL{h}$); this is implicit in $P_{INSIDE}$.

\begin{align*}
\hat{P}_{CHOOSE} (a | h, left) =
\hat{a} (a | \GOL{h}, left)
+ \hat{a} (a | \RGOL{h}, left)
\end{align*}
\begin{align*}
\hat{P}_{CHOOSE} (a | h, right) =
\hat{a} (a | \GOR{h}, right)
+ \hat{a} (a | \LGOR{h}, right)
\end{align*}

\subsubsection{Stop reestimation}
The following is based on \citet[p.~88]{klein-thesis}. For the
non-adjacent rules, $i<loc_l(\LOC{h})$ on the left and $j>loc_r(\LOC{h})$ on the
right, while for the adjacent rules we have $i=loc_l(\LOC{h})$ and
$j=loc_r(\LOC{h})$, respectively.

To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
\begin{align*}
\hat{d}(\SMTR{h},\SDTR{h},\XI,\XJ) = \frac
{ \sum_{s \in S} \sum_{\SMTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SMTR{\LOC{h}} : i, j) }
{ \sum_{s \in S} \sum_{\SDTR{\LOC{h}}:\LOC{h} \in s} \sum_{i:i \XI loc_l(\LOC{h})} \sum_{j:j \XJ loc_r(\LOC{h})} c_s(\SDTR{\LOC{h}} : i, j) }
\end{align*}

Then these are our reestimated stop probabilities:
\begin{align*}
\hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
\hat{d}(\SEAL{h}, \RGOL{h},<,\geq) +
\hat{d}(\LGOR{h}, \GOL{h},<,=)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, left, adj) =
\hat{d}(\SEAL{h}, \RGOL{h},=,\geq) +
\hat{d}(\LGOR{h}, \GOL{h},=,=)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
\hat{d}(\RGOL{h}, \GOR{h},=,>) +
\hat{d}(\SEAL{h}, \LGOR{h},\leq,>)
\end{align*}

\begin{align*}
\hat{P}_{STOP} (STOP|h, right, adj) =
\hat{d}(\RGOL{h}, \GOR{h},=,=) +
\hat{d}(\SEAL{h}, \LGOR{h},\leq,=)
\end{align*}

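Mapping $\XI$ and $\XJ$ to comparison operators gives a direct sketch
of $\hat{d}$ (hypothetical \texttt{charts} and \texttt{c} as above),
with the left-adjacent case shown as usage:

\begin{verbatim}
import operator

CMP = {'<': operator.lt, '=': operator.eq, '<=': operator.le,
       '>': operator.gt, '>=': operator.ge}

def d_hat(charts, num_seal, den_seal, cmp_i, cmp_j):
    """Ratio of c_s counts for two seal levels of the same head, with
    i and j restricted by comparisons against loc_l(h) and loc_r(h)."""
    num = den = 0.0
    for sent, inside, outside, p_s in charts:
        for loc_h, h in enumerate(sent):
            for i in range(len(sent)):
                for j in range(i + 1, len(sent) + 1):
                    if CMP[cmp_i](i, loc_h) and CMP[cmp_j](j, loc_h + 1):
                        num += c(inside, outside, p_s, (num_seal, h, loc_h), i, j)
                        den += c(inside, outside, p_s, (den_seal, h, loc_h), i, j)
    return num / den

# e.g. the left-adjacent case from above:
# p_stop_left_adj = (d_hat(charts, 'seal', 'rgol', '=', '>=')
#                    + d_hat(charts, 'lgor', 'gol', '=', '='))
\end{verbatim}
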
\subsubsection{Root reestimation}
Following \citet[p.~46]{prescher-em}, to find the reestimated
probability of a PCFG rule, we first find the new treebank frequencies
$f_{T_P}(tree)=P(tree)/P_s$, then for a rule $X' \rightarrow X$ we
divide the new frequencies of the trees which use this rule by
those of the trees containing the node $X'$. $ROOT$ appears once per
tree, meaning we divide by $1$ per sentence\footnote{Assuming each
tree has frequency $1$.}, so $\hat{P}_{ROOT}(h)=\sum_{tree:ROOT
\rightarrow \SEAL{h} \text{ used in } tree} f_{T_P}(tree)=\sum_{tree:ROOT
\rightarrow \SEAL{h} \text{ used in } tree} P(tree)/P_s$, which turns into:

\begin{align*}
\hat{P}_{ROOT} (h) = \frac
{\sum_{s\in S} 1 / P_s \cdot \sum_{\LOC{h}\in s} P_{ROOT}(\LOC{h}) P_{INSIDE_s}(\SEAL{h}, 0, len(s))}
{\sum_{s\in S} 1}
\end{align*}

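A sketch of this reestimation (hypothetical per-sentence charts as
above):

\begin{verbatim}
from collections import defaultdict

def p_root_hat(charts, p_root):
    """hat-P_ROOT(h); the denominator is just the number of sentences."""
    num = defaultdict(float)
    n_sents = 0
    for sent, inside, outside, p_s in charts:
        n_sents += 1
        n = len(sent)
        for loc_h, h in enumerate(sent):
            num[h] += (p_root[h]
                       * inside.get((('seal', h, loc_h), 0, n), 0.0) / p_s)
    return {h: v / n_sents for h, v in num.items()}
\end{verbatim}
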
\subsection{Alternate CNF-like rules}
Since the IO algorithm as described in \citet{lari-csl90} is made for
rules in Chomsky Normal Form (CNF), we have an alternate grammar
(figure \ref{cnf-like}) for testing purposes, where we don't
have to sum over the different $loc(h)$ in IO. This is not yet
generalized to include left-first attachment. It is also not quite
CNF, since it includes some unary rewrite rules.

\begin{figure}[htp]
\centering
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\hline{}
\RN{\GOR{h}} \rightarrow& \GOR{h} &\SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
&&&\\
\RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} &\SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
&&&\\
\RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
&&&\\
\RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
&&&\\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} &\RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
&&&\\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} &\LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
&&&\\
\SEAL{h} \rightarrow& STOP &\RGOL{h} &P_{STOP}(stop|h, left, adj) \\
&&&\\
\SEAL{h} \rightarrow& STOP &\LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
\end{tabular}
\caption{Alternate CFG rules (where a child node has an arrow below,
we use non-adjacent probabilities), defined for all words/POS-tags
$h$.}\label{cnf-like}
\end{figure}

The inside probabilities are the same as those given in
\citet{lari-csl90}, with the following exceptions:

When calculating $P_{INSIDE}(\SMTR{h}, i, j)$ and summing through
possible rules which rewrite $\SMTR{h}$, if a rule is of the form
$\SMTR{h} \rightarrow STOP ~ \SDTR{h}$ or $\SMTR{h} \rightarrow
\SDTR{h} ~ STOP$, we add $P_{RULE}\cdot P_{INSIDE}(\SDTR{h}, i, j)$
(that is, rewrite for the same sentence range); and, as a consequence
of these unary rules: for ``terminal rules'' ($P_{ORDER}$) to be
applicable, not only must $i = j-1$, but also the left-hand side
symbol of the rule must be of the form $\GOR{h}$.

Similarly, the outside probabilities are the same as those for pure
CNF rules, with the exception that we add the unary rewrite
probabilities
\begin{align*}
\sum_{\SMTR{h}} [&P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow \SDTR{h} ~ STOP) \\
+ &P_{OUTSIDE}(\SMTR{h},i,j)\cdot P_{RULE}(\SMTR{h} \rightarrow STOP ~ \SDTR{h})]
\end{align*}
to $P_{OUTSIDE}(\SDTR{h},i,j)$ (i.e.\ $f(s,t,i)$ in \citet{lari-csl90}).

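A sketch of adding that unary mass when filling a span $(i, j)$; the
\texttt{unary\_pairs} list of (parent, child) node pairs and the
\texttt{p\_rule} table are hypothetical:

\begin{verbatim}
def add_unary_outside(outside, p_rule, unary_pairs, i, j):
    """For each unary pair (parent -> child STOP or parent -> STOP child),
    the child's outside mass over (i, j) inherits from the parent's."""
    for parent, child in unary_pairs:
        outside[child, i, j] = (outside.get((child, i, j), 0.0)
                                + outside.get((parent, i, j), 0.0)
                                * p_rule[parent, child])
\end{verbatim}
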
This grammar gave the same inside and outside probabilities as the
DMV rules above when run over our corpus.

\subsection{TODO: Initialization}
\citet{klein-thesis} describes DMV initialization using a ``harmonic
distribution'' for the initial probabilities, where the probability of
one word heading another is higher if they appear closer to one
another.

There are several ways this could be implemented. We initialized
attachment probabilities with the following formula:

\begin{align*}
P_{ATTACH}(a|h,right) = \frac
{\sum_{s \in S}\sum_{\LOC{h} \in s} \sum_{\LOC{a} \in s:loc(\LOC{a})>loc(\LOC{h})} 1/(loc(\LOC{a})-loc(\LOC{h})) + C_A}
{\sum_{s \in S}\sum_{\LOC{h} \in s} \sum_{\LOC{w} \in s:loc(\LOC{w})>loc(\LOC{h})} 1/(loc(\LOC{w})-loc(\LOC{h})) + C_A}
\end{align*}

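A sketch of this initializer (the function and table names are ours,
for illustration only):

\begin{verbatim}
from collections import defaultdict

def harmonic_attach_init(sents, c_a):
    """Initial P_ATTACH(a|h, right) from inverse token distance,
    smoothed with the constant C_A as in the formula above."""
    num = defaultdict(float)   # keyed by (a, h)
    den = defaultdict(float)   # keyed by h
    for sent in sents:
        for loc_h, h in enumerate(sent):
            for loc_a in range(loc_h + 1, len(sent)):
                num[sent[loc_a], h] += 1.0 / (loc_a - loc_h)
                den[h] += 1.0 / (loc_a - loc_h)
    return {(a, h): (v + c_a) / (den[h] + c_a) for (a, h), v in num.items()}
\end{verbatim}
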
The probability of stopping adjacently (left or right) was increased
whenever a word occurred at a (left or right) sentence
border\footnote{For non-adjacent stopping we checked for occurrence at
the second(-to-last) position.}:

\begin{align*}
f(stop:\LOC{h},left,adj)=\begin{cases}
C_S & \text{if } loc(\LOC{h}) = 0,\\
0 & \text{otherwise}
\end{cases}
\end{align*}

\begin{align*}
P_{STOP}(stop|h,left,adj) = \frac
{C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)}
{C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} (C_S+C_N)}
\end{align*}

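A corresponding sketch for the adjacent left-stop initialization,
reading the denominator as contributing $C_S+C_N$ once per token of
$h$:

\begin{verbatim}
from collections import defaultdict

def stop_left_adj_init(sents, c_m, c_s, c_n):
    """Initial P_STOP(stop|h, left, adj): count C_S per sentence-initial
    token of h, normalized as in the formula above."""
    f = defaultdict(float)
    n_tokens = defaultdict(int)
    for sent in sents:
        for loc_h, h in enumerate(sent):
            n_tokens[h] += 1
            if loc_h == 0:     # h at the left sentence border
                f[h] += c_s
    return {h: (c_m + f[h]) / (c_m + n_tokens[h] * (c_s + c_n))
            for h in n_tokens}
\end{verbatim}
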
\subsection{TODO: Results}
We tried various values for the initialization constants $C_A, C_M, C_S$
and $C_N$, but it was hard to find any clear pattern for what worked
best.

% todo: check ~/dmv__zero_harmonic_c.txt and paste here
We compared with a dependency-parsed version of the WSJ-10
corpus. Since single-word sentences were not POS-tagged there, these
were skipped. Also, the dependency-parsed WSJ-10 did not have ROOT
nodes; we therefore checked precision and recall both without our ROOT
dependency and with a ROOT link added to the parses (where possible:
221 parses had several heads that were not dependents, and these
parses we skipped).

Table \ref{tab:dmv-wsj} shows the results of 40 iterations on the full
WSJ-10 corpus, compared with the dependency-parsed version.
\begin{table*}
\centering
\begin{tabular}{cccccc}
\multicolumn{3}{c}{Rooted} & \multicolumn{3}{c}{Unrooted} \\
P & R & F1 & P & R & F1 \\
\end{tabular}
\caption{DMV results on the WSJ-10}
\label{tab:dmv-wsj}
\end{table*}

\section{The combined model (?)}
\subsection{Results (?)}
\section{Conclusion}

\nocite{lari-csl90}
\nocite{klein-thesis}
\nocite{km-dmv}
\bibliography{./statistical.bib}
\bibliographystyle{plainnat}

\end{document}