% Created 2008-06-13 Fri 17:05
\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\avmoptions{sorted,active}
\avmsortfont{\scriptsize\it}
\usepackage{array} % for a tabular with math in it
\author{Kevin Brubeck Unhammer}
33 \newcommand{\GOR}[1]{\overrightarrow{#1}}
34 \newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
35 \newcommand{\SEAL}[1]{\overline{#1}}
36 \newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
37 \newcommand{\GOL}[1]{\overleftarrow{#1}}
38 \newcommand{\LN}[1]{\underleftarrow{#1}}
39 \newcommand{\RN}[1]{\underrightarrow{#1}}
\section{Note on notation}
$i, j, k$ are sentence positions (between words), where $i$ and $j$
are always the start and end, respectively, for what we're calculating
($k$ is between $i$ and $j$ for $P_{INSIDE}$, to their right or left
for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $w$ is a
word at a certain sentence location. If $w$ is between $i$ and $i+1$,
$loc(w)=i$ following \citet{klein-thesis}, meaning $i$ is adjacent to
$w$ on the left, while $j=loc(w)+1$ means that $j$ is adjacent to $w$
on the right. To simplify, $loc_l(w):=loc(w)$ and
$loc_r(w):=loc(w)+1$. We write $h$ if this is a
head in the rule being used, $a$ if it is an attached argument.
Notational differences between the thesis \citep{klein-thesis} and the
$w\urcorner$ & $\RGOL{w}$ \\
$\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
$x$ can mean $w, \GOR{w}, \RGOL{w}$ or $\SEAL{w}$\footnote{This means
that in the implementation, each such $x$ is represented by the
triplet of the actual POS-tag, its sentence location, and the
\section{Inside probabilities}
$P_{INSIDE}$ is defined in \citet[pp.~106--108]{klein-thesis}, the only
thing we need to add is that for right attachments,
$i \leq loc_l(w)<k \leq loc_l(a)<j$ while for left attachments,
$i \leq loc_l(a)<k \leq loc_l(w)<j$.
\[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \] since the DMV implementation
is not yet generalized to both directions.)
\subsection{Sentence probability}
$P_s$ is the sentence probability, based on
\citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
rest, we sum them explicitly in this definition:
P_s = \sum_{w \in s} P_{ROOT}(w) P_{INSIDE}(\SEAL{w}, 0, len(s))
\section{Outside probabilities}
P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
0.0 & \text{ otherwise}
For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ is attached under something
else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$). Adjacency is
thus calculated on the basis of $h$, the head of the rule. If we are
attached to from the left we have $i \leq loc_l(w) < j \leq loc_l(h) < k$, while
from the right we have $k \leq loc_l(h) < i \leq loc_l(w) < j$:
P_{OUTSIDE}&(\SEAL{w}, i, j) = \\
& P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
& [ \sum_{k > j} ~ \sum_{h:j \leq loc_l(h)<k} \sum_{x \in \{\RGOL{h},\GOL{h}\}} P_{STOP}(\neg stop|h, left, adj(j, h)) P_{ATTACH}(w|h, left) \\
& \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(x, i, k) P_{INSIDE}(x, j, k) ] ~ + \\
& [ \sum_{k < i} ~ \sum_{h:k \leq loc_l(h)<i} \sum_{x \in \{\LGOR{h},\GOR{h}\}} P_{STOP}(\neg stop|h, right, adj(i, h)) P_{ATTACH}(w|h, right) \\
& \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(x, k, i) P_{OUTSIDE}(x, k, j) ]
For $\RGOL{w}$ we know it is either under a left stop rule or it is
the right daughter of a left attachment rule ($k \leq loc_l(a) < i \leq loc_l(w)
< j$), and these are adjacent if the start point ($i$) equals
P_{OUTSIDE}(\RGOL{w}, i, j) = & P_{STOP}(stop|w, left, adj(i, w))P_{OUTSIDE}(\SEAL{w}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{a:k \leq loc_l(a)<i} P_{STOP}(\neg stop|w, left, adj(i, w)) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{a}, k, i) P_{OUTSIDE}(\RGOL{w}, k, j) ]
For $\GOR{w}$ we are either under a right stop or the left daughter of
a right attachment rule ($i \leq loc_l(w) < j \leq loc_l(a) < k$), adjacent iff
the end point ($j$) equals $loc_r(w)$:
P_{OUTSIDE}(\GOR{w}, i, j) = & P_{STOP}(stop|w, right, adj(j, w))P_{OUTSIDE}(\RGOL{w}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{a:j \leq loc_l(a)<k} P_{STOP}(\neg stop|w, right, adj(j, w)) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\GOR{w}, i, k) P_{INSIDE}(\SEAL{a}, j, k) ]
$\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
of having a stop above, where we use $\LGOR{w}$:
P_{OUTSIDE}(\GOL{w}, i, j) = & P_{STOP}(stop|w, left, adj(i, w))P_{OUTSIDE}(\LGOR{w}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{a:k \leq loc_l(a)<i} P_{STOP}(\neg stop|w, left, adj(i, w)) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{a}, k, i) P_{OUTSIDE}(\GOL{w}, k, j) ]
$\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
of having a stop above, where we use $\SEAL{w}$:
P_{OUTSIDE}(\LGOR{w}, i, j) = & P_{STOP}(stop|w, right, adj(j, w))P_{OUTSIDE}(\SEAL{w}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{a:j \leq loc_l(a)<k} P_{STOP}(\neg stop|w, right, adj(j, w)) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\LGOR{w}, i, k) P_{INSIDE}(\SEAL{a}, j, k) ]
\section{Reestimating the rules} % todo
\subsection{$c$ and $w$ (helper formulas used below)}
% todo: left out rule-probability! wops (also, make P_{fooside}
$c_s(x : i, j)$ is ``the expected fraction of parses of $s$ with a
node labeled $x$ extending from position $i$ to position $j$''
\citep[p.~88]{klein-thesis}, here defined to equal $v_{q}$ of
\citet[p.~41]{lari-csl90}:
c_s(x : i, j) = P_{INSIDE_s}(x, i, j) P_{OUTSIDE_s}(x, i, j) / P_s
$w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $x$
\footnote{$x$ here includes a ``level of seals'', to look up the
$P_{STOP}$ and $P_{CHOOSE}$ formulas we of course only need the POS
w_s(\SEAL{a} & : x, left, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{a:i \leq loc_l(a)<k}
& P_{STOP}(\neg stop|x, left, adj(k, x)) P_{CHOOSE}(a|x, left) \\
& & P_{INSIDE_s}(\SEAL{a}, i, k) P_{INSIDE_s}(x, k, j) P_{OUTSIDE_s}(x, i, j)
w_s(\SEAL{a} & : x, right, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{a:k \leq loc_l(a)<j}
& P_{STOP}(\neg stop|x, right, adj(k, x)) P_{CHOOSE}(a|x, right) \\
& & P_{INSIDE_s}(x, i, k) P_{INSIDE_s}(\SEAL{a}, k, j) P_{OUTSIDE_s}(x, i, j)
Let $\hat{P}$ be the new STOP/CHOOSE-probabilities (since the old $P$
are used in $P_{INSIDE}$ and $P_{OUTSIDE}$).
\subsection{Attachment reestimation}
$\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(h)$
since we want trees with at least one attachment:
\hat{a} (a | x, left) = \frac
{ \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j \geq loc_r(h)} w_s(\SEAL{a} : x, left, i, j) }
{ \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j \geq loc_r(h)} c_s(x : i, j) }
Here $j>loc_r(h)$ since we want at least one attachment:
\hat{a} (a | x, right) = \frac
{ \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} \sum_{j>loc_r(h)} w_s(\SEAL{a} : x, right, i, j) }
{ \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} \sum_{j>loc_r(h)} c_s(x : i, j) }
For the first/lowest attachments, $w$ and $c$ have zero probability
where $i<loc_l(h)$ (for $\GOR{h}$) or $j>loc_r(h)$ (for $\GOL{h}$),
this is implicit in $P_{INSIDE}$.
\hat{P}_{CHOOSE} (a | h, left) = \frac
{ \hat{a} (a | \GOL{h}, left) }
{ \sum_{w} \hat{a} (w | \GOL{h}, left) }
{ \hat{a} (a | \RGOL{h}, left) }
{ \sum_{w} \hat{a} (w | \RGOL{h}, left) }
\hat{P}_{CHOOSE} (a | h, right) = \frac
{ \hat{a} (a | \GOR{h},right) }
{ \sum_{w} \hat{a} (w | \GOR{h},right) }
{ \hat{a} (a | \LGOR{h},right) }
{ \sum_{w} \hat{a} (w | \LGOR{h},right) }
\subsection{Stop reestimation}
The following is based on \citet[p.~88]{klein-thesis}. For the
non-adjacent rules, $i<loc_l(h)$ on the left and $j>loc_r(h)$ on the
right, while for the adjacent rules these are equal (respectively).
To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
\hat{d}(x,left,non\text{-}adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j \geq loc_r(h)} c_s(x : i, j) }
\hat{d}(x,left, adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{j \geq loc_r(h)} c_s(x : loc_l(h), j) }
\hat{d}(x,right,non\text{-}adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} \sum_{j> loc_r(h)} c_s(x : i, j) }
\hat{d}(x,right, adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} c_s(x : i, loc_r(h)) }
\hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
{ \hat{d}(\SEAL{h},left,non\text{-}adj) }
{ \hat{d}(\RGOL{h},left,non\text{-}adj) }
{ \hat{d}(\LGOR{h},left,non\text{-}adj) }
{ \hat{d}(\GOL{h},left,non\text{-}adj) }
\hat{P}_{STOP} (STOP|h, left, adj) =
{ \hat{d}(\SEAL{h},left,adj) }
{ \hat{d}(\RGOL{h},left,adj) }
{ \hat{d}(\LGOR{h},left,adj) }
{ \hat{d}(\GOL{h},left,adj) }
\hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
{ \hat{d}(\RGOL{h},right,non\text{-}adj) }
{ \hat{d}(\GOR{h},right,non\text{-}adj) }
{ \hat{d}(\SEAL{h},right,non\text{-}adj) }
{ \hat{d}(\LGOR{h},right,non\text{-}adj) }
\hat{P}_{STOP} (STOP|h, right, adj) =
{ \hat{d}(\RGOL{h},right,adj) }
{ \hat{d}(\GOR{h},right,adj) }
{ \hat{d}(\SEAL{h},right,adj) }
{ \hat{d}(\LGOR{h},right,adj) }
\subsection{Root reestimation} % todo grok: no \sum_{i} \sum_{j} ?
\hat{P}_{ROOT} (h) = \frac
{\sum_{s \in S} P_{ROOT}(h) P_{INSIDE_s}(\SEAL{h}, 0, len(s))}
\section{Alternate CNF rules}
Since the IO algorithm as described in \citet{lari-csl90} is made for
pure CNF style rules, we have an alternate grammar (figure~\ref{cnf})
for running parallel tests where we don't have to sum over the
different $loc(h)$ in IO. This is
not yet generalized to include left-first attachment.
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\RN{\GOR{h}} \rightarrow& \GOR{h} & \SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
\RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} & \SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
\RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
\RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} & \RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} & \LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
\SEAL{h} \rightarrow& STOP & \RGOL{h} &P_{STOP}(stop|h, left, adj) \\
\SEAL{h} \rightarrow& STOP & \LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
\caption{Alternate CFG rules (where a child node has an arrow below,
we use non-adjacent probabilities), defined for all words/POS-tags
The inside probabilities are the same as those given in
\citet{lari-csl90}, with the following exceptions:

When calculating $P_{INSIDE}(x, i, j)$ and summing through possible
rules which rewrite $x$, if a rule is of the form $x \rightarrow
STOP ~ x'$ or $x \rightarrow x' ~ STOP$, we add $P_{RULE}\cdot
P_{INSIDE}(x', i, j)$ (that is, rewrite for the same sentence range);
and, as a consequence of these unary rules: for ``terminal rules''
($P_{ORDER}$) to be applicable, not only must $i = j-1$, but also the
left-hand side symbol of the rule must be of the form $\GOR{h}$.
Similarly, the outside probabilities are the same as those for pure
CNF rules, with the exception that we add the unary rewrite
\sum_{x'} [&P_{OUTSIDE}(x,i,j) \cdot P_{RULE}(x' \rightarrow x ~ STOP) \\
+ &P_{OUTSIDE}(x,i,j) \cdot P_{RULE}(x' \rightarrow STOP ~ x) ]
to $P_{OUTSIDE}(x,i,j)$ (e.g.\ $f(s,t,i)$).
The reestimation just needs to be expanded to allow unary stop rules,
this is similar to the formula for binary rules in
\citet[p.~41]{lari-csl90}:
\hat{P}_{RULE}(x' \rightarrow x) = \frac
{ \sum_{s \in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(x' \rightarrow x)P_{INSIDE}(x, i, j)P_{OUTSIDE}(x', i, j) }
{ \sum_{s \in S} \sum_{i<len(s)} \sum_{j>i} c_s(x', i, j) }
(the denominator is unchanged, but the numerator sums $w_s$ defined
for unary rules; $x' \rightarrow x$ is meant to include both left and

% todo: add ROOT rule to CNF grammar?
\bibliography{./statistical.bib}
\bibliographystyle{plainnat}