% Created 2008-06-13 Fri 17:05
\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\avmoptions{sorted,active}
\avmsortfont{\scriptsize\it}
\usepackage{array} % for a tabular with math in it
\author{Kevin Brubeck Unhammer}
33 \newcommand{\GOR}[1]{\overrightarrow{#1}}
34 \newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
35 \newcommand{\SEAL}[1]{\overline{#1}}
36 \newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
37 \newcommand{\GOL}[1]{\overleftarrow{#1}}
38 \newcommand{\LN}[1]{\underleftarrow{#1}}
39 \newcommand{\RN}[1]{\underrightarrow{#1}}
\section{Note on notation}
$i, j, k$ are sentence positions (between words), where $i$ and $j$
are always the start and end, respectively, for what we're calculating
($k$ is between $i$ and $j$ for $P_{INSIDE}$, to their right or left
for $P_{OUTSIDE}$). $s \in S$ are sentences in the corpus. $w$ is a
word at a certain sentence location. If $w$ is between $i$ and $i+1$,
$loc(w)=i$ following \citet{klein-thesis}, meaning $i$ is adjacent to
$w$ on the left, while $j=loc(w)+1$ means that $j$ is adjacent to $w$
on the right. To simplify, $loc_l(w):=loc(w)$ and
$loc_r(w):=loc(w)+1$. We write $h$ if this is a
head in the rule being used, $a$ if it is an attached argument.
Notational differences between the thesis \citep{klein-thesis} and the
$w\urcorner$ & $\RGOL{w}$ \\
$\ulcorner{w}\urcorner$ & $\SEAL{w}$ \\
$x$ can mean $w, \GOR{w}, \RGOL{w}$ or $\SEAL{w}$\footnote{This means
that in the implementation, each such $x$ is represented by the
triplet of the actual POS-tag, its sentence location, and the
\section{Inside probabilities}
$P_{INSIDE}$ is defined in \citet[pp.~106--108]{klein-thesis}, the only
thing we need to add is that for right attachments,
$i \leq loc_l(w)<k \leq loc_l(a)<j$ while for left attachments,
$i \leq loc_l(a)<k \leq loc_l(w)<j$.
\[ \forall{w}[P_{ORDER}(right\text{-}first|w)=1.0] \] since the DMV implementation
is not yet generalized to both directions.)
\subsection{Sentence probability}
$P_s$ is the sentence probability, based on
\citet[p.~38]{lari-csl90}. Since the ROOT rules are different from the
rest, we sum them explicitly in this definition:
P_s = \sum_{w \in s} P_{ROOT}(w) P_{INSIDE}(\SEAL{w}, 0, len(s))
\section{Outside probabilities}
P_{OUTSIDE_s}(ROOT, i, j) = \begin{cases}
1.0 & \text{ if $i = 0$ and $j = len(s)$,}\\
0.0 & \text{ otherwise}
For $P_{OUTSIDE}(\SEAL{w}, i, j)$, $w$ is attached under something
else ($\SEAL{w}$ is what we elsewhere call $\SEAL{a}$). Adjacency is
thus calculated on the basis of $h$, the head of the rule. If we are
attached to from the left we have $i \leq loc_l(w) < j \leq loc_l(h) < k$, while
from the right we have $k \leq loc_l(h) < i \leq loc_l(w) < j$:
P_{OUTSIDE}&(\SEAL{w}, i, j) = \\
& P_{ROOT}(w) P_{OUTSIDE}(ROOT, i, j) + \\
& [ \sum_{k > j} ~ \sum_{h:j \leq loc_l(h)<k} \sum_{x \in \{\RGOL{h},\GOL{h}\}} P_{STOP}(\neg stop|h, left, adj(j, h)) P_{ATTACH}(w|h, left) \\
& \qquad \qquad \qquad \qquad \qquad P_{OUTSIDE}(x, i, k) P_{INSIDE}(x, j, k) ] ~ + \\
& [ \sum_{k < i} ~ \sum_{h:k \leq loc_l(h)<i} \sum_{x \in \{\LGOR{h},\GOR{h}\}} P_{STOP}(\neg stop|h, right, adj(i, h)) P_{ATTACH}(w|h, right) \\
& \qquad \qquad \qquad \qquad \qquad P_{INSIDE}(x, k, i) P_{OUTSIDE}(x, k, j) ]
For $\RGOL{w}$ we know it is either under a left stop rule or it is
the right daughter of a left attachment rule ($k \leq loc_l(a) < i \leq loc_l(w)
< j$), and these are adjacent if the start point ($i$) equals
P_{OUTSIDE}(\RGOL{w}, i, j) = & P_{STOP}(stop|w, left, adj(i, w))P_{OUTSIDE}(\SEAL{w}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{a:k \leq loc_l(a)<i} P_{STOP}(\neg stop|w, left, adj(i, w)) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{a}, k, i) P_{OUTSIDE}(\RGOL{w}, k, j) ]
For $\GOR{w}$ we are either under a right stop or the left daughter of
a right attachment rule ($i \leq loc_l(w) < j \leq loc_l(a) < k$), adjacent iff
the end point ($j$) equals $loc_r(w)$:
P_{OUTSIDE}(\GOR{w}, i, j) = & P_{STOP}(stop|w, right, adj(j, w))P_{OUTSIDE}(\RGOL{w}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{a:j \leq loc_l(a)<k} P_{STOP}(\neg stop|w, right, adj(j, w)) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\GOR{w}, i, k) P_{INSIDE}(\SEAL{a}, j, k) ]
$\GOL{w}$ is just like $\RGOL{w}$, except for the outside probability
of having a stop above, where we use $\LGOR{w}$:
P_{OUTSIDE}(\GOL{w}, i, j) = & P_{STOP}(stop|w, left, adj(i, w))P_{OUTSIDE}(\LGOR{w}, i, j) ~ + \\
& [ \sum_{k < i} ~ \sum_{a:k \leq loc_l(a)<i} P_{STOP}(\neg stop|w, left, adj(i, w)) P_{ATTACH}(a|w, left) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{INSIDE}(\SEAL{a}, k, i) P_{OUTSIDE}(\GOL{w}, k, j) ]
$\LGOR{w}$ is just like $\GOR{w}$, except for the outside probability
of having a stop above, where we use $\SEAL{w}$:
P_{OUTSIDE}(\LGOR{w}, i, j) = & P_{STOP}(stop|w, right, adj(j, w))P_{OUTSIDE}(\SEAL{w}, i, j) ~ + \\
& [ \sum_{k > j} ~ \sum_{a:j \leq loc_l(a)<k} P_{STOP}(\neg stop|w, right, adj(j, w)) P_{ATTACH}(a|w, right) \\
& ~~~~~~~~~~~~~~~~~~~~~~~~~ P_{OUTSIDE}(\LGOR{w}, i, k) P_{INSIDE}(\SEAL{a}, j, k) ]
\section{Reestimating the rules} % todo
\subsection{$c$ and $w$ (helper formulas used below)}
% todo: left out rule-probability! wops (also, make P_{fooside}
$c_s(x : i, j)$ is ``the expected fraction of parses of $s$ with a
node labeled $x$ extending from position $i$ to position $j$''
\citep[p.~88]{klein-thesis}, here defined to equal $v_{q}$ of
\citet[p.~41]{lari-csl90}:
c_s(x : i, j) = P_{INSIDE_s}(x, i, j) P_{OUTSIDE_s}(x, i, j) / P_s
$w_s$ is $w_{q}$ from \citet[p.~41]{lari-csl90}, generalized to $x$
\footnote{$x$ here includes a ``level of seals'', to look up the
$P_{STOP}$ and $P_{CHOOSE}$ formulas we of course only need the POS
w_s(\SEAL{a} & : x, left, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{a:i \leq loc_l(a)<k}
& P_{STOP}(\neg stop|x, left, adj(k, x)) P_{CHOOSE}(a|x, left) \\
& & P_{INSIDE_s}(\SEAL{a}, i, k) P_{INSIDE_s}(x, k, j) P_{OUTSIDE_s}(x, i, j)
w_s(\SEAL{a} & : x, right, i, j) = \\
& 1/P_s \sum_{k:i<k<j} ~ \sum_{a:k \leq loc_l(a)<j}
& P_{STOP}(\neg stop|x, right, adj(k, x)) P_{CHOOSE}(a|x, right) \\
& & P_{INSIDE_s}(x, i, k) P_{INSIDE_s}(\SEAL{a}, k, j) P_{OUTSIDE_s}(x, i, j)
Let $\hat{P}$ be the new STOP/CHOOSE-probabilities (since the old $P$
are used in $P_{INSIDE}$ and $P_{OUTSIDE}$).
\subsection{Attachment reestimation}
$\hat{a}$ is given in \citet[p.~41]{lari-csl90}. Here $i<loc_l(h)$
since we want trees with at least one attachment:
\hat{a} (a | x, left) = \frac
{ \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j \geq loc_r(h)} w_s(\SEAL{a} : x, left, i, j) }
{ \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j \geq loc_r(h)} c_s(x : i, j) }
Here $j>loc_r(h)$ since we want at least one attachment:
\hat{a} (a | x, right) = \frac
{ \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} \sum_{j>loc_r(h)} w_s(\SEAL{a} : x, right, i, j) }
{ \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} \sum_{j>loc_r(h)} c_s(x : i, j) }
For the first/lowest attachments, $w$ and $c$ have zero probability
where $i<loc_l(h)$ (for $\GOR{h}$) or $j>loc_r(h)$ (for $\GOL{h}$),
this is implicit in $P_{INSIDE}$.
\hat{P}_{CHOOSE} (a | h, left) = \frac
{ \hat{a} (a | \GOL{h}, left) }
{ \sum_{w} \hat{a} (w | \GOL{h}, left) }
{ \hat{a} (a | \RGOL{h}, left) }
{ \sum_{w} \hat{a} (w | \RGOL{h}, left) }
\hat{P}_{CHOOSE} (a | h, right) = \frac
{ \hat{a} (a | \GOR{h},right) }
{ \sum_{w} \hat{a} (w | \GOR{h},right) }
{ \hat{a} (a | \LGOR{h},right) }
{ \sum_{w} \hat{a} (w | \LGOR{h},right) }
\subsection{Stop reestimation}
The following is based on \citet[p.~88]{klein-thesis}. For the
non-adjacent rules, $i<loc_l(h)$ on the left and $j>loc_r(h)$ on the
right, while for the adjacent rules these are equal (respectively).
To avoid some redundancy below, define a helper function $\hat{d}$ as follows:
\hat{d}(x,left,non\text{-}adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i<loc_l(h)} \sum_{j \geq loc_r(h)} c_s(x : i, j) }
\hat{d}(x,left, adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{j \geq loc_r(h)} c_s(x : loc_l(h), j) }
\hat{d}(x,right,non\text{-}adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} \sum_{j> loc_r(h)} c_s(x : i, j) }
\hat{d}(x,right, adj) = { \sum_{s \in S} \sum_{h \in s} \sum_{i \leq loc_l(h)} c_s(x : i, loc_r(h)) }
\hat{P}_{STOP} (STOP|h, left, non\text{-}adj) =
{ \hat{d}(\SEAL{h},left,non\text{-}adj) }
{ \hat{d}(\RGOL{h},left,non\text{-}adj) }
{ \hat{d}(\LGOR{h},left,non\text{-}adj) }
{ \hat{d}(\GOL{h},left,non\text{-}adj) }
\hat{P}_{STOP} (STOP|h, left, adj) =
{ \hat{d}(\SEAL{h},left,adj) }
{ \hat{d}(\RGOL{h},left,adj) }
{ \hat{d}(\LGOR{h},left,adj) }
{ \hat{d}(\GOL{h},left,adj) }
\hat{P}_{STOP} (STOP|h, right, non\text{-}adj) =
{ \hat{d}(\RGOL{h},right,non\text{-}adj) }
{ \hat{d}(\GOR{h},right,non\text{-}adj) }
{ \hat{d}(\SEAL{h},right,non\text{-}adj) }
{ \hat{d}(\LGOR{h},right,non\text{-}adj) }
\hat{P}_{STOP} (STOP|h, right, adj) =
{ \hat{d}(\RGOL{h},right,adj) }
{ \hat{d}(\GOR{h},right,adj) }
{ \hat{d}(\SEAL{h},right,adj) }
{ \hat{d}(\LGOR{h},right,adj) }
\subsection{Root reestimation} % todo grok: no \sum_{i} \sum_{j} ?
\hat{P}_{ROOT} (h) = \frac
{\sum_{s \in S} P_{ROOT}(h) P_{INSIDE_s}(\SEAL{h}, 0, len(s))}
\section{Alternate CNF rules}
Since the IO algorithm as described in \citet{lari-csl90} is made for
pure CNF style rules, we have an alternate grammar (figure~\ref{cnf})
for running parallel tests where we don't have to sum over the
different $loc(h)$ in IO. This is
not yet generalized to include left-first attachment.
\begin{tabular} % four left-aligned math tabs, one vertical line
{ >{$}l<{$} >{$}l<{$} >{$}l<{$} | >{$}l<{$} }
\multicolumn{3}{c}{Rule} & \multicolumn{1}{c}{$P_{RULE}$ ($a[i,j,k]$ in \citet{lari-csl90})}\\
\RN{\GOR{h}} \rightarrow& \GOR{h} & \SEAL{a} &P_{STOP}(\neg stop|h, right, adj) \cdot P_{ATTACH}(a|h, right) \\
\RN{\GOR{h}} \rightarrow& \RN{\GOR{h}} & \SEAL{a} &P_{STOP}(\neg stop|h, right, non\text{-}adj) \cdot P_{ATTACH}(a|h, right) \\
\RGOL{h} \rightarrow& \GOR{h} &STOP &P_{STOP}(stop|h, right, adj) \\
\RGOL{h} \rightarrow& \RN{\GOR{h}} &STOP &P_{STOP}(stop|h, right, non\text{-}adj) \\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} & \RGOL{h} &P_{STOP}(\neg stop|h, left, adj) \cdot P_{ATTACH}(a|h, left) \\
\LN{\RGOL{h}} \rightarrow& \SEAL{a} & \LN{\RGOL{h}} &P_{STOP}(\neg stop|h, left, non\text{-}adj) \cdot P_{ATTACH}(a|h, left) \\
\SEAL{h} \rightarrow& STOP & \RGOL{h} &P_{STOP}(stop|h, left, adj) \\
\SEAL{h} \rightarrow& STOP & \LN{\RGOL{h}} &P_{STOP}(stop|h, left, non\text{-}adj) \\
\caption{Alternate CFG rules (where a child node has an arrow below,
we use non-adjacent probabilities), defined for all words/POS-tags
The inside probabilities are the same as those given in
\citet{lari-csl90}, with the following exceptions:

When calculating $P_{INSIDE}(x, i, j)$ and summing through possible
rules which rewrite $x$, if a rule is of the form $x \rightarrow
STOP ~ x'$ or $x \rightarrow x' ~ STOP$, we add $P_{RULE}\cdot
P_{INSIDE}(x', i, j)$ (that is, rewrite for the same sentence range);
and, as a consequence of these unary rules: for ``terminal rules''
($P_{ORDER}$) to be applicable, not only must $i = j-1$, but also the
left-hand side symbol of the rule must be of the form $\GOR{h}$.
Similarly, the outside probabilities are the same as those for pure
CNF rules, with the exception that we add the unary rewrite
\sum_{x'} [&P_{OUTSIDE}(x,i,j) \cdot P_{RULE}(x' \rightarrow x ~ STOP) \\
+ &P_{OUTSIDE}(x,i,j) \cdot P_{RULE}(x' \rightarrow STOP ~ x) ]
to $P_{OUTSIDE}(x,i,j)$ (e.g.\ $f(s,t,i)$).
The reestimation just needs to be expanded to allow unary stop rules,
this is similar to the formula for binary rules in
\citet[p.~41]{lari-csl90}:
\hat{P}_{RULE}(x' \rightarrow x) = \frac
{ \sum_{s \in S} \sum_{i<len(s)} \sum_{j>i} 1/P_s \cdot P_{RULE}(x' \rightarrow x)P_{INSIDE}(x, i, j)P_{OUTSIDE}(x', i, j) }
{ \sum_{s \in S} \sum_{i<len(s)} \sum_{j>i} c_s(x', i, j) }
(the denominator is unchanged, but the numerator sums $w_s$ defined
for unary rules; $x' \rightarrow x$ is meant to include both left and

% todo: add ROOT rule to CNF grammar?
\bibliography{./statistical.bib}
\bibliographystyle{plainnat}