DMVCCM.org

   1 # -*- coding: mule-utf-8-unix -*-
   2
   3 #+STARTUP: overview
   4 #+TAGS: OPTIMIZE PRETTIER
   5 #+STARTUP: hidestars
   6 #+TITLE: DMV/CCM -- todo-list / progress
   7 #+AUTHOR: Kevin Brubeck Unhammer
   8 #+EMAIL: K.BrubeckUnhammer at student uva nl
   9 #+OPTIONS: H:4
  10 #+OPTIONS: toc:3
  11 #+OPTIONS: ^:{}
  12 #+LANGUAGE: en
  13 #+SEQ_TODO: TOGROK TODO DONE
  14
  15 * dmvccm report and project
  16   DEADLINE: <2008-06-30 Mon>
  17 - [[file:src/dmv.py][dmv.py]]
  18 - [[file:src/io.py][io.py]]
  19 - [[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]]
  20
  21 (But absolute, extended, really-quite-dead-now deadline: August 31...)
  22 * DONE outer probabilities
  23   CLOSED: [2008-06-12 Thu 11:11]
  24 # <<outer>>
  25 There are 6 different configurations here, based on what the above
  26 (mother) node is, and what the Node for which we're computing is.
  27 Here *r* is not between *s* and *t* as in inner(), but an /outer/ index. *loc_N*
  28 is the location of the Node head in the sentence, *loc_m* for the head
  29 of the mother of Node.
  30
  31 + mother is a RIGHT-stop:
  32   - outer(*s, t*, mom.LHS, *loc_N*), no inner-call
  33   - adjacent iff *t* == *loc_m*
  34 + mother is a  LEFT-stop:
  35   - outer(*s, t*, mom.LHS, *loc_N*), no inner-call
  36   - adjacent iff *s* == *loc_m*
  37
  38 + Node is on the LEFT branch (mom.L == Node)
  39   * and mother is a LEFT attachment:
  40     - *loc_N* will be in the LEFT branch, can be anything here.
  41     - In the RIGHT, non-attached, branch we find inner(*t+1, r*, mom.R,
  42       *loc_m*) for all possible *loc_m* in the right part of the sentence.
  43     - outer(*s, r*, mom.LHS, *loc_m*).
  44     - adjacent iff *t+1* == *loc_m*
  45   * and mother is a RIGHT attachment:
  46     - *loc_m* = *loc_N*.
  47     - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_R*) for
  48       all possible *loc_R* in the right part of the sentence.
  49     - outer(*s, r*, mom.LHS, *loc_N*).
  50     - adjacent iff *t* == *loc_m*
  51
  52 + Node is on the RIGHT branch (mom.R == Node)
  53   * and mother is a LEFT attachment:
  54     - *loc_m* = *loc_N*.
  55     - In the LEFT, attached, branch we find inner(*r, s-1*, mom.L, *loc_L*) for
  56       all possible *loc_L* in the left part of the sentence.
  57     - outer(*r, t*, mom.LHS, *loc_m*).
  58     - adjacent iff *s* == *loc_m*
  59   * and mother is a RIGHT attachment:
  60     - *loc_N* will be in the RIGHT branch, can be anything here.
  61     - In the LEFT, non-attached, branch we find inner(*r, s-1*, mom.L, *loc_m*) for
  62       all possible *loc_m* in the left part of the sentence.
  63     - outer(*r, t*, mom.LHS, *loc_N*).
  64     - adjacent iff *s-1* == *loc_m*
  65
  66 [[file:outer_attachments.jpg]]
  67
  68 : in notes:   in code (constants):    in Klein thesis:
  69 :-------------------------------------------------------------------
  70 : _h_         SEAL                    bar over h
  71 :  h_         RGO_L                   right-under-left-arrow over h
  72 :  h          GO_R                    right-arrow over h
  73 :
  74 :             LGO_R                   left-under-right-arrow over h
  75 :             GO_L                    left-arrow over h
  76
  77 Also, unlike in [[http://bibsonomy.org/bibtex/2b9f6798bb092697da7042ca3f5dee795][Lari & Young]], non-ROOT ('S') symbols may cover the
  78 whole sentence, but ROOT may /only/ appear if it covers the whole
  79 sentence.
  80 ** COMMENT qtrees, tex
  81 \usepackage{qtree}
  82 \usepackage{amssymb}
  83
  84 \newcommand{\GOR}[1]{\overrightarrow{#1}}
  85 \newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
  86
  87 \newcommand{\SEAL}[1]{\overline{#1}}
  88
  89 \newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
  90 \newcommand{\GOL}[1]{\overleftarrow{#1}}
  91
  92 \Tree [.{$\RGOL{h}$} [.{$_s \SEAL{a} _t$\\
  93   Node} ] {$_{t+1} \RGOL{h} _r$\\
  94   R} ]
  95 \Tree [.{$\GOR{h}$} {$_{s} \GOR{h} _{t}$\\
  96   Node} [.{$_{t+1} \SEAL{a} _r$\\
  97   R} ] ]
  98 \Tree [.{$\RGOL{h}$} [.{$_r \SEAL{a} _{s-1}$\\
  99   L} ] {$_{s} \RGOL{h} _t$\\
 100   Node} ]
 101 \Tree [.{$\GOR{h}$} {$_{r} \GOR{h} _{s-1}$\\
 102   L} [.{$_{s} \SEAL{a} _t$\\
 103   Node} ] ]
 104
 105
 106 \Tree [.{$h\urcorner$} [.{$_s \ulcorner a\urcorner _t$\\
 107   Node} ] {$_{t+1} h\urcorner _r$\\
 108   R} ]
 109 \Tree [.{$h$} {$_{s} h _{t}$\\
 110   Node} [.{$_{t+1} \ulcorner a\urcorner _r$\\
 111   R} ] ]
 112 \Tree [.{$h\urcorner$} [.{$_r \ulcorner a\urcorner _{s-1}$\\
 113   L} ] {$_{s} h\urcorner _t$\\
 114   Node} ]
 115 \Tree [.{$h$} {$_{r} h _{s-1}$\\
 116   L} [.{$_{s} \ulcorner a\urcorner _t$\\
 117   Node} ] ]
 118 * TODO [#B] P_STOP and P_CHOOSE for IO/EM (reestimation)
 119 [[file:src/dmv.py::DMV%20probabilities][dmv-P_STOP]]
 120 Remember: The P_{STOP} formula is upside-down (left-to-right also).
 121 (In the article..not the [[http://www.eecs.berkeley.edu/~klein/papers/klein_thesis.pdf][thesis]])
 122
 123 ** P_STOP formulas for various dir and adj:
 124 Assuming this:
 125
 126 | P_{STOP}(STOP : h,L,non_adj) = | \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)} c(s,t,_h_, ...) |
 127 |                                | \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)} c(s,t,h_, ...)  |
 128 |                                |                                                                |
 129 | P_{STOP}(STOP : h,L,adj) =     | \sum_{corpus} \sum_{s=loc(h)} \sum_{t>=loc(h)} c(s,t,_h_, ...) |
 130 |                                | \sum_{corpus} \sum_{s=loc(h)} \sum_{t>=loc(h)} c(s,t,h_, ...)  |
 131 |                                |                                                                |
 132 | P_{STOP}(STOP : h,R,non_adj) = | \sum_{corpus} \sum_{s=loc(h)} \sum_{t>loc(h)} c(s,t,h_, ...)   |
 133 |                                | \sum_{corpus} \sum_{s=loc(h)} \sum_{t>loc(h)} c(s,t,h, ...)    |
 134 |                                |                                                                |
 135 | P_{STOP}(STOP : h,R,adj) =     | \sum_{corpus} \sum_{s=loc(h)} \sum_{t=loc(h)} c(s,t,h_, ...)   |
 136 |                                | \sum_{corpus} \sum_{s=loc(h)} \sum_{t=loc(h)} c(s,t,h, ...)    |
 137
 138 (And P_{STOP}(-STOP|...) = 1 - P_{STOP}(STOP|...) )
 139 ** P_CHOOSE formula:
 140 Assuming this:
 141
 142 | P_{CHOOSE}(a : h,R) = | \sum_{corpus} \sum_{s=loc(h)} \sum_{t > loc(h)} \sum_{loc(h) < r <= t} c(s, r-1, h_, ...) * c(r,t,_a_,...) |
 143 |                       | \sum_{corpus} \sum_{s=loc(h)} \sum_{t > loc(h)} c(s,t,h, ...)                                              |
 144 |                       |                                                                                                            |
 145 | P_{CHOOSE}(a : h,L) = | \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)} \sum_{r<loc(h)} c(s,r,_a_, ...) * c(r+1, t, h_, ...)        |
 146 |                       | \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)} c(s,t,h_,...)                                               |
 147 t >= loc(h) since there are many possibilites for right-attachments
 148 below, and each of them alone gives a lower probability (through
 149 multiplication) to the upper tree (so add them all)
 150
 151 The reason we have to check /both/ children of the attachments is that we
 152 have to make sure they are contiguous (otherwise we would have no way
 153 of ruling out eg. h_->_b_,_b_->b_->_a_, where h_ covers *s* and *t*,_b_ is
 154 from *s* to *x<r* and _ a_ is from *s* to *r*).
 155 ** DONE [#A] How do we only count from completed trees?
 156    CLOSED: [2008-06-13 Fri 11:40]
 157 Use c(s,t,Node); inner * outer / P_sent
 158
 159 ** DONE [#A] c(s,t,Node)
 160   CLOSED: [2008-06-13 Fri 11:38]
 161 = inner * outer / P_sent
 162
 163 implemented as inner * outer / inner_sent
 164
 165 * TOGROK Find Most Probable Parse of given test sentence
 166 We could probably do it pretty easily from the chart:
 167 - for all loc_h, select the (0,len(sent),ROOT,loc_h) that has the highest p
 168 - for stops, just keep going
 169 - for rewrites, for loc_dep select the one that has the highest p
 170 * TOGROK Combine CCM with DMV
 171 Page 108 (pdf: 124) of [[http://www.eecs.berkeley.edu/~klein/papers/klein_thesis.pdf][Klein's thesis]] gives info about this.
 172 ** TODO Make inner() and outer() also allow left-first attachment
 173 Using P_{ORDER}(/left-first/ | w) etc.
 174 * TODO Get a dependency parsed version of WSJ to test with
 175 * Initialization
 176 [[file:~/Documents/Skole/V08/Probability/dmvccm/src/dmv.py::Initialization%20todo][dmv-inits]]
 177
 178 We go through the corpus, since the probabilities are based on how far
 179 away in the sentence arguments are from their heads.
 180 ** TOGROK CCM Initialization
 181 P_{SPLIT} used here... how, again?
 182 ** DONE Separate initialization to another file?                      :PRETTIER:
 183    CLOSED: [2008-06-08 Sun 12:51]
 184 [[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]]
 185 ** DONE DMV Initialization probabilities
 186 (from initialization frequency)
 187 ** DONE DMV Initialization frequencies
 188    CLOSED: [2008-05-27 Tue 20:04]
 189 *** P_STOP
 190 P_{STOP} is not well defined by K&M. One possible interpretation given
 191 the sentence [det nn vb nn] is
 192 : f_{STOP}( STOP|det, L, adj) +1
 193 : f_{STOP}(-STOP|det, L, adj) +0
 194 : f_{STOP}( STOP|det, L, non_adj) +1
 195 : f_{STOP}(-STOP|det, L, non_adj) +0
 196 : f_{STOP}( STOP|det, R, adj) +0
 197 : f_{STOP}(-STOP|det, R, adj) +1
 198 :
 199 : f_{STOP}( STOP|nn, L, adj) +0
 200 : f_{STOP}(-STOP|nn, L, adj) +1
 201 : f_{STOP}( STOP|nn, L, non_adj) +1  # since there's at least one to the left
 202 : f_{STOP}(-STOP|nn, L, non_adj) +0
 203 **** TODO tweak
 204 # <<pstoptweak>>
 205 :            f[head,  'STOP', 'LN'] += (i_h <= 1)     # first two words
 206 :            f[head, '-STOP', 'LN'] += (not i_h <= 1)
 207 :            f[head,  'STOP', 'LA'] += (i_h == 0)     # very first word
 208 :            f[head, '-STOP', 'LA'] += (not i_h == 0)
 209 :            f[head,  'STOP', 'RN'] += (i_h >= n - 2) # last two words
 210 :            f[head, '-STOP', 'RN'] += (not i_h >= n - 2)
 211 :            f[head,  'STOP', 'RA'] += (i_h == n - 1) # very last word
 212 :            f[head, '-STOP', 'RA'] += (not i_h == n - 1)
 213 vs
 214 :            # this one requires some additional rewriting since it
 215 :            # introduces divisions by zero
 216 :            f[head,  'STOP', 'LN'] += (i_h == 1)     # second word
 217 :            f[head, '-STOP', 'LN'] += (not i_h <= 1) # not first two
 218 :            f[head,  'STOP', 'LA'] += (i_h == 0)     # first word
 219 :            f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
 220 :            f[head,  'STOP', 'RN'] += (i_h == n - 2)     # second-to-last
 221 :            f[head, '-STOP', 'RN'] += (not i_h >= n - 2) # not last two
 222 :            f[head,  'STOP', 'RA'] += (i_h == n - 1)     # last word
 223 :            f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
 224 vs
 225 :            f[head,  'STOP', 'LN'] += (i_h == 1)     # second word
 226 :            f[head, '-STOP', 'LN'] += (not i_h == 1) # not second
 227 :            f[head,  'STOP', 'LA'] += (i_h == 0)     # first word
 228 :            f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
 229 :            f[head,  'STOP', 'RN'] += (i_h == n - 2)     # second-to-last
 230 :            f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last
 231 :            f[head,  'STOP', 'RA'] += (i_h == n - 1)     # last word
 232 :            f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
 233 vs
 234 "all words take the same number of arguments" interpreted as
 235 :for all heads:
 236 :    p_STOP(head, 'STOP', 'LN') = 0.3
 237 :    p_STOP(head, 'STOP', 'LA') = 0.5
 238 :    p_STOP(head, 'STOP', 'RN') = 0.4
 239 :    p_STOP(head, 'STOP', 'RA') = 0.7
 240 (which we easily may tweak in init_zeros())
 241 *** P_CHOOSE
 242 Go through the corpus, counting distances between heads and
 243 arguments. In [det nn vb nn], we give
 244 - f_{CHOOSE}(nn|det, R) +1/1 + C
 245 - f_{CHOOSE}(vb|det, R) +1/2 + C
 246 - f_{CHOOSE}(nn|det, R) +1/3 + C
 247   - If this were the full corpus, P_{CHOOSE}(nn|det, R) would have
 248     (1+1/3+2C) / sum_a f_{CHOOSE}(a|det, R)
 249
 250 The ROOT gets "each argument with equal probability", so in a sentence
 251 of three words, 1/3 for each (in [nn vb nn], 'nn' gets 2/3). Basically
 252 just a frequency count of the corpus...
 253
 254 In a sense there are no terminal probabilities, since an /h/ can only
 255 rewrite to an 'h' anyway (it's just a check for whether, at this
 256 location in the sentence, we have the right POS-tag).
 257 * [#C] Deferred
 258 http://wiki.python.org/moin/PythonSpeed/PerformanceTips Eg., use
 259 map/reduce/filter/[i for i in [i's]]/(i for i in [i's]) instead of
 260 for-loops; use local variables for globals (global variables or or
 261 functions), etc.
 262
 263 ** TODO when reestimating P_STOP etc, remove rules with p < epsilon   :OPTIMIZE:
 264 ** TODO inner_dmv, short ranges and impossible attachment             :OPTIMIZE:
 265 If s-t <= 2, there can be only one attachment below, so don't recurse
 266 with both Lattach=True and Rattach=True.
 267
 268 If s-t <= 1, there can be no attachment below, so only recurse with
 269 Lattach=False, Rattach=False.
 270
 271 Put this in the loop under rewrite rules (could also do it in the STOP
 272 section, but that would only have an effect on very short sentences).
 273 ** TODO clean up the module files                                     :PRETTIER:
 274 Is there better way to divide dmv and harmonic? There's a two-way
 275 dependency between the modules. Guess there could be a third file that
 276 imports both the initialization and the actual EM stuff, while a file
 277 containing constants and classes could be imported by all others:
 278 : dmv.py imports dmv_EM.py imports dmv_classes.py
 279 : dmv.py imports dmv_inits.py imports dmv_classes.py
 280
 281 ** TOGROK Some (tagged) sentences are bound to come twice             :OPTIMIZE:
 282 Eg, first sort and count, so that the corpus
 283 [['nn','vbd','det','nn'],
 284  ['vbd','nn','det','nn'],
 285  ['nn','vbd','det','nn']]
 286 becomes
 287 [(['nn','vbd','det','nn'],2),
 288  (['vbd','nn','det','nn'],1)]
 289 and then in each loop through sentences, make sure we handle the
 290 frequency correctly.
 291
 292 Is there much to gain here?
 293
 294 ** TOGROK tags as numbers or tags as strings?                         :OPTIMIZE:
 295 Need to clean up the representation.
 296
 297 Stick with tag-strings in initialization then switch to numbers for
 298 IO-algorithm perhaps? Can probably afford more string-matching in
 299 initialization..
 300 ** DONE if loc_h == t, no need to try right-attachment rules &v.v.    :OPTIMIZE:
 301    CLOSED: [2008-06-10 Tue 14:34]
 302 (and if loc_h == s, no need to try left-attachment rules.)
 303
 304 Modest speed increase (5%).
 305 ** DONE io.debug parameters should not call functions                 :OPTIMIZE:
 306    CLOSED: [2008-06-10 Tue 12:26]
 307 Exchanged all io.debug(str,'level') calls with statements of the form:
 308 :if 'level' in io.DEBUG:
 309 :    print str
 310
 311 and got an almost threefold speed increase on inner().
 312 ** DONE inner_dmv() should disregard rules with heads not in sent     :OPTIMIZE:
 313    CLOSED: [2008-06-08 Sun 10:18]
 314 If the sentence is "nn vbd det nn", we should not even look at rules
 315 where
 316 : rule.head() not in "nn vbd det nn".split()
 317 This is ruled out by getting rules from g.rules(LHS, sent).
 318
 319 Also, we optimize this further by saying we don't even recurse into
 320 attachment rules where
 321 : rule.head() not in sent[ s :r+1]
 322 : rule.head() not in sent[r+1:t+1]
 323 meaning, if we're looking at the span "vbd det", we only use
 324 attachment rules where both daughters are members of ['vbd','det']
 325 (although we don't (yet) care about removing rules that rewrite to the
 326 same tag if there are no duplicate tags in the span, etc., that would
 327 be a lot of trouble for little potential gain).
 328 * Adjacency and combining it with the inside-outside algorithm
 329 Each DMV_Rule has both a probN and a probA, for adjacencies. inner()
 330 and outer() needs the correct one in each case.
 331
 332 In each inner() call, loc_h is the location of the head of this
 333 dependency structure. In each outer() call, it's the head of the /Node/,
 334 the structure we're looking outside of.
 335
 336 We call inner() for each location of a head, and on each terminal,
 337 loc_h must equal s (and t). In the recursive attachment calls, we use
 338 the locations (sentence indices) of words to the left or right of the
 339 head in calls to inner(). /loc_h lets us check whether we need probN or
 340 probA/.
 341 ** Possible alternate type of adjacency
 342 K&M's adjacency is just whether or not an argument has been generated
 343 in the current direction yet. One could also make a stronger type of
 344 adjacency, where h and a are not adjacent if b is in between, eg. with
 345 the sentence "a b h" and the structure ((h->a), (a->b)), h is
 346 K&M-adjacent to a, but not next to a, since b is in between. It's easy
 347 to check this type of adjacency in inner(), but it needs new rules for
 348 P_STOP reestimation.
 349 * Expectation Maximation in IO/DMV-terms
 350 outer(s,t,Node) and inner(s,t,Node) calculates the expected number of
 351 trees (CNF-)headed by Node from *s* to *t* (sentence positions). This uses
 352 the P_STOP and P_CHOOSE values, which have been conveniently
 353 distributed into CNF rules as probN and probA (non-adjacent and
 354 adjacent probabilites).
 355
 356 When re-estimating, we use the expected values from outer() and
 357 inner() to get new values for P_STOP and P_CHOOSE. When we've
 358 re-estimated for the entire corpus, we distribute P_STOP and P_CHOOSE
 359 into the CNF rules again, so that in the next round we use new probN
 360 and probA to find outer- and inner-probabilites.
 361
 362 The distribution of P_STOP and P_CHOOSE into CNF rules also happens in
 363 init_normalize() (here along with the creation of P_STOP and
 364 P_CHOOSE); P_STOP is used to create CNF rules where one branch of the
 365 rule is STOP, P_CHOOSE is used to create rules of the form
 366 : h  -> h  _a_
 367 : h_ -> h_ _a_
 368
 369 Since "adjacency" is not captured in regular CNF rules, we need two
 370 probabilites for each rule, and outer() and inner() have to know when
 371 to use which.
 372
 373
 374 * Python-stuff
 375 # <<python>>
 376 Make those debug statements steal a bit less attention in emacs:
 377 :(font-lock-add-keywords
 378 : 'python-mode                   ; not really regexp, a bit slow
 379 : '(("^\\( *\\)\\(\\if +'.+' +in +io.DEBUG. *\\(
 380 :\\1    .+$\\)+\\)" 2 font-lock-preprocessor-face t)))
 381 :(font-lock-add-keywords
 382 : 'python-mode
 383 : '(("\\<\\(\\(io\\.\\)?debug(.+)\\)" 1 font-lock-preprocessor-face t)))
 384
 385 - [[file:src/pseudo.py][pseudo.py]]
 386 - http://nltk.org/doc/en/structured-programming.html recursive dynamic
 387 - http://nltk.org/doc/en/advanced-parsing.html
 388 - http://jaynes.colorado.edu/PythonIdioms.html
 389
 390
 391
 392 * Git
 393 Repository web page: http://repo.or.cz/w/dmvccm.git
 394
 395 Setting up a new project:
 396 : git init
 397 : git add .
 398 : git commit -m "first release"
 399
 400 Later on: (=-a= does =git rm= and =git add= automatically)
 401 : git init
 402 : git commit -a -m "some subsequent release"
 403
 404 Then push stuff up to the remote server:
 405 : git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master
 406
 407 (=eval `ssh-agent`= and =ssh-add= to avoid having to type in keyphrase all
 408 the time)
 409
 410 Make a copy of the (remote) master branch:
 411 : git clone git://repo.or.cz/dmvccm.git
 412
 413 Make and name a new branch in this folder
 414 : git checkout -b mybranch
 415
 416 To save changes in =mybranch=:
 417 : git commit -a
 418
 419 Go back to the master branch (uncommitted changes from =mybranch= are
 420 carried over):
 421 : git checkout master
 422
 423 Try out:
 424 : git add --interactive
 425
 426 Good tutorial:
 427 http://www-cs-students.stanford.edu/~blynn//gitmagic/