# -*- coding: mule-utf-8-unix -*-
#+TAGS: OPTIMIZE PRETTIER
#+TITLE: DMV/CCM -- todo-list / progress
#+AUTHOR: Kevin Brubeck Unhammer
#+EMAIL: K.BrubeckUnhammer at student uva nl
#+OPTIONS: H:4 toc:3 ^:{}
#+SEQ_TODO: TOGROK TODO DONE

* dmvccm report and project
DEADLINE: <2008-06-30 Mon>
- [[file:src/dmv.py][dmv.py]]
- [[file:src/io.py][io.py]]
- [[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]]

(But absolute, extended, really-quite-dead-now deadline: August 31...)
* TODO [#A] outer probabilities
There are six different configurations here, based on what the
(mother) node above is, and which Node we're computing for. Here *r*
is not between *s* and *t* as in inner(), but an /outer/ index. *loc_N*
is the location of the Node's head in the sentence, *loc_m* that of
the mother's head.

+ mother is a RIGHT-stop:
  - outer(*s, t*, mom.LHS, *loc_N*), no inner-call
  - adjacent iff *t* == *loc_N*
+ mother is a LEFT-stop:
  - outer(*s, t*, mom.LHS, *loc_N*), no inner-call
  - adjacent iff *s* == *loc_N*
+ Node is on the LEFT branch (mom.L == Node)
  * and mother is a LEFT attachment (Node is the argument):
    - *loc_N* will be in the LEFT branch, can be anything here.
    - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_m*)
      for all possible *loc_m* in the right part of the sentence.
    - outer(*s, r*, mom.LHS, *loc_m*).
    - adjacent iff *t+1* == *loc_m*
  * and mother is a RIGHT attachment (Node is the head, so *loc_m* == *loc_N*):
    - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_R*)
      for all possible *loc_R* in the right part of the sentence.
    - outer(*s, r*, mom.LHS, *loc_N*).
    - adjacent iff *t* == *loc_N*
+ Node is on the RIGHT branch (mom.R == Node)
  * and mother is a LEFT attachment (Node is the head, so *loc_m* == *loc_N*):
    - In the LEFT, attached, branch we find inner(*r, s-1*, mom.L, *loc_L*)
      for all possible *loc_L* in the left part of the sentence.
    - outer(*r, t*, mom.LHS, *loc_N*).
    - adjacent iff *s* == *loc_N*
  * and mother is a RIGHT attachment (Node is the argument):
    - *loc_N* will be in the RIGHT branch, can be anything here.
    - In the LEFT, attached, branch we find inner(*r, s-1*, mom.L, *loc_m*)
      for all possible *loc_m* in the left part of the sentence.
    - outer(*r, t*, mom.LHS, *loc_m*).
    - adjacent iff *s-1* == *loc_m*
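Summing up, the adjacency tests per configuration could be tabulated
like this (a sketch; the function name and the config labels are made
up, and it assumes the loc_N/loc_m readings above):

```python
def outer_adjacent(config, s, t, loc_N=None, loc_m=None):
    """Adjacency test for each outer() configuration.

    config labels are hypothetical shorthands for the cases above;
    loc_N is the Node's head location, loc_m the mother's.
    """
    tests = {
        'R-stop':         t == loc_N,      # mother is a RIGHT-stop
        'L-stop':         s == loc_N,      # mother is a LEFT-stop
        'L-branch/L-att': t + 1 == loc_m,  # Node is the argument
        'L-branch/R-att': t == loc_N,      # Node is the head
        'R-branch/L-att': s == loc_N,      # Node is the head
        'R-branch/R-att': s - 1 == loc_m,  # Node is the argument
    }
    return tests[config]
```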
[[file:outer_attachments.jpg]]

\Tree [.{$h\urcorner$} [.{$_s \ulcorner a\urcorner _t$\\
Node} ] {$_{t+1} h\urcorner _r$\\
h} ]

\Tree [.{$h$} {$_{s} h _{t}$\\
Node} [.{$_{t+1} \ulcorner a\urcorner _r$\\
R} ] ]

\Tree [.{$h\urcorner$} [.{$_r \ulcorner a\urcorner _{s-1}$\\
L} ] {$_{s} h\urcorner _t$\\
Node} ]

\Tree [.{$h$} {$_{r} h _{s-1}$\\
L} [.{$_{s} \ulcorner a\urcorner _t$\\
Node} ] ]
* TODO [#B] P_STOP and P_CHOOSE for IO/EM (reestimation)
[[file:src/dmv.py::DMV%20probabilities][dmv-P_STOP]]

Remember: the P_{STOP} formula is upside-down (and flipped
left-to-right) in the article, though not in the thesis.
** TOGROK [#A] How do we only count from completed trees?
The chart from inner() contains several entries where the only ways to
reach them (from above) are through nodes that have zero probability;
we don't want these to give counts for reestimation.

prune() weeds out entries with only zero-probability mothers (or
grandmothers), but is incredibly slow (reestimating with pruning takes
almost 50 times longer).

Another solution is to add
** P_STOP formulas for various dir and adj:
- P_{STOP}(STOP|h,L,non_adj) = \sum_{corpus} \sum_{s<loc(h)} \sum_{t}
  inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s<loc(h)} \sum_{t}
  inner(s,t,h, ...)
- P_{STOP}(STOP|h,L,adj) = \sum_{corpus} \sum_{s=loc(h)} \sum_{t}
  inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s=loc(h)} \sum_{t}
  inner(s,t,h, ...)
- P_{STOP}(STOP|h,R,non_adj) = \sum_{corpus} \sum_{s} \sum_{t>loc(h)}
  inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s} \sum_{t>loc(h)}
  inner(s,t,h, ...)
- P_{STOP}(STOP|h,R,adj) = \sum_{corpus} \sum_{s} \sum_{t=loc(h)}
  inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s} \sum_{t=loc(h)}
  inner(s,t,h, ...)

(And P_{STOP}(-STOP|...) = 1 - P_{STOP}(STOP|...) )
- P_{CHOOSE}(a|h,L) = \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)} \sum_{r<t}
  inner(s,r,_a_, ...) / \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)}
  \sum_{a'} \sum_{r<t} inner(s,r,_a'_, ...)
  - t >= loc(h) since there are many possibilities for
    right-attachments below, and each of them alone gives a lower
    probability (through multiplication) to the upper tree, so we sum
    over t as well.
- P_{CHOOSE}(a|h,R) = \sum_{corpus} \sum_{s=loc(h)} \sum_{r>s}
  inner(s,r,_a_, ...) / \sum_{corpus} \sum_{s=loc(h)} \sum_{a'} \sum_{r>s}
  inner(s,r,_a'_, ...)
* TOGROK Find Most Probable Parse of given test sentence
We could probably do it pretty easily from the chart:
- for all loc_h, select the (0, len(sent), ROOT, loc_h) that has the highest p
- for stops, just keep going
- for rewrites, for loc_dep select the one that has the highest p
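The root-selection step could look something like this (a sketch; the
chart layout, keyed on (s, t, label, loc_h), is an assumption):

```python
def best_root(chart, sent_len, root='ROOT'):
    """Pick the most probable spanning ROOT entry from an inner() chart.

    chart maps (s, t, label, loc_h) -> probability (assumed layout).
    """
    candidates = [((s, t, lbl, loc_h), p)
                  for (s, t, lbl, loc_h), p in chart.items()
                  if s == 0 and t == sent_len and lbl == root]
    return max(candidates, key=lambda kv: kv[1])

# toy chart over a 3-word sentence:
chart = {(0, 3, 'ROOT', 0): 0.1,
         (0, 3, 'ROOT', 1): 0.4,
         (0, 3, 'ROOT', 2): 0.2,
         (0, 2, 'ROOT', 1): 0.9}   # not spanning, must be ignored
```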
* TOGROK Combine CCM with DMV
* TODO Get a dependency parsed version of WSJ to test with
* Initialization
[[file:~/Documents/Skole/V08/Probability/dmvccm/src/dmv.py::Initialization%20todo][dmv-inits]]

We do have to go through the corpus, since the probabilities are based
on how far away in the sentence arguments are from their heads.
** TOGROK CCM Initialization
P_{SPLIT} used here... how, again?
** DONE Separate initialization to another file? :PRETTIER:
CLOSED: [2008-06-08 Sun 12:51]
[[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]]
** DONE DMV Initialization probabilities
(from initialization frequency)
** DONE DMV Initialization frequencies
CLOSED: [2008-05-27 Tue 20:04]
P_{STOP} is not well defined by K&M. One possible interpretation given
the sentence [det nn vb nn] is:
: f_{STOP}( STOP|det, L, adj) +1
: f_{STOP}(-STOP|det, L, adj) +0
: f_{STOP}( STOP|det, L, non_adj) +1
: f_{STOP}(-STOP|det, L, non_adj) +0
: f_{STOP}( STOP|det, R, adj) +0
: f_{STOP}(-STOP|det, R, adj) +1

: f_{STOP}( STOP|nn, L, adj) +0
: f_{STOP}(-STOP|nn, L, adj) +1
: f_{STOP}( STOP|nn, L, non_adj) +1 # since there's at least one to the left
: f_{STOP}(-STOP|nn, L, non_adj) +0
In code, some possible interpretations:
: f[head, 'STOP', 'LN'] += (i_h <= 1)           # first two words
: f[head, '-STOP', 'LN'] += (not i_h <= 1)
: f[head, 'STOP', 'LA'] += (i_h == 0)           # very first word
: f[head, '-STOP', 'LA'] += (not i_h == 0)
: f[head, 'STOP', 'RN'] += (i_h >= n - 2)       # last two words
: f[head, '-STOP', 'RN'] += (not i_h >= n - 2)
: f[head, 'STOP', 'RA'] += (i_h == n - 1)       # very last word
: f[head, '-STOP', 'RA'] += (not i_h == n - 1)

Or:
: # this one requires some additional rewriting since it
: # introduces divisions by zero
: f[head, 'STOP', 'LN'] += (i_h == 1)           # second word
: f[head, '-STOP', 'LN'] += (not i_h <= 1)      # not first two
: f[head, 'STOP', 'LA'] += (i_h == 0)           # first word
: f[head, '-STOP', 'LA'] += (not i_h == 0)      # not first
: f[head, 'STOP', 'RN'] += (i_h == n - 2)       # second-to-last
: f[head, '-STOP', 'RN'] += (not i_h >= n - 2)  # not last two
: f[head, 'STOP', 'RA'] += (i_h == n - 1)       # last word
: f[head, '-STOP', 'RA'] += (not i_h == n - 1)  # not last

Or:
: f[head, 'STOP', 'LN'] += (i_h == 1)           # second word
: f[head, '-STOP', 'LN'] += (not i_h == 1)      # not second
: f[head, 'STOP', 'LA'] += (i_h == 0)           # first word
: f[head, '-STOP', 'LA'] += (not i_h == 0)      # not first
: f[head, 'STOP', 'RN'] += (i_h == n - 2)       # second-to-last
: f[head, '-STOP', 'RN'] += (not i_h == n - 2)  # not second-to-last
: f[head, 'STOP', 'RA'] += (i_h == n - 1)       # last word
: f[head, '-STOP', 'RA'] += (not i_h == n - 1)  # not last
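For reference, the last variant as a runnable sketch (the function
name and the f layout are ad hoc):

```python
from collections import defaultdict

def stop_freqs(sent):
    """Initialization STOP-frequencies, last interpretation above."""
    f = defaultdict(float)
    n = len(sent)
    for i_h, head in enumerate(sent):
        f[head, 'STOP', 'LN'] += (i_h == 1)       # second word
        f[head, '-STOP', 'LN'] += (i_h != 1)      # not second
        f[head, 'STOP', 'LA'] += (i_h == 0)       # first word
        f[head, '-STOP', 'LA'] += (i_h != 0)      # not first
        f[head, 'STOP', 'RN'] += (i_h == n - 2)   # second-to-last
        f[head, '-STOP', 'RN'] += (i_h != n - 2)  # not second-to-last
        f[head, 'STOP', 'RA'] += (i_h == n - 1)   # last word
        f[head, '-STOP', 'RA'] += (i_h != n - 1)  # not last
    return f
```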
"All words take the same number of arguments" is interpreted as e.g.:
: p_STOP(head, 'STOP', 'LN') = 0.3
: p_STOP(head, 'STOP', 'LA') = 0.5
: p_STOP(head, 'STOP', 'RN') = 0.4
: p_STOP(head, 'STOP', 'RA') = 0.7
(values which we can easily tweak in init_zeros())
Go through the corpus, counting distances between heads and
arguments. In [det nn vb nn], we give
- f_{CHOOSE}(nn|det, R) +1/1 + C
- f_{CHOOSE}(vb|det, R) +1/2 + C
- f_{CHOOSE}(nn|det, R) +1/3 + C
- If this were the full corpus, P_{CHOOSE}(nn|det, R) would be
  (1 + 1/3 + 2C) / \sum_a f_{CHOOSE}(a|det, R)

The ROOT gets "each argument with equal probability", so in a sentence
of three words, 1/3 for each (in [nn vb nn], 'nn' gets 2/3); basically
just a frequency count of the corpus...
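That counting scheme as a runnable sketch (C is the smoothing constant
from above; the value used here is made up):

```python
from collections import defaultdict

C = 0.1  # made-up value for the smoothing constant

def choose_freqs(sent):
    """f_CHOOSE counts, weighted by inverse head-argument distance."""
    f = defaultdict(float)
    for i, h in enumerate(sent):
        for j, a in enumerate(sent):
            if i == j:
                continue
            direction = 'R' if j > i else 'L'
            f[a, h, direction] += 1.0 / abs(i - j) + C
    return f
```

On [det nn vb nn] this gives f_CHOOSE(nn|det, R) = 1 + 1/3 + 2C, as in
the example above.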
http://wiki.python.org/moin/PythonSpeed/PerformanceTips
E.g., use map/reduce/filter/[i for i in ...]/(i for i in ...) instead
of for-loops; use local variables for globals (global variable and
attribute lookups are slower than locals).
** TODO when reestimating P_STOP etc, remove rules with p < epsilon :OPTIMIZE:
** TODO inner_dmv, short ranges and impossible attachment :OPTIMIZE:
If t-s <= 2, there can be only one attachment below, so don't recurse
with both Lattach=True and Rattach=True.

If t-s <= 1, there can be no attachment below, so only recurse with
Lattach=False, Rattach=False.

Put this in the loop under the rewrite rules (could also do it in the
STOP section, but that would only have an effect on very short
sentences).
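A sketch of that guard (Lattach/Rattach as hypothetical boolean flags;
this just encodes the claims above, with spans inclusive s..t):

```python
def attach_flags(s, t):
    """Which (Lattach, Rattach) combinations to recurse with below s..t."""
    if t - s <= 1:  # no room for any attachment below
        return [(False, False)]
    if t - s == 2:  # room for at most one attachment below
        return [(True, False), (False, True), (False, False)]
    return [(True, True), (True, False),
            (False, True), (False, False)]
```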
** TODO clean up the module files :PRETTIER:
Is there a better way to divide up dmv and harmonic? There's a two-way
dependency between the modules. Guess there could be a third file that
imports both the initialization and the actual EM stuff, while a file
containing constants and classes could be imported by all the others:
: dmv.py imports dmv_EM.py imports dmv_classes.py
: dmv.py imports dmv_inits.py imports dmv_classes.py
** TOGROK Some (tagged) sentences are bound to come twice :OPTIMIZE:
E.g., first sort and count, so that the corpus
: [['nn','vbd','det','nn'],
:  ['vbd','nn','det','nn'],
:  ['nn','vbd','det','nn']]
becomes
: [(['nn','vbd','det','nn'], 2),
:  (['vbd','nn','det','nn'], 1)]
and then, in each loop through the sentences, weight the counts from a
sentence by its multiplicity instead of parsing it again.

Is there much to gain here?
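The sort-and-count step as a sketch with collections.Counter (using
tuples, since lists aren't hashable):

```python
from collections import Counter

corpus = [['nn', 'vbd', 'det', 'nn'],
          ['vbd', 'nn', 'det', 'nn'],
          ['nn', 'vbd', 'det', 'nn']]

# count identical tagged sentences once, remembering multiplicity
counted = Counter(tuple(s) for s in corpus)

# each pass then loops over counted.items(), weighting expected
# counts by the multiplicity instead of re-parsing duplicates
```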
** TOGROK tags as numbers or tags as strings? :OPTIMIZE:
Need to clean up the representation.

Stick with tag-strings in initialization, then switch to numbers for
the IO-algorithm, perhaps? We can probably afford more string-matching
in initialization.
** DONE if loc_h == t, no need to try right-attachment rules &v.v. :OPTIMIZE:
CLOSED: [2008-06-10 Tue 14:34]
(And if loc_h == s, no need to try left-attachment rules.)

Modest speed increase (5%).
** DONE io.debug parameters should not call functions :OPTIMIZE:
CLOSED: [2008-06-10 Tue 12:26]
Exchanged all io.debug(str, 'level') calls with statements of the form:
: if 'level' in io.DEBUG:
:     print ...
and got an almost threefold speed increase on inner().
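The point in miniature (io and the names here are stand-ins): the
guard keeps the expensive argument from ever being evaluated when the
level is off.

```python
DEBUG = set()        # no debug levels enabled
calls = {'n': 0}     # counts how often the costly argument is built

def expensive_repr(chart):
    calls['n'] += 1  # stands in for costly formatting work
    return str(sorted(chart.items()))

chart = {(0, 1): 0.5}

# old style: the argument is evaluated even when nothing is printed
#   io.debug(expensive_repr(chart), 'inner')

# new style: the cost is only paid when the level is enabled
if 'inner' in DEBUG:
    print(expensive_repr(chart))
```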
** DONE inner_dmv() should disregard rules with heads not in sent :OPTIMIZE:
CLOSED: [2008-06-08 Sun 10:18]
If the sentence is "nn vbd det nn", we should not even look at rules
where
: rule.head() not in "nn vbd det nn".split()
This is ruled out by getting rules from g.rules(LHS, sent).

Also, we optimize this further by not even recursing into
attachment rules where
: rule.head() not in sent[ s :r+1]
: rule.head() not in sent[r+1:t+1]
meaning, if we're looking at the span "vbd det", we only use
attachment rules where both daughters are members of ['vbd','det']
(although we don't (yet) care about removing rules that rewrite to the
same tag if there are no duplicate tags in the span, etc.; that would
be a lot of trouble for little potential gain).
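A toy version of the span filter (Rule and the rule set here are
stand-ins for the real classes):

```python
class Rule:
    """Minimal stand-in for a DMV rule: LHS -> L R."""
    def __init__(self, lhs, left, right):
        self.lhs, self.left, self.right = lhs, left, right
    def head(self):
        return self.lhs

def rules_for_span(rules, sent, s, t):
    """Keep only rules whose daughters both occur in sent[s:t+1]."""
    span = set(sent[s:t + 1])
    return [r for r in rules
            if r.left in span and r.right in span]

sent = 'nn vbd det nn'.split()
rules = [Rule('vbd', 'vbd', 'det'),   # both daughters in "vbd det"
         Rule('vbd', 'nn', 'vbd'),    # 'nn' outside that span
         Rule('jj', 'jj', 'nn')]      # head not even in the sentence
```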
* Adjacency and combining it with inner()
Each DMV_Rule now has both a probN and a probA, for
adjacencies. inner() needs the correct one in each case.

Adjacency gives a problem with duplicate words/tags, e.g. in the
sentence "a a b". If this has the dependency structure b->a_{0}->a_{1},
then b is non-adjacent to a_{0} and should use probN (for the LRStop and
the attachment of a_{0}), while the other rules should all use
probA. But within e(0,2,b) we can't just say "oh, a has index 0
so it's not adjacent to 2", since there's also an a at index 1, and
there's also a dependency structure b->a_{1}->a_{0} for that. We want
both (and all the analogous readings in much more complex cases).
So now we call inner() for each location of a head, and on each
terminal, loc_h must equal s (and t). In the recursive attachment
calls, we use the locations (sentence indices) of words to the left or
right of the head in calls to inner(). loc_h lets us check whether a
given attachment is adjacent or not.

In each inner() call, loc_h is the location of the root of this
dependency structure.
** Possible alternate type of adjacency
K&M's adjacency is just whether or not an argument has been generated
in the current direction yet. One could also make a stronger type of
adjacency, where h and a are not adjacent if b is in between, e.g. with
the sentence "a b h" and the structure ((h->a), (a->b)): h is
K&M-adjacent to a, but not next to a, since b is in between. It's easy
to check this type of adjacency in inner(), but it would need new
rules.
* Expectation Maximization in IO/DMV-terms
inner(s,t,LHS) calculates the expected number of trees headed by LHS
from s to t (sentence positions). This uses the P_STOP and P_CHOOSE
values, which have been conveniently distributed into CNF rules as
probN and probA (non-adjacent and adjacent probabilities).

When re-estimating, we use the expected values from inner() to get new
values for P_STOP and P_CHOOSE. When we've re-estimated for the entire
corpus, we distribute P_STOP and P_CHOOSE into the CNF rules again, so
that in the next round we use the new probN and probA to find the new
expected values.

The distribution of P_STOP and P_CHOOSE into CNF rules also happens in
init_normalize() (here along with the creation of P_STOP and
P_CHOOSE); P_STOP is used to create CNF rules where one branch of the
rule is STOP, P_CHOOSE is used to create the attachment rules.

Since "adjacency" is not captured in regular CNF rules, we need two
probabilities for each rule, and inner() has to know when to use which.
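In miniature (the field names probN/probA are from these notes; the
class shape and the selection method are made up):

```python
class DMV_Rule:
    """A CNF rule carrying both adjacent and non-adjacent probabilities."""
    def __init__(self, lhs, left, right, probN, probA):
        self.lhs, self.left, self.right = lhs, left, right
        self.probN, self.probA = probN, probA  # non-adjacent / adjacent

    def prob(self, adjacent):
        """inner() picks the right probability per configuration."""
        return self.probA if adjacent else self.probN

r = DMV_Rule('vbd', 'nn', 'vbd', probN=0.2, probA=0.7)
```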
** TODO Corpus access
** TOGROK sentences or rules as the "outer loop"? :OPTIMIZE:
In regard to the E/M-step, finding P_{STOP}, P_{CHOOSE}.
Make those debug statements steal a bit less attention in emacs:
: (font-lock-add-keywords
:  'python-mode ; not really regexp, a bit slow
:  '(("^\\( *\\)\\(\\if +'.+' +in +io.DEBUG. *\\(
: \\1 .+$\\)+\\)" 2 font-lock-preprocessor-face t)))
: (font-lock-add-keywords
:  'python-mode
:  '(("\\<\\(\\(io\\.\\)?debug(.+)\\)" 1 font-lock-preprocessor-face t)))
- [[file:src/pseudo.py][pseudo.py]]
- http://nltk.org/doc/en/structured-programming.html recursive dynamic
  programming
- http://nltk.org/doc/en/advanced-parsing.html
- http://jaynes.colorado.edu/PythonIdioms.html
* Git
Repository web page: http://repo.or.cz/w/dmvccm.git

Setting up a new project:
: git init
: git add .
: git commit -m "first release"

Later on: (=-a= does =git rm= and =git add= automatically)
: git commit -a -m "some subsequent release"

Then push stuff up to the remote server:
: git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master

(=eval `ssh-agent`= and =ssh-add= to avoid having to type in the
passphrase all the time.)

Make a copy of the (remote) master branch:
: git clone git://repo.or.cz/dmvccm.git

Make and name a new branch in this folder:
: git checkout -b mybranch

To save changes in =mybranch=:
: git commit -a

Go back to the master branch (uncommitted changes from =mybranch= are
carried along if they don't conflict):
: git checkout master

To stage only parts of a file:
: git add --interactive

More at http://www-cs-students.stanford.edu/~blynn/gitmagic/