DMVCCM.org

   1 # -*- coding: mule-utf-8-unix -*-
   2
   3 #+STARTUP: overview
   4 #+TAGS: OPTIMIZE PRETTIER
   5 #+STARTUP: hidestars
   6 #+TITLE: DMV/CCM -- todo-list / progress
   7 #+AUTHOR: Kevin Brubeck Unhammer
   8 #+EMAIL: K.BrubeckUnhammer at student uva nl
   9 #+OPTIONS: ^:{}  skip:t
  10 #+LANGUAGE: en
  11 #+SEQ_TODO: TOGROK TODO DONE
  12
  13 [[file:src/main.py][main.py]]
  14 [[file:src/wsjdep.py][wsjdep.py]]
  15 [[file:src/loc_h_dmv.py][loc_h_dmv.py]]
  16
  17 Meta-todo:
  18 - debug reestimate2 which stores charts for all sentences and has
  19   arguments as the outer loop
  20   - have to fix the lack of attachment probabilities...
  21 - fix cnf outer
  22
  23 [[file:DMVCCM.html][DMVCCM.html]]
  24
  25 * DMV/CCM report and project
  26 - DMV-[[file:tex/formulas.pdf][formulas.pdf]]  -- /clear/ information =D
  27 - [[file:src/main.py][main.py]] -- evaluation, corpus likelihoods
  28 - [[file:src/wsjdep.py][wsjdep.py]] -- corpus
  29
  30 - [[file:src/loc_h_dmv.py][loc_h_dmv.py]] -- DMV-IO and reestimation
  31 - [[file:src/loc_h_harmonic.py][loc_h_harmonic.py]] -- DMV initialization
  32
  33 - [[file:src/common_dmv.py][common_dmv.py]] -- various functions used by loc_h_dmv and others
  34 - [[file:src/io.py][io.py]] -- non-DMV IO
  35
  36 - [[file:src/cnf_dmv.py][cnf_dmv.py]] -- cnf-like implementation of DMV
  37 - [[file:src/cnf_harmonic.py][cnf_harmonic.py]] -- initialization for cnf_dmv
  38
  39 [[http://www.student.uib.no/~kun041/dmvccm/DMVCCM_archive.html][Archived entries]] from this file.
  40 * Notation
  41 : old notes:   new notes:   in tex/code (constants):    in Klein thesis:
  42 :--------------------------------------------------------------------------------------
  43 : _h_            _h_            SEAL                    bar over h
  44 :  h_             h><           RGOL                    right-under-left-arrow over h
  45 :  h              h>            GOR                     right-arrow over h
  46 :
  47 :               ><h             LGOR                    left-under-right-arrow over h
  48 :                <h             GOL                     left-arrow over h
  49 These are represented in the code as pairs =(s_h,h)=, where =h= is an
  50 integer (POS-tag) and =s_h= \in ={SEAL,RGOL,GOR,LGOR,GOL}=.
  51
  52 =P_ATTACH= and =P_CHOOSE= are synonymous, I try to use the
  53 former. Also,
  54 : P_GO_AT(a|h,dir,adj) := P_ATTACH(a|h,dir)*(1-P_STOP(STOP|h,dir,adj))
  55
  56 (precalculated after each reestimation with =g.p_GO_AT = make_GO_AT(g.p_STOP,g.p_ATTACH)=)
  57 ** COMMENT qtrees, tex
  58 \usepackage{qtree}
  59 \usepackage{amssymb}
  60
  61 \newcommand{\GOR}[1]{\overrightarrow{#1}}
  62 \newcommand{\RGOL}[1]{\overleftarrow{\overrightarrow{#1}}}
  63
  64 \newcommand{\SEAL}[1]{\overline{#1}}
  65
  66 \newcommand{\LGOR}[1]{\overrightarrow{\overleftarrow{#1}}}
  67 \newcommand{\GOL}[1]{\overleftarrow{#1}}
  68
  69 \Tree [.{$\RGOL{h}$} [.{$_s \SEAL{a} _t$\\
  70   Node} ] {$_{t+1} \RGOL{h} _r$\\
  71   R} ]
  72 \Tree [.{$\GOR{h}$} {$_{s} \GOR{h} _{t}$\\
  73   Node} [.{$_{t+1} \SEAL{a} _r$\\
  74   R} ] ]
  75 \Tree [.{$\RGOL{h}$} [.{$_r \SEAL{a} _{s-1}$\\
  76   L} ] {$_{s} \RGOL{h} _t$\\
  77   Node} ]
  78 \Tree [.{$\GOR{h}$} {$_{r} \GOR{h} _{s-1}$\\
  79   L} [.{$_{s} \SEAL{a} _t$\\
  80   Node} ] ]
  81
  82
  83 \Tree [.{$h\urcorner$} [.{$_s \ulcorner a\urcorner _t$\\
  84   Node} ] {$_{t+1} h\urcorner _r$\\
  85   R} ]
  86 \Tree [.{$h$} {$_{s} h _{t}$\\
  87   Node} [.{$_{t+1} \ulcorner a\urcorner _r$\\
  88   R} ] ]
  89 \Tree [.{$h\urcorner$} [.{$_r \ulcorner a\urcorner _{s-1}$\\
  90   L} ] {$_{s} h\urcorner _t$\\
  91   Node} ]
  92 \Tree [.{$h$} {$_{r} h _{s-1}$\\
  93   L} [.{$_{s} \ulcorner a\urcorner _t$\\
  94   Node} ] ]
  95 * Testing the dependency parsed WSJ
  96 [[file:src/wsjdep.py][wsjdep.py]] uses NLTK (sort of) to get a dependency parsed version of
  97 WSJ10 into the format used in mpp() in loc_h_dmv.py.
  98
  99 As a default, =WSJDepCorpusReader= looks for the file =wsj.combined.10.dep= in
 100 =../corpus/wsjdep=.
 101
 102 Only =sents()=, =tagged_sents()= and =parsed_sents()= (plus a new function
 103 =tagonly_sents()=) are implemented, the other NLTK corpus functions are
 104 ..um.. undefined...
 105 ** TODO [#A] Should =def evaluate= use add_root?
 106 [[file:src/main.py::def%20evaluate%20g%20tagonly_sents%20parsed_sents][main.py]] evaluate
 107 [[file:src/wsjdep.py][wsjdep.py]] add_root
 108
 109 (just has to count how many pairs are in there; Precision and Recall)
 110 * TODO [#C] Alternative CNF for DMV
 111
 112 # <<dmv2cnf>>
 113 - [[file:src/cnf_dmv.py][cnf_dmv.py]]
 114 - [[file:src/cnf_harmonic.py][cnf_harmonic.py]]
 115
 116 See section 5 of [[file:tex/formulas.pdf][formulas.pdf]].
 117
 118 Given a grammar with certain p_ATTACH, p_STOP and p_ROOT, we get:
 119 :>>> print testgrammar_h():
 120 :  h>< -->   h>  STOP   [0.30]
 121 :  h>< -->  >h>  STOP   [0.40]
 122 : _h_  --> STOP    h><  [1.00]
 123 : _h_  --> STOP   <h><  [1.00]
 124 : >h>  -->   h>   _h_   [1.00]
 125 : >h>  -->  >h>   _h_   [1.00]
 126 : <h>< -->  _h_    h><  [0.70]
 127 : <h>< -->  _h_   <h><  [0.60]
 128 :ROOT  --> STOP   _h_   [1.00]
 129
 130 ** TODO [#A] Make and implement an equivalent grammar that's /pure/ CNF
 131 ...since I'm not sure about my unary reestimation rules (section 5 of
 132 [[file:tex/formulas.pdf][formulas]]).
 133
 134 :  h>< -->   h>  STOP
 135 :  h>< -->  >h>  STOP
 136 : _h_  --> STOP    h><
 137 : _h_  --> STOP   <h><
 138 : >h>  -->   h>   _a_
 139 : >h>  -->  >h>   _a_
 140 : <h>< -->  _a_    h><
 141 : <h>< -->  _a_   <h><
 142 : ROOT -->  _a_   <h>< p_ROOT(h) * p_ATTACH(a|h,L)
 143 : ROOT -->  _a_    h>  p_ROOT(h) * p_ATTACH(a|h,R)
 144 : ROOT -->   h         p_ROOT(h)
 145
 146
 147 ** TOGROK [#A] convert L&Y-based reestimation into P_ATTACH and P_STOP values
 148 Sum over the various rules? Or something? Must think of this.
 149 ** TODO [#C] move as much as possible into common_dmv.py
 150 [[file:src/common_dmv.py][common_dmv.py]]
 151 ** DONE L&Y-based reestimation for cnf_dmv
 152    CLOSED: [2008-08-21 Thu 16:35]
 153 ** DONE dmv2cnf re-estimation formulas
 154    CLOSED: [2008-08-21 Thu 16:36]
 155 ** DONE inner and outer for cnf_dmv.py, also cnf_harmonic.py
 156 * TOGROK Combine CCM with DMV
 157
 158 # <<comboquestions>>
 159
 160 Questions about the =P_COMBO= info in [[http://www.eecs.berkeley.edu/~klein/papers/klein_thesis.pdf][Klein's thesis]]:
 161 - Page 109 (pdf: 125): We have to premultiply "all our probabilities"
 162   by the CCM base product /\Pi_{<i,j>}
 163   P_{SPAN}(\alpha(i,j,s)|false)P_{CONTEXT}(\beta(i,j,s)|false)/; which
 164   probabilities are included under "all"? I'm assuming this includes
 165   =P_ATTACH= since each time =P_ATTACH= is used, /\phi/ is multiplied in
 166   (pp.110-111 ibid.); but /\phi/ is not used for STOPs, so should we not
 167   have our CCM product multiplied in there? How about =P_ROOT=?
 168   (Guessing =P_ORDER= is way out of the question...)
 169 - For the outside probabilities, is it correct to assume we multiply
 170   in /\phi(j,k)/ or /\phi(k,i)/ when calculating =inner(i,j...)=? (Eg., only
 171   for the outside part, not for the whole range.) I don't understand
 172   the notation in =O()= on p.103.
 173 * TOGROK Reestimate P_ORDER ?
 174 * Most Probable Parse
 175 ** TOGROK Find MPP with CCM
 176 ** DONE Find Most Probable Parse of given test sentence, in DMV
 177   CLOSED: [2008-07-23 Wed 10:56]
 178 inner() optionally keeps track of the highest probability children of
 179 any node in =mpptree=. Say we're looking for =inner(i,j,(s_h,h),loc_h)= in
 180 a certain sentence, and we find some possible left and right children,
 181 we add to =mpptree[i,j,(s_h,h),loc_h]= the triple =(p, L, R)= where =L= and
 182 =R= are of the same form as the key (=i,j,(s_h,h),loc_h=) and =p= is the
 183 probability of this node rewriting to =L= and =R=,
 184 eg. =inner(L)*inner(R)*p_GO_AT= or =p_STOP= or whatever. We only add this
 185 entry to =mpptree= if there wasn't a higher-probability entry there
 186 before.
 187
 188 Then, after =inner_sent= makes an =mpptree=, we find the /relevant/
 189 head-argument pairs by searching through the tree using a queue,
 190 adding the =L= and =R= keys of any entry to the queue as we find them
 191 (skipping =STOP= keys), and adding any attachment entries to a set of
 192 triples =(head,argument,dir)=. Thus we have our most probable parse,
 193 eg.
 194 : set([( ROOT, (vbd,2),RIGHT),
 195 :      ((vbd,2),(nn,1),LEFT),
 196 :      ((vbd,2),(nn,3),RIGHT),
 197 :      ((nn,1),(det,0),LEFT)])
 198 * Initialization
 199 [[file:~/Documents/Skole/V08/Probability/dmvccm/src/dmv.py::Initialization%20todo][dmv-inits]]
 200
 201 We go through the corpus, since the probabilities are based on how far
 202 away in the sentence arguments are from their heads.
 203 ** TOGROK CCM Initialization
 204 P_{SPLIT} used here... how, again?
 205 * [#C] Deferred
 206 http://wiki.python.org/moin/PythonSpeed/PerformanceTips Eg., use
 207 map/reduce/filter/[i for i in [i's]]/(i for i in [i's]) instead of
 208 for-loops; use local variables for globals (global variables or
 209 functions), etc.
 210 ** TODO Clean up reestimation code                                    :PRETTIER:
 211 ** TODO [#A] compare speed of w_left/right(...) and w(LEFT/RIGHT, ...) :OPTIMIZE:
 212 ** TODO when reestimating P_STOP etc, remove rules with p < epsilon   :OPTIMIZE:
 213 ** TODO inner_dmv, short ranges and impossible attachment             :OPTIMIZE:
 214 If t-s <= 2, there can be only one attachment below, so don't recurse
 215 with both Lattach=True and Rattach=True.
 216
 217 If t-s <= 1, there can be no attachment below, so only recurse with
 218 Lattach=False, Rattach=False.
 219
 220 Put this in the loop under rewrite rules (could also do it in the STOP
 221 section, but that would only have an effect on very short sentences).
 222 ** TODO clean up the module files                                     :PRETTIER:
 223 Is there a better way to divide dmv and harmonic? There's a two-way
 224 dependency between the modules. Guess there could be a third file that
 225 imports both the initialization and the actual EM stuff, while a file
 226 containing constants and classes could be imported by all others:
 227 : dmv.py imports dmv_EM.py imports dmv_classes.py
 228 : dmv.py imports dmv_inits.py imports dmv_classes.py
 229
 230 ** TOGROK Some (tagged) sentences are bound to come twice             :OPTIMIZE:
 231 Eg, first sort and count, so that the corpus
 232 [['nn','vbd','det','nn'],
 233  ['vbd','nn','det','nn'],
 234  ['nn','vbd','det','nn']]
 235 becomes
 236 [(['nn','vbd','det','nn'],2),
 237  (['vbd','nn','det','nn'],1)]
 238 and then in each loop through sentences, make sure we handle the
 239 frequency correctly.
 240
 241 Is there much to gain here?
 242
 243 ** TOGROK tags as numbers or tags as strings?                         :OPTIMIZE:
 244 Need to clean up the representation.
 245
 246 Stick with tag-strings in initialization then switch to numbers for
 247 IO-algorithm perhaps? Can probably afford more string-matching in
 248 initialization..
 249 * Adjacency and combining it with the inside-outside algorithm
 250 Each DMV_Rule has both a probN and a probA, for adjacencies. inner()
 251 and outer() need the correct one in each case.
 252
 253 In each inner() call, loc_h is the location of the head of this
 254 dependency structure. In each outer() call, it's the head of the /Node/,
 255 the structure we're looking outside of.
 256
 257 We call inner() for each location of a head, and on each terminal,
 258 loc_h must equal =i= (and =loc_h+1= equal =j=). In the recursive attachment
 259 calls, we use the locations (sentence indices) of words to the left or
 260 right of the head in calls to inner(). /loc_h lets us check whether we
 261 need probN or probA/.
 262 ** Possible alternate type of adjacency
 263 K&M's adjacency is just whether or not an argument has been generated
 264 in the current direction yet. One could also make a stronger type of
 265 adjacency, where h and a are not adjacent if b is in between, eg. with
 266 the sentence "a b h" and the structure ((h->a), (a->b)), h is
 267 K&M-adjacent to a, but not next to a, since b is in between. It's easy
 268 to check this type of adjacency in inner(), but it needs new rules for
 269 P_STOP reestimation.
 270 * Python-stuff
 271 # <<python>>
 272 Make those debug statements steal a bit less attention in emacs:
 273 :(font-lock-add-keywords
 274 : 'python-mode                   ; not really regexp, a bit slow
 275 : '(("^\\( *\\)\\(\\if +'.+' +in +io.DEBUG. *\\(
 276 :\\1    .+$\\)+\\)" 2 font-lock-preprocessor-face t)))
 277 :(font-lock-add-keywords
 278 : 'python-mode
 279 : '(("\\<\\(\\(io\\.\\)?debug(.+)\\)" 1 font-lock-preprocessor-face t)))
 280
 281 - [[file:src/pseudo.py][pseudo.py]]
 282 - http://nltk.org/doc/en/structured-programming.html recursive dynamic
 283 - http://nltk.org/doc/en/advanced-parsing.html
 284 - http://jaynes.colorado.edu/PythonIdioms.html
 285
 286
 287
 288 * Git
 289 Repository web page: http://repo.or.cz/w/dmvccm.git
 290
 291 Setting up a new project:
 292 : git init
 293 : git add .
 294 : git commit -m "first release"
 295
 296 Later on: (=-a= automatically stages modified and deleted tracked files; new files still need =git add=)
 297 : git init
 298 : git commit -a -m "some subsequent release"
 299
 300 Then push stuff up to the remote server:
 301 : git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master
 302
 303 (=eval `ssh-agent`= and =ssh-add= to avoid having to type in the passphrase all
 304 the time)
 305
 306 Make a copy of the (remote) master branch:
 307 : git clone git://repo.or.cz/dmvccm.git
 308
 309 Make and name a new branch in this folder
 310 : git checkout -b mybranch
 311
 312 To save changes in =mybranch=:
 313 : git commit -a
 314
 315 Go back to the master branch (uncommitted changes from =mybranch= are
 316 carried over):
 317 : git checkout master
 318
 319 Try out:
 320 : git add --interactive
 321
 322 Good tutorial:
 323 http://www-cs-students.stanford.edu/~blynn//gitmagic/