pygr/cnestedlist.pyx

   1 import sequence
   2 import nlmsa_utils
   3 import logger
   4
   5 cdef class IntervalDBIterator:
   6   def __new__(self,int start,int end,IntervalDB db not None):
   7     self.it=interval_iterator_alloc()
   8     self.it_alloc=self.it
   9     self.start=start
  10     self.end=end
  11     self.db=db
  12
  13   def __iter__(self):
  14     return self
  15
  16   cdef int cnext(self): # C VERSION OF ITERATOR next METHOD RETURNS INDEX
  17     cdef int i
  18     if self.ihit>=self.nhit: # TRY TO GET ONE MORE BUFFER CHUNK OF HITS
  19       if self.it==NULL: # ITERATOR IS EXHAUSTED
  20         return -1
  21       find_intervals(self.it,self.start,self.end,self.db.im,self.db.ntop,
  22                      self.db.subheader,self.db.nlists,self.im_buf,1024,
  23                      &(self.nhit),&(self.it)) # GET NEXT BUFFER CHUNK
  24       self.ihit=0 # START ITERATING FROM START OF BUFFER
  25     if self.ihit<self.nhit: # RETURN NEXT ITEM FROM BUFFER
  26       i=self.ihit
  27       self.ihit = self.ihit+1 # ADVANCE THE BUFFER COUNTER
  28       return i
  29     else: # BUFFER WAS EMPTY, NO HITS TO ITERATE OVER...
  30       return -1
  31
  32   # PYTHON VERSION OF next RETURNS HIT AS A TUPLE
  33   def __next__(self): # PYREX USES THIS NON-STANDARD NAME INSTEAD OF next()!!!
  34     cdef int i
  35     i=self.cnext()
  36     if i>=0:
  37       return (self.im_buf[i].start,self.im_buf[i].end,self.im_buf[i].target_id,
  38               self.im_buf[i].target_start,self.im_buf[i].target_end)
  39     else:
  40       raise StopIteration
  41
  42   def __dealloc__(self):
  43     'remember: dealloc cannot call other methods!'
  44     free_interval_iterator(self.it_alloc)
  45
  46
  47
  48
  49 cdef class IntervalDB:
  50   def __new__(self,filename='noname',nsize=0,**kwargs):
  51     cdef int i
  52     cdef FILE *ifile
  53     self.n=nsize
  54     if nsize>0:
  55       ifile=fopen(filename,"r") # text file, one interval per line
  56       if ifile:
  57         self.im=read_intervals(self.n,ifile)
  58         fclose(ifile)
  59         if self.im!=NULL:
  60           self.runBuildMethod(**kwargs)
  61       else:
  62         msg='could not open file %s' % filename
  63         raise IOError(msg)
  64
  65   def save_tuples(self,l,**kwargs):
  66     'build in-memory NLMSA from list of alignment tuples'
  67     cdef int i
  68     self.close() # DUMP OUR EXISTING MEMORY
  69     self.n=len(l)
  70     self.im=interval_map_alloc(self.n)
  71     if self.im==NULL:
  72       raise MemoryError('unable to allocate IntervalMap[%d]' % self.n)
  73     i=0
  74     for t in l:
  75       self.im[i].start=t[0]
  76       self.im[i].end=t[1]
  77       self.im[i].target_id=t[2]
  78       self.im[i].target_start=t[3]
  79       self.im[i].target_end=t[4]
  80       self.im[i].sublist= -1
  81       i=i+1
  82     self.runBuildMethod(**kwargs)
  83
  84   def runBuildMethod(self,buildInPlace=True):
  85     'build either in-place or using older build method'
  86     if buildInPlace:
  87       self.subheader=build_nested_list_inplace(self.im,self.n,&(self.ntop),&(self.nlists))
  88     else:
  89       self.subheader=build_nested_list(self.im,self.n,&(self.ntop),&(self.nlists))
  90
  91   def buildFromUnsortedFile(self,filename,int n,**kwargs):
  92     'load unsorted binary data, and build nested list'
  93     cdef FILE *ifile
  94     cdef int i
  95     cdef IntervalMap *im_new
  96     self.close()
  97     ifile=fopen(filename,'rb') # binary file
  98     if ifile==NULL:
  99       raise IOError('unable to open '+filename)
 100     im_new=interval_map_alloc(n)
 101     if im_new==NULL:
 102       raise MemoryError('unable to allocate IntervalMap[%d]' % n)
 103     i=read_imdiv(ifile,im_new,n,0,n)
 104     fclose(ifile)
 105     if i!=n:
 106       raise IOError('IntervalMap file corrupted?')
 107     self.n=n
 108     self.im=im_new
 109     self.runBuildMethod(**kwargs)
 110
 111   def find_overlap(self,int start,int end):
 112     self.check_nonempty() # RAISE EXCEPTION IF NO DATA
 113     return IntervalDBIterator(start,end,self)
 114
 115   def find_overlap_list(self,int start,int end):
 116     cdef int i,nhit
 117     cdef IntervalIterator *it,*it_alloc
 118     cdef IntervalMap im_buf[1024]
 119     self.check_nonempty() # RAISE EXCEPTION IF NO DATA
 120     it=interval_iterator_alloc()
 121     it_alloc=it
 122     l=[] # LIST OF RESULTS TO HAND BACK
 123     while it:
 124       find_intervals(it,start,end,self.im,self.ntop,
 125                      self.subheader,self.nlists,im_buf,1024,
 126                      &(nhit),&(it)) # GET NEXT BUFFER CHUNK
 127       for i from 0 <= i < nhit:
 128         l.append((im_buf[i].start,im_buf[i].end,im_buf[i].target_id,im_buf[i].target_start,im_buf[i].target_end))
 129     free_interval_iterator(it_alloc)
 130     return l
 131
 132   def check_nonempty(self):
 133     if self.im:
 134       return True
 135     else:
 136       msg='empty IntervalDB, not searchable!'
 137       raise IndexError(msg)
 138
 139   def write_binaries(self,filestem,div=256):
 140     cdef char *err_msg
 141     err_msg=write_binary_files(self.im,self.n,self.ntop,div,
 142                                self.subheader,self.nlists,filestem)
 143     if err_msg:
 144       raise IOError(err_msg)
 145
 146   def __dealloc__(self):
 147     'remember: dealloc cannot call other methods!'
 148     if self.subheader:
 149       free(self.subheader)
 150     if self.im:
 151       free(self.im)
 152
 153   def close(self):
 154     if self.subheader:
 155       free(self.subheader)
 156     if self.im:
 157       free(self.im)
 158     self.subheader=NULL
 159     self.im=NULL
 160
 161     return None
 162
 163
 164
 165 cdef class IntervalFileDBIterator:
 166   def __new__(self,int start,int end,IntervalFileDB db=None,
 167               NLMSASequence ns=None,
 168               int nbuffer=1024,rawIvals=None):
 169     cdef int i
 170     self.it_alloc=interval_iterator_alloc()
 171     self.restart(start, end, db, ns)
 172     if rawIvals is not None and len(rawIvals)>nbuffer:
 173       nbuffer=len(rawIvals)
 174     self.im_buf=interval_map_alloc(nbuffer)
 175     self.nbuf=nbuffer
 176     if rawIvals is not None:
 177       i=0
 178       for ival in rawIvals:
 179         self.im_buf[i].start=ival[0] # SAVE INTERVAL INFO
 180         self.im_buf[i].end=ival[1]
 181         self.im_buf[i].target_id=ival[2]
 182         self.im_buf[i].target_start=ival[3]
 183         self.im_buf[i].target_end=ival[4]
 184         i=i+1
 185       self.nhit=i # TOTAL NUMBER OF INTERVALS STORED
 186
 187   cdef int restart(self,int start,int end,IntervalFileDB db,
 188                    NLMSASequence ns) except -2:
 189     'reuse this iterator for another search without reallocing memory'
 190     self.nhit=0 # FLUSH ANY EXISTING DATA
 191     self.start=start
 192     self.end=end
 193     self.db=db
 194     if ns is not None:
 195       if ns.idb is not None:
 196         self.idb=ns.idb
 197       elif ns.db is None:
 198         ns.forceLoad()
 199       self.db=ns.db
 200     self.it=self.it_alloc # REUSE OUR CURRENT ITERATOR
 201     reset_interval_iterator(self.it) # RESET IT FOR REUSE
 202     return 0
 203
 204   cdef int reset(self) except -2:
 205     'flush the buffer so we can reuse this iterator'
 206     self.nhit=0
 207     return 0
 208
 209   cdef int extend(self,int ikeep):
 210     'expand the buffer if necessary, keeping elements [ikeep:nbuf]'
 211     cdef int length,istart
 212     cdef IntervalMap *new_buf
 213     istart=self.nbuf-ikeep
 214     length=sizeof(IntervalMap)*istart # #BYTES WE MUST KEEP
 215     if ikeep>0 and length>0: # SHIFT [ikeep:] SLICE OF BUFFER TO [0:istart]
 216       memmove(self.im_buf,self.im_buf+ikeep,length)
 217     if ikeep<8: # RUNNING OUT OF ROOM, SO DOUBLE OUR BUFFER
 218       new_buf=<IntervalMap *>realloc(self.im_buf,
 219                                      sizeof(IntervalMap)*2*self.nbuf)
 220       if new_buf==NULL:
 221         raise MemoryError('out of memory')
 222       self.im_buf=new_buf
 223       self.nbuf=2*self.nbuf
 224     return istart # RETURN START OF EMPTY BLOCK WHERE WE CAN ADD NEW DATA
 225
 226   cdef int saveInterval(self,int start,int end,int target_id,
 227                         int target_start,int target_end):
 228     'save an interval, expanding array if necessary'
 229     cdef int i
 230     if self.nhit>=self.nbuf: # EXPAND ARRAY IF NECESSARY
 231       self.extend(0)
 232     i=self.nhit
 233     self.im_buf[i].start=start # SAVE INTERVAL INFO
 234     self.im_buf[i].end=end
 235     self.im_buf[i].target_id=target_id
 236     self.im_buf[i].target_start=target_start
 237     self.im_buf[i].target_end=target_end
 238     self.nhit = i+1
 239     return self.nhit
 240
 241   cdef int nextBlock(self,int *pkeep) except -2:
 242     'load one more block of overlapping intervals'
 243     cdef int i
 244     if self.it==NULL: # ITERATOR IS EXHAUSTED
 245       return -1
 246     if pkeep and pkeep[0]>=0 and pkeep[0]<self.nhit: #MUST KEEP [ikeep:] SLICE
 247       i=self.extend(pkeep[0]) # MOVE SLICE TO THE FRONT
 248     else: # WE CAN USE THE WHOLE BUFFER
 249       i=0
 250     if self.db is not None: # ON-DISK DATABASE
 251       find_file_intervals(self.it,self.start,self.end,
 252                           self.db.db[0].ii,self.db.db[0].nii,
 253                           self.db.db[0].subheader,self.db.db[0].nlists,
 254                           &(self.db.db[0].subheader_file),
 255                           self.db.db[0].ntop,self.db.db[0].div,
 256                           self.db.db[0].ifile_idb,
 257                           self.im_buf+i,self.nbuf-i,
 258                           &(self.nhit),&(self.it)) # GET NEXT BUFFER CHUNK
 259     elif self.idb is not None: # IN-MEMORY DATABASE
 260       find_intervals(self.it,self.start,self.end,self.idb.im,self.idb.ntop,
 261                      self.idb.subheader,self.idb.nlists,self.im_buf+i,self.nbuf-i,
 262                      &(self.nhit),&(self.it)) # GET NEXT BUFFER CHUNK
 263     else:
 264       raise IOError('Iterator has no database!  Please provide a db argument.')
 265     self.nhit=self.nhit+i # TOTAL #HITS IN THE BUFFER
 266     self.ihit=i # START ITERATING FROM START OF NEW HITS
 267     if pkeep and pkeep[0]>=0: # RESET ikeep INDEX TO START OF BUFFER
 268       pkeep[0]=0
 269     return self.nhit-self.ihit # RETURN #NEW HITS IN NEXT BLOCK
 270
 271   cdef IntervalMap *getIntervalMap(self):
 272     '''return the IntervalMap array loaded by iterator,
 273     and release it from iterator.  User must free the array!'''
 274     cdef int len
 275     cdef IntervalMap *im
 276     if self.nhit==0: # NO HITS
 277       return NULL
 278     elif self.nhit<self.nbuf: # LARGER BUFFER THAN WE ACTUALLY NEED
 279       len=sizeof(IntervalMap)*self.nhit # COMPUTE FINAL SIZE
 280       im=<IntervalMap *>realloc(self.im_buf,len) # COMPACT TO FINAL SIZE
 281     else: # JUST HAND BACK OUR FULL BUFFER
 282       im=self.im_buf
 283     self.im_buf=NULL # RELEASE THIS STORAGE FROM ITERATOR; USER MUST FREE IT!
 284     return im # HAND BACK THE STORAGE
 285
 286   cdef int loadAll(self) except -1:
 287     'load all overlapping interval hits, return count of hits'
 288     cdef int len,ikeep
 289     len=1
 290     ikeep=0 # DON'T LET extend DISCARD ANY HITS, KEEP THEM ALL!
 291     while len>0: # LOAD BLOCKS UNTIL NO MORE...
 292       len=self.nextBlock(&ikeep) # LOAD ANOTHER BLOCK OF INTERVALS
 293     return self.nhit
 294
 295   cdef int cnext(self,int *pkeep): # C VERSION OF ITERATOR next METHOD
 296     'get one more overlapping interval'
 297     cdef int i
 298     if self.ihit>=self.nhit: # TRY TO GET ONE MORE BUFFER CHUNK OF HITS
 299       self.nextBlock(pkeep) # LOAD THE NEXT BLOCK IF ANY
 300     if self.ihit<self.nhit: # RETURN NEXT ITEM FROM BUFFER
 301       i=self.ihit
 302       self.ihit = self.ihit+1 # ADVANCE THE BUFFER COUNTER
 303       return i
 304     else: # BUFFER WAS EMPTY, NO HITS TO ITERATE OVER...
 305       return -1
 306
 307   cdef int copy(self,IntervalFileDBIterator src):
 308     'copy items from src to this iterator buffer'
 309     cdef IntervalMap *new_buf
 310     if src is None:
 311       raise ValueError('src is None!  Debug!!')
 312     if src.nhit>self.nbuf: # NEED TO EXPAND OUR BUFFER
 313       new_buf=<IntervalMap *>realloc(self.im_buf,src.nhit*sizeof(IntervalMap))
 314       if new_buf==NULL:
 315         raise MemoryError('out of memory')
 316       self.im_buf=new_buf # RECORD NEW BUFFER LOCATION AND SIZE
 317       self.nbuf=src.nhit
 318     self.nhit=src.nhit # COPY ARRAY AND SET CORRECT SIZE
 319     if src.nhit>0: # ONLY COPY IF NON-EMPTY
 320       memcpy(self.im_buf,src.im_buf,src.nhit*sizeof(IntervalMap))
 321     return 0
 322
 323   def mergeSeq(self):
 324     'merge intervals into single interval per sequence orientation'
 325     cdef int i,j,n,id
 326     if self.nhit<=0: # NOTHING TO MERGE, SO JUST RETURN
 327       return 0
 328     qsort(self.im_buf,self.nhit,sizeof(IntervalMap),target_qsort_cmp) # ORDER BY id,start
 329     n=0
 330     id= -1
 331     for i from 0 <= i < self.nhit:
 332       if self.im_buf[i].target_id!=id or (self.im_buf[j].target_start<0 and
 333                                           self.im_buf[i].target_start>=0):
 334         if id>=0: # WE NEED TO SAVE PREVIOUS INTERVAL
 335           if n<j: # COPY MERGED INTERVAL TO COMPACTED LOCATION
 336             memcpy(self.im_buf+n,self.im_buf+j,sizeof(IntervalMap))
 337           n=n+1
 338         j=i # RECORD THIS AS START OF THE NEW SEQUENCE / ORIENTATION
 339         id=self.im_buf[i].target_id
 340       elif self.im_buf[i].target_end>self.im_buf[j].target_end:
 341         self.im_buf[j].target_end=self.im_buf[i].target_end # EXPAND THIS INTERVAL
 342         self.im_buf[j].end=self.im_buf[i].end # COPY SOURCE SEQ COORDS AS WELL
 343     if n<j: # COPY LAST MERGED INTERVAL TO COMPACTED LOCATION
 344       memcpy(self.im_buf+n,self.im_buf+j,sizeof(IntervalMap))
 345     self.nhit=n+1 # TOTAL #MERGED INTERVALS
 346     return self.nhit
 347
 348   def __iter__(self):
 349     return self
 350
 351   # PYTHON VERSION OF next RETURNS HIT AS A TUPLE
 352   def __next__(self): # PYREX USES THIS NON-STANDARD NAME INSTEAD OF next()!!!
 353     cdef int i
 354     i=self.cnext(NULL)
 355     if i>=0:
 356       return (self.im_buf[i].start,self.im_buf[i].end,self.im_buf[i].target_id,
 357               self.im_buf[i].target_start,self.im_buf[i].target_end)
 358     else:
 359       raise StopIteration
 360
 361   def __dealloc__(self):
 362     'remember: dealloc cannot call other methods!'
 363     free_interval_iterator(self.it_alloc)
 364     if self.im_buf:
 365       free(self.im_buf)
 366
 367
 368
 369
 370
 371 cdef class IntervalFileDB:
 372   def __new__(self,filestem=None,mode='r'):
 373     if filestem is not None and mode=='r':
 374       self.open(filestem)
 375
 376   def open(self,filestem):
 377     cdef char err_msg[1024]
 378     self.db=read_binary_files(filestem,err_msg,1024)
 379     if self.db==NULL:
 380       raise IOError(err_msg)
 381
 382   def find_overlap(self,int start,int end):
 383     self.check_nonempty() # RAISE EXCEPTION IF NO DATA
 384     return IntervalFileDBIterator(start,end,self)
 385
 386   def find_overlap_list(self,int start,int end):
 387     cdef int i,nhit
 388     cdef IntervalIterator *it,*it_alloc
 389     cdef IntervalMap im_buf[1024]
 390     self.check_nonempty() # RAISE EXCEPTION IF NO DATA
 391     it=interval_iterator_alloc()
 392     it_alloc=it
 393     l=[] # LIST OF RESULTS TO HAND BACK
 394     while it:
 395       find_file_intervals(it,start,end,self.db[0].ii,self.db[0].nii,
 396                           self.db[0].subheader,self.db[0].nlists,
 397                           &(self.db[0].subheader_file),
 398                           self.db[0].ntop,self.db[0].div,
 399                           self.db[0].ifile_idb,im_buf,1024,
 400                           &(nhit),&(it)) # GET NEXT BUFFER CHUNK
 401       for i from 0 <= i < nhit:
 402         l.append((im_buf[i].start,im_buf[i].end,im_buf[i].target_id,
 403                   im_buf[i].target_start,im_buf[i].target_end))
 404     free_interval_iterator(it_alloc)
 405     return l
 406
 407   def check_nonempty(self):
 408     if self.db==NULL:
 409       raise IndexError('empty IntervalFileDB, not searchable!')
 410
 411   def close(self):
 412     if self.db:
 413       free_interval_dbfile(self.db)
 414     self.db=NULL
 415
 416   def __dealloc__(self):
 417     'remember: dealloc cannot call other methods!'
 418     if self.db:
 419       free_interval_dbfile(self.db)
 420
 421
 422
 423
 424 cdef class NLMSASliceLetters:
 425   'graph interface to letter graph within this region'
 426   def __new__(self,NLMSASlice nlmsaSlice):
 427     self.nlmsaSlice=nlmsaSlice
 428   def __iter__(self):
 429     return NLMSASliceIterator(self.nlmsaSlice)
 430   def __getitem__(self,NLMSANode node):
 431     return node.nodeEdges()
 432   def items(self):
 433     'list of tuples (node,{target_node:edge})'
 434     l=[]
 435     for node in self:
 436       l.append((node,node.nodeEdges()))
 437     return l
 438   def iteritems(self):
 439     return iter(self.items())
 440
 441
 442
 443
 444 cdef class NLMSASlice:
 445   def __new__(self,NLMSASequence ns not None,int start,int stop,
 446               int id= -1,int offset=0,seq=None):
 447     cdef int i,j,n,start_max,end_min,start2,stop2,nseq,istart,istop,localQuery
 448     cdef NLMSASequence ns_lpo
 449     cdef IntervalFileDBIterator it,it2
 450     cdef IntervalMap *im,*im2
 451     cdef int cacheMax
 452
 453     if seq is None: # GET FROM NLMSASequence
 454       seq=ns.seq
 455     self.nlmsaSequence=ns # SAVE BASIC INFO
 456     self.nlmsa = ns.nlmsaLetters
 457     self.start=start
 458     self.stop=stop
 459     self.offset=offset # ALWAYS STORE offset IN POSITIVE ORIENTATION
 460     self.seq=seq
 461     try: # USE PYTHON METHOD TO DO QUERY
 462       id,ivals=ns.nlmsaLetters.doSlice(seq) # doSlice() RETURNS RAW INTERVALS
 463       self.id=id # SAVE OUR SEQUENCE'S nlmsa_id
 464       it=IntervalFileDBIterator(start,stop,rawIvals=ivals) # STORE IN BINARY FMT
 465       it2=IntervalFileDBIterator(start,stop) # HOLDER FOR SUBSEQUENT MERGE
 466       localQuery=0 # DO NOT PERFORM LOCAL QUERY CODE BELOW!!
 467     except AttributeError:
 468       localQuery=1
 469     if localQuery: ################################## PERFORM LOCAL QUERY
 470       if id<0:
 471         id=ns.id
 472       self.id=id
 473       it2=None
 474       if start<0: # NEED TO TRANSLATE OFFSETS TO MINUS ORIENTATION
 475         offset= -offset
 476       if ns.nlmsaLetters.pairwiseMode==1: # TRANSLATE SEQ DIRECTLY TO LPO
 477         it=IntervalFileDBIterator(start,stop,rawIvals=((start,stop,ns.id-1,
 478                                                         start+offset,stop+offset),))
 479         n=1 # JUST THE SINGLE IDENTITY MAPPING FROM SEQ TO LPO
 480       else: # PERFORM NORMAL SEQ --> LPO QUERY
 481         it=IntervalFileDBIterator(start+offset,stop+offset,ns=ns)
 482         n=it.loadAll() # GET ALL OVERLAPPING INTERVALS
 483         if n<=0:
 484           raise nlmsa_utils.EmptySliceError('this interval is not aligned!')
 485         for i from 0 <= i < n: # CLIP INTERVALS TO FIT [start:stop]
 486           it.im_buf[i].start=it.im_buf[i].start-offset # XLATE TO SRC SEQ COORDS
 487           it.im_buf[i].end=it.im_buf[i].end-offset
 488           if stop<it.im_buf[i].end: # TRUNCATE TO FIT WITHIN [start:stop]
 489             it.im_buf[i].target_end= it.im_buf[i].target_end \
 490                                      +stop-it.im_buf[i].end # CALCULATE NEW ENDPOINT
 491             it.im_buf[i].end=stop
 492           if start>it.im_buf[i].start: # CALCULATE NEW STARTPOINT
 493             it.im_buf[i].target_start=it.im_buf[i].target_start \
 494                                      +start-it.im_buf[i].start
 495             it.im_buf[i].start=start
 496
 497       if ns.is_lpo: # TARGET INTERVALS MUST BE LPO, MUST MAP TO REAL SEQUENCES
 498         it2=IntervalFileDBIterator(start,stop) # HOLDER FOR SUBSEQUENT MERGE
 499       else:
 500         ns_lpo=ns.nlmsaLetters.seqlist[ns.nlmsaLetters.lpo_id] # DEFAULT LPO
 501         for i from 0 <= i < n:
 502           if it.im_buf[i].target_id != ns_lpo.id: # SWITCHING TO A DIFFERENT LPO?
 503             ns_lpo=ns.nlmsaLetters.seqlist[it.im_buf[i].target_id]
 504             if not ns_lpo.is_lpo:
 505               raise ValueError('sequence mapped to non-LPO target??')
 506           if it2 is None: # NEED TO ALLOCATE NEW ITERATOR
 507             it2=IntervalFileDBIterator(it.im_buf[i].target_start,
 508                                        it.im_buf[i].target_end,ns=ns_lpo)
 509           else: # JUST REUSE THIS ITERATOR WITHOUT REALLOCING MEMORY
 510             it2.restart(it.im_buf[i].target_start,
 511                         it.im_buf[i].target_end,None,ns_lpo)
 512           it2.loadAll() # GET ALL OVERLAPPING INTERVALS
 513           if it2.nhit<=0: # NO HITS, SO TRY THE NEXT INTERVAL???
 514             continue
 515           im2=it2.im_buf # ARRAY FROM THIS ITERATOR
 516           for j from 0 <= j < it2.nhit: # MAP EACH INTERVAL BACK TO ns
 517             if it.im_buf[i].target_start>im2[j].start: # GET INTERSECTION INTERVAL
 518               start_max=it.im_buf[i].target_start
 519             else:
 520               start_max=im2[j].start
 521             if it.im_buf[i].target_end<im2[j].end:
 522               end_min=it.im_buf[i].target_end
 523             else:
 524               end_min=im2[j].end
 525             istart=it.im_buf[i].start+start_max-it.im_buf[i].target_start # SRC COORDS
 526             istop=it.im_buf[i].start+end_min-it.im_buf[i].target_start
 527             start2=im2[j].target_start+start_max-im2[j].start # COORDS IN TARGET
 528             stop2=im2[j].target_start+end_min-im2[j].start
 529             if im2[j].target_id!=id or istart!=start2 \
 530                    or ns.nlmsaLetters.pairwiseMode==1: # DISCARD SELF-MATCH
 531               it.saveInterval(istart,istop,im2[j].target_id,start2,stop2) # SAVE IT!
 532             assert ns_lpo.id!=im2[j].target_id
 533
 534     if it.nhit<=0:
 535       raise nlmsa_utils.EmptySliceError('this interval is not aligned!')
 536     it2.copy(it) # COPY FULL SET OF SAVED INTERVALS
 537     self.nseqBounds=it2.mergeSeq() # MERGE TO ONE INTERVAL PER SEQUENCE ORIENTATION
 538     self.seqBounds=it2.getIntervalMap() # SAVE SORTED ARRAY & DETACH FROM ITERATOR
 539
 540     self.im=it.getIntervalMap() # RELEASE THIS ARRAY FROM THE ITERATOR
 541     self.n=it.nhit # TOTAL #INTERVALS SAVED FROM JOIN
 542     qsort(self.im,self.n,sizeof(IntervalMap),imstart_qsort_cmp) # ORDER BY start
 543
 544     n=0
 545     for i from 0 <= i < self.nseqBounds: # COUNT NON-LPO SEQUENCES
 546       if not ns.nlmsaLetters.seqlist.is_lpo(self.seqBounds[i].target_id):
 547         n=n+1
 548     self.nrealseq=n # SAVE THE COUNT
 549
 550     try: # _cache_max=0 TURNS OFF CACHING...
 551       cacheMax=ns.nlmsaLetters.seqDict._cache_max
 552     except AttributeError:
 553       cacheMax=1 # ALLOW CACHING...
 554     try:  # SAVE OUR COVERING INTERVALS AS CACHE HINTS IF POSSIBLE...
 555       saveCache=ns.nlmsaLetters.seqDict.cacheHint
 556     except AttributeError:
 557       cacheMax=0 # TURN OFF CACHING
 558     if cacheMax>0: # CONSTRUCT & SAVE DICT OF CACHE HINTS: COVERING INTERVALS
 559       cacheDict={}
 560       if seq is not None:
 561         try: # ADD A CACHE HINT FOR QUERY SEQ IVAL
 562           seqID=ns.nlmsaLetters.seqs.getSeqID(seq) # GET FULL-LENGTH ID
 563           cacheDict[seqID]=(self.start,self.stop)
 564         except KeyError:
 565           pass
 566       for i from 0 <= i < self.nseqBounds: # ONLY SAVE NON-LPO SEQUENCES
 567         if not ns.nlmsaLetters.seqlist.is_lpo(self.seqBounds[i].target_id):
 568           cacheDict[ns.nlmsaLetters.seqlist.getSeqID(self.seqBounds[i].target_id)]=(self.seqBounds[i].target_start,self.seqBounds[i].target_end)
 569
 570       if cacheDict:
 571         self.weakestLink = nlmsa_utils.SeqCacheOwner()
 572         saveCache(cacheDict, self.weakestLink) # SAVE COVERING IVALS AS CACHE HINT
 573
 574   def __hash__(self):
 575     return id(self)
 576
 577   def __repr__(self):
 578     return "<NLMSASlice object at 0x%x (seq=%s)>" % (id(self), self.seq.id,)
 579
 580   def __dealloc__(self):
 581     'remember: dealloc cannot call other methods!'
 582     if self.im:
 583       free(self.im)
 584       self.im=NULL
 585     if self.seqBounds:
 586       free(self.seqBounds)
 587       self.seqBounds=NULL
 588
 589   cdef object get_seq_interval(self, NLMSA nl, int targetID,
 590                                int start, int stop):
 591     'get seq interval and ensure cache owner keeps it in the cache'
 592     if start<stop:
 593       ival = nl.seqInterval(targetID, start, stop)
 594     else:  # GET THE SEQUENCE OBJECT
 595       ival = nl.seqlist.getSeq(targetID)
 596     try: # tell owner to keep it in the cache
 597       self.weakestLink.cache_reference(ival.pathForward)
 598     except AttributeError:
 599       pass
 600     return ival
 601
 602   ########################################### ITERATOR METHODS
 603   def edges(self,mergeAll=False,**kwargs):
 604     'get list of tuples (srcIval,destIval,edge) aligned in this slice'
 605     seqIntervals=self.groupByIntervals(mergeAll=mergeAll,**kwargs)
 606     ivals=self.groupBySequences(seqIntervals,**kwargs)
 607     l=[]
 608     for ival1,ival2,mergeIntervals in ivals:
 609       l.append((ival1,ival2,sequence.Seq2SeqEdge(self,ival2,ival1,mergeIntervals)))
 610     return l
 611   def items(self,**kwargs):
 612     'get list of tuples (ival2,edge) aligned to this slice'
 613     l=[]
 614     for ival1,ival2,edge in self.edges(**kwargs):
 615       l.append((ival2,edge))
 616     return l
 617   def iteritems(self,**kwargs):
 618     return iter(self.items(**kwargs))
 619   def keys(self,mergeAll=False,**kwargs):
 620     'get list of intervals aligned to this slice according to groupBy options'
 621     seqIntervals=self.groupByIntervals(mergeAll=mergeAll,**kwargs)
 622     ivals=self.groupBySequences(seqIntervals,**kwargs)
 623     l=[]
 624     for ival1,ival2,mergeIntervals in ivals:
 625       l.append(ival2)
 626     return l
 627   def __iter__(self): # PYREX DOESNT ALLOW ARGS TO __iter__ !
 628     return iter(self.keys())
 629   def __getitem__(self,k):
 630     return sequence.Seq2SeqEdge(self,k)
 631   def __setitem__(self,k,v):
 632     raise ValueError('''this NLMSA is read-only!  Currently, you cannot add new
 633 alignment intervals to an NLMSA after calling its build() method.''')
 634   def __len__(self):
 635     return self.nrealseq # NUMBER OF NON-LPO SEQUENCE/ORIS ALIGNED HERE
 636
 637
 638   ##################################### 1:1 INTERVAL METHODS
 639   def matchIntervals(self,seq=None):
 640     '''get all 1:1 match intervals in this region of alignment
 641     as list of tuples.  if seq argument not None, only match intervals
 642     for that sequence will be included.  No clipping is performed.'''
 643     cdef int i,target_id
 644     cdef NLMSA nl
 645     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
 646     if seq is not None:
 647       target_id=nl.seqs.getID(seq) # CHECK IF IN OUR ALIGNMENT
 648     else:
 649       target_id= -1
 650     l=[]
 651     for i from 0 <= i <self.n: # GET ALL STORED INTERVALS
 652       if not nl.seqlist.is_lpo(self.im[i].target_id) and \
 653                (target_id<0 or self.im[i].target_id==target_id):
 654         ival2=nl.seqInterval(self.im[i].target_id,self.im[i].target_start,
 655                              self.im[i].target_end)
 656         if seq is None or ival2.orientation==seq.orientation:
 657           ival1=sequence.absoluteSlice(self.seq,
 658                                        self.im[i].start,self.im[i].end)
 659           l.append((ival1,ival2)) # SAVE THE INTERVAL MATCH
 660     return l
 661
 662   ############################## MAXIMUM INTERVAL METHODS
 663   cdef int findSeqBounds(self,int id,int ori):
 664     'find the specified sequence / orientation using binary search'
 665     cdef int left,right,mid
 666     left=0
 667     right=self.nseqBounds
 668     while left<right:
 669       mid=(left+right)/2
 670       if self.seqBounds[mid].target_id<id:
 671         left=mid+1
 672       elif self.seqBounds[mid].target_id>id:
 673         right=mid
 674       elif ori>0 and seqBounds[mid].target_start<0:
 675         left=mid+1
 676       elif ori<0 and seqBounds[mid].target_start>=0:
 677         right=mid
 678       else: # MATCHES BOTH id AND ori
 679         return mid
 680     return -1 # FAILED TO FIND id,ori MATCH
 681
 682   def findSeqEnds(self,seq):
 683     'get maximum interval of seq aligned in this interval'
 684     cdef int i,id
 685     cdef NLMSA nl
 686     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
 687     id=nl.seqs.getID(seq) # CHECK IF IN OUR ALIGNMENT
 688     i=self.findSeqBounds(id,seq.orientation) # FIND THIS id,ORIENTATION
 689     if i<0: # NOT FOUND!
 690       raise KeyError('seq not aligned in this interval')
 691     return self.get_seq_interval(nl, self.seqBounds[i].target_id,
 692                                  self.seqBounds[i].target_start,
 693                                  self.seqBounds[i].target_end)
 694
 695   def generateSeqEnds(self):
 696     'get list of tuples (ival1,ival2,edge)'
 697     cdef int i
 698     cdef NLMSA nl
 699     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
 700     l=[]
 701     for i from 0 <= i <self.nseqBounds:
 702       if nl.seqlist.is_lpo(self.seqBounds[i].target_id):
 703         continue  # DON'T RETURN EDGES TO LPO
 704       #ival1=sequence.absoluteSlice(self.seq,self.seqBounds[i].start,
 705       #                             self.seqBounds[i].end)
 706       ival2 = self.get_seq_interval(nl, self.seqBounds[i].target_id,
 707                                     self.seqBounds[i].target_start,
 708                                     self.seqBounds[i].target_end)
 709       #l.append((ival1,ival2,sequence.Seq2SeqEdge(self,ival2,ival1)))
 710       edge = self[ival2] # LET edge FIGURE OUT sourcePath FOR US
 711       l.append((edge.sourcePath,ival2,edge))
 712     return l
 713
 714   ############################################## GROUP-BY METHODS
 715   def groupByIntervals(self,int maxgap=0,int maxinsert=0,
 716                        int mininsert= 0,filterSeqs=None,filterList=None,
 717                        mergeMost=False,maxsize=500000000,
 718                        mergeAll=True,ivalMethod=None,**kwargs):
 719     '''merge alignment intervals using "horizontal" group-by rules:
 720       - maxgap (=0): longest gap allowed within a region
 721       - maxinsert (=0): longest insert allowed within a region
 722       - mininsert (=0): should be 0, to prevent cycles within a region
 723         use negative values to allow some overlap / cycles.
 724       - maxsize: upper bound on maximum size for interval merging
 725       - mergeMost: merge, but with limits (10000,10000,-10,50000)
 726       - mergeAll: merge everything without any limits
 727       - filterSeqs (=None): dict of sequences to apply these rules to;
 728         other sequences alignment will be ignored if
 729         filterSeqs not None.  Slower than the filterList option.
 730       - filterList (=None): list of sequence intervals to mask the
 731         result set by.  Note: a single sequence should not be
 732         represented by more than one interval in filterList, as only
 733         one interval for each sequence will be used as the clipping region.
 734         Significantly faster than filterSeqs.
 735       - ivalMethod: a function to process the list of intervals
 736         for each sequence (it can merge or split them in any way
 737         it wants)
 738       - pAlignedMin: a fractional minimum alignment threshold e.g. (0.9)
 739       - pIdentityMin: a fractional minimum identity threshold e.g. (0.9)
 740       '''
 741     cdef int i,j,n,gap,insert,targetStart,targetEnd,start,end,maskStart,maskEnd
 742     cdef NLMSA nl
 743     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
 744     if mergeMost: # BE REASONABLE: DON'T MERGE A WHOLE CHROMOSOME
 745       maxgap=10000
 746       maxinsert=10000
 747       mininsert=-10 # ALLOW SOME OVERLAP IN INTERVAL ALIGNMENTS
 748       maxsize=50000
 749     if filterList is not None:
 750       targetDict = {}
 751       for seq in filterList: # CREATE AN INDEX OF SEQUENCE IDs TO KEEP
 752         t = nl.seqs.getIDcoords(seq)
 753         targetDict[t[0]] = t[1:] # SAVE START,STOP
 754     seqIntervals={}
 755     for i from 0 <= i < self.n: # LIST INTERVALS FOR EACH TARGET
 756       if nl.seqlist.is_lpo(self.im[i].target_id):
 757         continue # IT IS AN LPO, SO SKIP IT
 758       start=self.im[i].start
 759       end=self.im[i].end
 760       targetStart=self.im[i].target_start
 761       targetEnd=self.im[i].target_end
 762       if filterList is not None:
 763         try: # CHECK IF SEQUENCE IS IN MASKING DICTIONARY
 764           maskStart,maskEnd = targetDict[self.im[i].target_id]
 765         except KeyError:
 766           continue # FILTER THIS SEQUENCE OUT OF THE RESULT SET
 767         if start>=maskEnd or end<=maskStart: # NO OVERLAP
 768           continue
 769         if start<maskStart: # CLIP START TO MASKED REGION
 770           targetStart = targetStart+maskStart-start
 771           start = maskStart
 772         if end>maskEnd:# CLIP END TO MASKED REGION
 773           targetEnd = targetEnd+maskEnd-end
 774           end = maskEnd
 775       elif filterSeqs is not None: # CLIP TARGET SEQ INTERVAL
 776         target = self.get_seq_interval(nl, self.im[i].target_id,
 777                                        targetStart, targetEnd)
 778         try:
 779           target=filterSeqs[target] # PERFORM CLIPPING
 780         except KeyError: # NO OVERLAP IN filterSeqs, SO SKIP
 781           continue
 782         start=start+target.start-targetStart # CLIP SOURCE SEQUENCE
 783         end=end+target.stop-targetEnd
 784         targetStart=target.start  # GET COORDS OF CLIPPED TARGET
 785         targetEnd=target.stop
 786       try: # ADD INTERVAL TO EXISTING LIST
 787         seqIntervals[self.im[i].target_id] \
 788              .append([start,end,targetStart,targetEnd,None])
 789       except KeyError: # CREATE A NEW LIST FOR THIS TARGET
 790         seqIntervals[self.im[i].target_id]= \
 791            [[start,end,targetStart,targetEnd,None]]
 792
 793     for i,l in seqIntervals.iteritems(): # MERGE INTERVALS FOR EACH SEQ
 794       if ivalMethod is not None: # USER-SUPPLIED GROUPING FUNCTION
 795         ivalMethod(l,nl.seqlist.getSeq(i),msaSlice=self,maxgap=maxgap,
 796                    maxinsert=maxinsert,mininsert=mininsert,
 797                    filterSeqs=filterSeqs,mergeAll=mergeAll,**kwargs)
 798         continue # NO NEED TO APPLY GENERIC MERGING OPERATION BELOW
 799       n=0
 800       for j from 1 <= j < len(l): # MERGE BY INDEL LENGTH RULES
 801         gap=l[j][0]-l[n][1] # current.start - last.end
 802         insert=l[j][2]-l[n][3] # current.target_start - last.target_end
 803         if not mergeAll and \
 804                (gap>maxgap or insert>maxinsert or insert<mininsert
 805                 or l[j][1]-l[n][0]>maxsize
 806                 or l[j][3]-l[n][2]>maxsize):
 807           n=n+1 # SPLIT, SO START A NEW INTERVAL
 808           if n<j: # COPY START COORDS TO NEW SLOT
 809             l[n][0]=l[j][0]
 810             l[n][2]=l[j][2]
 811         else: # INTERVALS MERGED: SAVE ORIGINAL 1:1 INTERVAL LIST
 812           try:
 813             lastIval = l[n][4][-1] # GET LAST 1:1 INTERVAL
 814           except TypeError: # EMPTY LIST: CREATE ONE
 815             if l[n][1]==l[j][0] and l[n][3]==l[j][2]: # NO GAP, SO MERGE
 816               l[n][4] = [(l[n][0],l[j][1],l[n][2],l[j][3])]
 817             else: # TWO SEPARATE 1:1 INTERVALS
 818               l[n][4] = [tuple(l[n][:4]),tuple(l[j][:4])]
 819           else: # SEE IF WE CAN FUSE TO LAST 1:1 INTERVAL
 820             if lastIval[1]==l[j][0] and lastIval[3]==l[j][2]:
 821               l[n][4][-1] = (lastIval[0],l[j][1],lastIval[2],l[j][3])
 822             else: # GAP, SO JUST APPEND THIS 1:1 INTERVAL
 823               l[n][4].append(tuple(l[j][:4]))
 824         if n<j: # COPY END COORDS TO CURRENT SLOT
 825           l[n][1]=l[j][1]
 826           l[n][3]=l[j][3]
 827       del l[n+1:] # DELETE REMAINING UNMERGED INTERVALS
 828       for m in l: # CULL SINGLETON 1:1 INTERVAL LISTS (DUE TO FUSION)
 829         try:
 830           if len(m[4])==1: # TWO INTERVALS MUST HAVE BEEN FUSED
 831             m[4] = None # NO NEED TO KEEP SINGLETON!
 832         except TypeError:
 833           pass
 834     # SEQUENCE MASKING BY CONSERVATION OR %ALIGNED CONSTRAINT
 835     if 'pAlignedMin' in kwargs or 'pIdentityMin' in kwargs or \
 836            'minAlignSize' in kwargs or 'maxAlignSize' in kwargs:
 837       self.filterIvalConservation(seqIntervals,**kwargs)
 838     return seqIntervals
 839
 840   def conservationFilter(self,seq,m,pIdentityMin=None,
 841                          minAlignSize=None,maxAlignSize=None,**kwargs):
 842     if minAlignSize is not None and m[1]-m[0]<minAlignSize:
 843       return None
 844     if maxAlignSize is not None and m[1]-m[0]>maxAlignSize:
 845       return None
 846     if pIdentityMin is not None:
 847       seqEdge=sequence.Seq2SeqEdge(self,sequence.relativeSlice(seq,m[2],m[3]),
 848                                    sequence.absoluteSlice(self.seq,m[0],m[1]),m[4])
 849       t = seqEdge.conservedSegment(pIdentityMin=pIdentityMin, # GET CLIPPED INTERVAL
 850                                    minAlignSize=minAlignSize,**kwargs)
 851       if t is None:
 852         return None
 853       mergeIntervals = self.clip_interval_list(t[0],t[1],m[4]) # CLIP mergeIntervals
 854       return list(t)+[mergeIntervals] # RECOMBINE
 855     else:
 856       return m
 857 ##     if pAlignedMin is not None and seqEdge.pAligned()<pAlignedMin:
 858 ##       return False # INTERVAL FAILED ALIGNMENT THRESHOLD, SO REMOVE IT
 859 ##     if pIdentityMin is not None and seqEdge.pIdentity()<pIdentityMin:
 860 ##       return False # INTERVAL FAILED CONSERVATION THRESHOLD, SO REMOVE IT
 861 ##     return True
 862
 863   def filterIvalConservation(self,seqIntervals,pIdentityMin=None,
 864                              filterFun=None,**kwargs):
 865     cdef int i,j
 866     cdef NLMSA nl
 867     import types
 868     if filterFun is None:
 869       filterFun=self.conservationFilter
 870     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
 871     pIdentityMin0=pIdentityMin
 872     for targetID,l in seqIntervals.items(): # MERGE INTERVALS FOR EACH SEQ
 873       seq = self.get_seq_interval(nl, targetID, 0, 0) # GET THE SEQUENCE OBJECT
 874       if pIdentityMin0 is not None and not isinstance(pIdentityMin0,types.FloatType):
 875         try:
 876           pIdentityMin=pIdentityMin0[seq] # LOOK UP DESIRED IDENTITY FOR THIS SEQ
 877         except KeyError:
 878           del seqIntervals[targetID] # SO REMOVE TARGET ENTIRELY
 879           continue # NO NEED TO PROCESS THIS TARGET ANY FURTHER
 880       j=0
 881       for i from 0 <= i <len(l): # CHECK EACH INTERVAL FOR CONSERVATION THRESHOLD
 882         newIval=filterFun(seq,l[i],pIdentityMin=pIdentityMin,**kwargs)
 883         if newIval is None:
 884           continue # l[i] FAILED FILTER CRITERIA, SO SKIP IT
 885         l[j]=newIval # COMPACT THE ARRAY: KEEP newIval IN LOCATION j
 886         j=j+1 # KEEP THIS ARRAY ENTRY, SO INCREMENT COUNT OF ENTRIES
 887       if j==0: # NO INTERVALS FOR THIS SEQUENCE SURVIVED MASKING
 888         del seqIntervals[targetID] # SO REMOVE TARGET ENTIRELY
 889       elif j<i: # SOME INTERVALS REMOVED, SO SHRINK ITS LIST
 890         del l[j:] # JUST TRUNCATE THE LIST TO ENTRIES THAT PASSED
 891
 892   def groupBySequences(self,seqIntervals,sourceOnly=False,
 893                        indelCut=False,seqGroups=None,minAligned=1,
 894                        pMinAligned=0.,seqMethod=None,**kwargs):
 895     '''merge groups of sequences using "vertical" group-by rules:
 896     - seqGroups: a list of one or more lists of sequences to group.
 897       Each group will be analyzed separately, as follows:
 898     - sourceOnly: output intervals will be reported giving only
 899       the corresponding interval on the source sequence; redundant
 900       output intervals (mapping to the same source interval) are
 901       culled.  Has the effect of giving a single interval traversal
 902       of each group.
 903     - indelCut: for sourceOnly mode, do not merge separate
 904       intervals as reported by seqIntervals (in other words,
 905       that the groupByIntervals analysis separated due to an indel).
 906     - minAligned: the minimum #sequences that must be aligned to
 907       the source sequence for masking the output.  Regions below
 908       this threshold are masked out; no intervals will be reported
 909       in these regions.
 910     - pMinAligned: the minimum fraction of sequences (out of the
 911       total in the group) that must be aligned to the source
 912       sequence for masking the output.
 913     - seqMethod: you may supply your own function for grouping.
 914       Called as seqMethod(bounds,seqs,**kwargs), where
 915       bounds is a sorted list of
 916       (ipos,isStart,i,ns,isIndel,(start,end,targetStart,targetEnd))
 917       seqs is a list of sequences in the group.
 918       Must return a list of (sourceIval,targetIval).  See the docs.
 919     '''
 920     cdef int i,j,start,end,targetStart,targetEnd,ipos,id
 921     cdef float f
 922     cdef NLMSA nl
 923     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
 924     if seqGroups is None:
 925       seqGroups=[seqIntervals] # JUST USE THE WHOLE SET
 926     result=[]
 927     import mapping # GET ACCESS TO DictQueue CLASS
 928     for seqs in seqGroups: # PROCESS EACH SEQ GROUP
 929       bounds=[]
 930       j=0
 931       for seq in seqs: # CONSTRUCT INTERVAL BOUNDS LIST
 932         if isinstance(seq,int): # seqIntervals USES INT INDEX VALUES
 933           id=seq # SAVE THE ID
 934           seq = self.get_seq_interval(nl, id, 0, 0) # GET THE SEQUENCE OBJECT
 935         else: # EXPECT USER TO SUPPLY ACTUAL SEQUENCE OBJECTS
 936           id=nl.seqs.getID(seq)
 937           seq=seq.pathForward # ENSURE WE HAVE TOP-LEVEL SEQ OBJECT
 938         try:
 939           ivals=seqIntervals[id]
 940         except KeyError: # SEQUENCE NOT IN THIS ALIGNMENT REGION, SO SKIP
 941           continue
 942         isIndel=False
 943         for ival in ivals:
 944           bounds.append((ival[1],False,j,seq,isIndel,ival))
 945           bounds.append((ival[0],True,j,seq,isIndel,ival))
 946           isIndel=True
 947         j=j+1 # SEQUENCE COUNTER ENSURES ORDER OF SEQS IN SORTED LIST
 948       bounds.sort() # ASCENDING ORDER OF source_pos, SORT stop B4 start
 949       if seqMethod is not None:
 950         result=result+seqMethod(bounds,seqs,sourceOnly=sourceOnly,
 951                                 msaSlice=self,minAligned=minAligned,
 952                                 pMinAligned=pMinAligned,
 953                                 indelCut=indelCut,**kwargs)
 954         continue # DON'T USE GENERIC GROUPING METHOD BELOW
 955       seqStart=mapping.DictQueue() # setitem PUSHES, delitem POPS
 956       maskStart=None
 957       for bound in bounds: # GENERIC GROUPING: APPLY MASKING, sourceOnly
 958         ipos,isStart,j,seq,isIndel=bound[0:5]
 959         if isStart: # INTERVAL START
 960           seqStart[seq]=bound[5] # JUST RECORD START OF INTERVAL
 961         else: # INTERVAL STOP
 962           start,end,targetStart,targetEnd,mergeIntervals=bound[5]
 963           if maskStart is not None and not sourceOnly: # SAVE TARGET IVAL
 964             if maskStart>start: # TRUNCATE TARGET IVAL START
 965               targetStart=targetStart+maskStart-start
 966               start=maskStart
 967               mergeIntervals = self.clip_interval_list(maskStart,None,mergeIntervals)
 968             result.append((sequence.absoluteSlice(self.seq,start,end),
 969                            sequence.relativeSlice(seq,targetStart,
 970                                                   targetEnd),mergeIntervals))
 971           del seqStart[seq] # POP THIS SEQ FROM START DICT
 972
 973         f=len(seqStart) # #ALIGNED SEQS IN THIS REGION
 974         if f<minAligned or f/len(seqs)<pMinAligned: # APPLY MASKING
 975           if maskStart is not None:
 976             if sourceOnly: # JUST SAVE MERGED SOURCE INTERVAL
 977               result.append(sequence.absoluteSlice(self.seq,maskStart,end))
 978             else: # REPORT TARGET IVALS WITHIN (maskStart,end) REGION
 979               for seq in seqStart: # CANNOT USE items() BECAUSE THIS IS A QUEUE!
 980                 (start,i,targetStart,targetEnd,mergeIntervals)=seqStart[seq]
 981                 pleaseClip = False
 982                 if maskStart>start: # TRUNCATE TARGET IVAL START
 983                   targetStart=targetStart+maskStart-start
 984                   start=maskStart
 985                   pleaseClip = True
 986                 if end<i: # TRUNCATE TARGET IVAL END
 987                   targetEnd=targetEnd+end-i
 988                   pleaseClip = True
 989                 if pleaseClip:
 990                   mergeIntervals = self.clip_interval_list(maskStart,end,mergeIntervals)
 991                 result.append((sequence.absoluteSlice(self.seq,start,end),
 992                                sequence.relativeSlice(seq,targetStart,
 993                                                       targetEnd),mergeIntervals))
 994             maskStart=None # REGION NOW BELOW THRESHOLD
 995         elif maskStart is None:
 996           maskStart=ipos # START OF REGION ABOVE THRESHOLD
 997         if maskStart is not None and sourceOnly and indelCut \
 998                and isIndel and maskStart<ipos:
 999           result.append(sequence.absoluteSlice(self.seq,maskStart,ipos))
1000           maskStart=ipos
1001     return result
1002   def clip_interval_list(self,start,end,l):
1003     'truncate list of 1:1 intervals using start,end'
1004     if l is None:
1005       return None
1006     result = []
1007     for srcStart,srcEnd,destStart,destEnd in l:
1008       if (start is not None and start>=srcEnd) or (end is not None and end<=srcStart):
1009         continue
1010       if start is not None and start>srcStart:
1011         destStart = destStart+start-srcStart
1012         srcStart = start
1013       if end is not None and end<srcEnd:
1014         destEnd = destEnd+end-srcEnd
1015         srcEnd = end
1016       result.append((srcStart,srcEnd,destStart,destEnd))
1017     if len(result)<2:
1018       return None
1019     else:
1020       return result
1021
1022   ############################################## LPO REGION METHODS
1023   def split(self,minAligned=0,**kwargs):
1024     '''Use groupByIntervals() and groupBySequences() methods to
1025     divide this slice into subslices using indel rules etc.'''
1026     seqIntervals=self.groupByIntervals(**kwargs)
1027     kwargs['sourceOnly']=True
1028     kwargs['indelCut']=True
1029     ivals=self.groupBySequences(seqIntervals,minAligned=minAligned,
1030                                 **kwargs)
1031     l=[]
1032     for ival in ivals:
1033       if ival.start==self.start and ival.stop==self.stop:
1034         l.append(self) # SAME INTERVAL, SO JUST RETURN self
1035       else:
1036         subslice=NLMSASlice(self.nlmsaSequence,ival.start,ival.stop,
1037                             self.id,self.offset,self.seq)
1038         l.append(subslice)
1039     return l
1040
1041
1042   def regions(self,dummyArg=None,**kwargs):
1043     '''get LPO region(s) corresponding to this interval
1044     Same group-by rules apply here as for the split() method.'''
1045     cdef int i
1046     cdef NLMSA nl
1047     cdef NLMSASequence ns_lpo
1048     if self.nlmsaSequence.is_lpo: # ALREADY AN LPO REGION!
1049       return self.split(**kwargs) # JUST APPLY GROUP-BY RULES TO  self
1050     nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1051     l=[]
1052     for i from 0 <= i <self.nseqBounds:
1053       ns_lpo=nl.seqlist[self.seqBounds[i].target_id]
1054       if ns_lpo.is_lpo: # ADD ALL LPO INTERVALS ALIGNED HERE
1055         subslice=NLMSASlice(ns_lpo,self.seqBounds[i].target_start,
1056                             self.seqBounds[i].target_end)
1057         l=l+subslice.split(**kwargs) # APPLY GROUP-BY RULES
1058     if len(l)>0:
1059       return l
1060     raise ValueError('no LPO in nlmsaSlice.seqBounds?  Debug!')
1061
1062
1063   ########################################### LETTER METHODS
1064   property letters:
1065     'interface to individual LPO letters in this interval'
1066     def __get__(self):
1067       return NLMSASliceLetters(self)
1068
1069   def __cmp__(self,other):
1070     if isinstance(other,NLMSASlice):
1071       return cmp(self.nlmsaSequence,other.nlmsaSequence)
1072     else:
1073       return -1
1074
1075   def rawIvals(self):
1076     'return list of raw numeric intervals in this slice'
1077     cdef int i
1078     l=[]
1079     for i from 0 <= i < self.n:
1080       l.append((self.im[i].start,self.im[i].end,self.im[i].target_id,
1081                 self.im[i].target_start,self.im[i].target_end))
1082     return l
1083
1084
1085
1086
1087
1088 def advanceStartStop(int ipos,NLMSASlice nlmsaSlice not None,
1089                      int istart,int istop):
1090   cdef int i
1091   if istop>=nlmsaSlice.n:
1092     raise IndexError('out of bounds')
1093   for i from istop <= i < nlmsaSlice.n:
1094     if ipos>=nlmsaSlice.im[i].start: # ENTERS THIS INTERVAL
1095       istop = i + 1 # ADVANCE THE END MARKER
1096     else:
1097       break # BEYOND ipos, SEARCH NO FURTHR
1098   for i from istart <= i < istop: # FIND 1ST OVERLAP
1099     if ipos<nlmsaSlice.im[i].end:
1100       break
1101   return i,istop
1102
1103
1104
1105
1106
1107
1108 cdef class NLMSASliceIterator:
1109   'generate letters (nodes) in this LPO slice'
1110   def __new__(self,NLMSASlice nlmsaSlice not None):
1111     self.nlmsaSlice=nlmsaSlice
1112     self.ipos= nlmsaSlice.start - 1
1113
1114   def __iter__(self):
1115     return self
1116
1117   def __next__(self):
1118     self.ipos = self.ipos + 1
1119     # ADJUST istart,istop TO OVERLAP ipos
1120     self.istart,self.istop= \
1121       advanceStartStop(self.ipos,self.nlmsaSlice,self.istart,self.istop)
1122     if self.istart>=self.istop: # HMM, NO OVERLAPS TO ipos
1123       if self.istop<self.nlmsaSlice.n: # ANY MORE INTERVALS?
1124         self.ipos=self.nlmsaSlice.im[self.istop].start # START OF NEXT INTERVAL
1125         # ADJUST istart,istop TO OVERLAP ipos
1126         self.istart,self.istop= \
1127           advanceStartStop(self.ipos,self.nlmsaSlice,self.istart,self.istop)
1128       else:
1129         raise StopIteration # NO MORE POSITIONS IN THIS SLICE
1130     return NLMSANode(self.ipos,self.nlmsaSlice,self.istart,self.istop)
1131
1132
1133
1134
1135
1136
1137
1138 cdef class NLMSANode:
1139   'interface to a node in NLMSA storage of LPO alignment'
1140   def __new__(self,int ipos,NLMSASlice nlmsaSlice not None,
1141               int istart=0,int istop= -1):
1142     cdef int i,n
1143     cdef NLMSA nl
1144     self.nlmsaSlice=nlmsaSlice
1145     nl=nlmsaSlice.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1146     self.ipos=ipos
1147     if istop<0:
1148       istart,istop= \
1149         advanceStartStop(ipos,nlmsaSlice,istart,istart) # COMPUTE PROPER BOUNDS
1150     self.istart=istart # SAVE BOUNDS
1151     self.istop=istop
1152     self.id=ipos # DEFAULT: ASSUME SLICE IS IN LPO...
1153     for i from istart <= i < istop:
1154       if nlmsaSlice.im[i].start<=ipos and ipos<nlmsaSlice.im[i].end:
1155         if nl.seqlist.is_lpo(nlmsaSlice.im[i].target_id):
1156           self.id=nlmsaSlice.im[i].target_start+ipos-nlmsaSlice.im[i].start #LPO ipos
1157         else: # DON'T COUNT THE LPO SEQUENCE
1158           self.n = self.n + 1
1159
1160   ############################################# ALIGNED LETTER METHODS
1161   def __len__(self):
1162     return self.n
1163   def __iter__(self):
1164     cdef int i,j
1165     cdef NLMSA nl
1166     nl=self.nlmsaSlice.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1167     l=[]
1168     for i from self.istart <= i < self.istop:
1169       if self.nlmsaSlice.im[i].start<=self.ipos \
1170              and self.ipos<self.nlmsaSlice.im[i].end \
1171              and not nl.seqlist.is_lpo(self.nlmsaSlice.im[i].target_id):
1172         j=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1173         l.append(nl.seqInterval(self.nlmsaSlice.im[i].target_id,j,j+1))
1174     return iter(l)
1175
1176   def getSeqPos(self,seq):
1177     'return seqpos for this seq at this node'
1178     cdef int i,j,id
1179     try:
1180       id=self.nlmsaSlice.nlmsaSequence.nlmsaLetters.seqs.getID(seq)
1181     except KeyError:
1182       raise KeyError('seq not in this alignment')
1183     for i from self.istart <= i < self.istop:
1184       if self.nlmsaSlice.im[i].target_id==id: # RETURN THE SEQUENCE INTERVAL
1185         j=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1186         return sequence.absoluteSlice(seq,j,j+1)
1187     raise KeyError('seq not in node')
1188
1189   property edges:
1190     "get this node's edges as list of tuples (ival1,ival2,edge)"
1191     def __get__(self):
1192       l=[]
1193       ival1=sequence.absoluteSlice(self.nlmsaSlice.seq,
1194                                    self.ipos,self.ipos+1) # OUR SEQ INTERVAL
1195       for ival2 in self:
1196         l.append((ival1,ival2,None)) # GET EDGE INFO!!!
1197       return l
1198
1199   def __getitem__(self,seq):
1200     raise NotImplementedError('hey! write some code here!')
1201 ##     from lpo import POMSANodeRef # PROBABLY WONT NEED THIS AFTER RENAMING!!!
1202 ##     try:
1203 ##       s=self.getSeqPos(seq)
1204 ##       return POMSANodeRef(self,seq.path)
1205 ##     else:
1206 ##       raise KeyError('seq not in node')
1207
1208   def __cmp__(self,other):
1209     if isinstance(other,NLMSANode):
1210       return cmp((self.nlmsaSlice,ipos),(other.nlmsaSlice,ipos))
1211     else:
1212       return -1
1213
1214   ########################################## NODE-TO-NODE EDGE METHODS
1215   cdef int check_edge(self,int iseq,int ipos):
1216     cdef int i
1217     for i from self.istart <= i < self.istop:
1218       if self.nlmsaSlice.im[i].start<=self.ipos \
1219          and self.ipos<self.nlmsaSlice.im[i].end \
1220          and self.nlmsaSlice.im[i].target_id==iseq \
1221          and self.nlmsaSlice.im[i].target_start==ipos:
1222         return 1 # MATCH!
1223     return 0 # NO MATCH!
1224
1225   def getEdgeSeqs(self,NLMSANode other):
1226     "return dict of sequences that traverse edge from self -> other"
1227     cdef int i
1228     cdef NLMSA nl
1229     nl=self.nlmsaSlice.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1230     d={}
1231     if self.id+1==other.id: #other ADJACENT IN LPO
1232       for i from self.istart <= i < self.istop:
1233         if self.nlmsaSlice.im[i].start<=self.ipos+1 \
1234                and self.ipos+1<self.nlmsaSlice.im[i].end: # INTERVAL CONTAINS other
1235           seq=nl.seqlist.getSeq(self.nlmsaSlice.im[i].target_id)
1236           d[seq]=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1237         elif self.ipos+1==self.im[i].end \
1238                  and other.check_edge(self.nlmsaSlice.im[i].target_id,
1239                                       self.nlmsaSlice.im[i].target_end):
1240           seq=nl.seqlist.getSeq(self.nlmsaSlice.im[i].target_id) # BRIDGE TO NEXT INTERVAL
1241           d[seq]=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1242     else: # other NOT ADJACENT, SO INTERVALS THAT END HERE MIGHT JUMP TO other
1243       for i from self.istart <= i < self.istop:
1244         if self.ipos+1==self.nlmsaSlice.im[i].end \
1245                and other.check_edge(self.nlmsaSlice.im[i].target_id,
1246                                     self.nlmsaSlice.im[i].target_end):
1247           seq=nl.seqlist.getSeq(self.nlmsaSlice.im[i].target_id) # BRIDGE TO NEXT INTERVAL
1248           d[seq]=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1249     return d
1250
1251   def nodeEdges(self):
1252     'get all outgoing edges from this node'
1253     cdef int i,has_continuation
1254     has_continuation= -1
1255     d={}
1256     nodes={} # LIST OF TARGET NODES WE HAVE AN EDGE TO
1257     for i from self.istart <= i < self.nlmsaSlice.n:
1258       if i>=self.istop and len(d)==0: # NO FURTHER CHECKS TO DO
1259         break
1260       if self.nlmsaSlice.im[i].start<=self.ipos and \
1261          self.ipos<self.nlmsaSlice.im[i].end-1:
1262         has_continuation=i # ipos INSIDE THIS INTERVAL, SO HAS EDGE TO ipos+1
1263       elif self.ipos==self.nlmsaSlice.im[i].end-1: # END OF THIS INTERVAL
1264         d[self.nlmsaSlice.im[i].target_id]=self.nlmsaSlice.im[i].target_end
1265       else:
1266         try: # CHECK FOR START OF AN "ADJACENT" INTERVAL
1267           if d[self.nlmsaSlice.im[i].target_id]==self.nlmsaSlice.im[i].target_start:
1268             nodes[self.nlmsaSlice.im[i].start]=i
1269             del d[self.nlmsaSlice.im[i].target_id] # REMOVE FROM ADJACENCY LIST
1270         except KeyError:
1271           pass
1272     if has_continuation>=0:
1273       nodes[self.ipos+1]=has_continuation
1274     result={}
1275     for i in nodes:
1276       node=NLMSANode(i,self.nlmsaSlice,self.istart)
1277       result[node]=sequence.LetterEdge(self,node) # EDGE OBJECT
1278     return result
1279
1280
1281
1282
1283
1284
1285 cdef class NLMSASequence:
1286   'sequence interface to NLMSA storage of an LPO alignment'
1287   def __init__(self,NLMSA nl not None,filestem,seq,mode='r',is_union=0,
1288                length=None):
1289     self.nlmsaLetters=nl
1290     self.filestem=filestem
1291     self.is_union=is_union
1292     self.is_lpo=0 # DEFAULT: NOT AN LPO
1293     self.seq=seq
1294     if length is not None: # ALLOW USER TO SUPPLY A LENGTH FOR THIS COORD SYSTEM
1295       self.length=length
1296     import types
1297     if isinstance(seq,types.StringType):
1298       self.name=seq # ALLOW USER TO BUILD INDEXES WITH A STRING NAME
1299     elif seq is not None: # REGULAR SEQUENCE
1300       seq= seq.pathForward # GET THE WHOLE SEQUENCE, IN FORWARD ORIENTATION
1301       try: # MAKE SURE seq HAS A UNIQUE NAME FOR INDEXING IT...
1302         self.name=str(seq.path.name)
1303       except AttributeError:
1304         try:
1305           self.name=str(seq.path.id)
1306         except AttributeError:
1307           raise AttributeError('NLMSASequence: seq must have name or id attribute')
1308     else:
1309       self.length=0 # LPO AND UNION SEQUENCES EXPAND AUTOMATICALLY
1310       if not is_union:
1311         self.is_lpo=1
1312         if len(nl.lpoList)>0:  # CALCULATE OFFSET OF NEW LPO, BASED ON LAST LPO
1313           lastLPO=nl.lpoList[-1]
1314           self.offset=lastLPO.offset+lastLPO.length
1315         else:
1316           self.offset=0
1317         nl.lpoList.append(self) # ADD TO THE LPO LIST
1318     self.idb=None # DEFAULT: NOT USING IN-MEMORY DATABASE.
1319     self.db=None # DEFAULT: WAIT TO OPEN DB UNTIL ACTUALLY NEEDED
1320     if mode=='r': # IMMEDIATELY OPEN DATABASE, UNLIKE onDemand MODE
1321       self.db=IntervalFileDB(filestem,mode)
1322     elif mode=='memory': # OPEN IN-MEMORY DATABASE
1323       self.idb=IntervalDB()
1324     elif mode=='w': # WRITE .build FILE
1325       filename=filestem+'.build'
1326       self.build_ifile=fopen(filename,'wb') # binary file
1327       if self.build_ifile==NULL:
1328         errmsg='unable to open in write mode: '+filename
1329         raise IOError(errmsg)
1330       self.nbuild=0
1331
1332   def __dealloc__(self):
1333     'remember: dealloc cannot call other methods!'
1334     if self.build_ifile:
1335       fclose(self.build_ifile)
1336
1337   def forceLoad(self):
1338     'force database to be initialized, if not already open'
1339     self.db=IntervalFileDB(self.filestem,'r')
1340
1341   def close(self):
1342     'free memory and close files associated with this sequence index'
1343     if self.db is not None:
1344       self.db.close() # CLOSE THE DATABASE, RELEASE MEMORY
1345       self.db=None # DISCONNECT FROM DATABASE
1346     if self.idb is not None:
1347       self.idb.close() # CLOSE THE DATABASE, RELEASE MEMORY
1348       self.idb=None # DISCONNECT FROM DATABASE
1349     if self.build_ifile:
1350       fclose(self.build_ifile)
1351       self.build_ifile=NULL
1352
1353   def buildFiles(self,**kwargs):
1354     'build nested list from saved unsorted alignment data'
1355     cdef IntervalDB db
1356     if self.build_ifile==NULL:
1357       raise IOError('not opened in write mode')
1358     fclose(self.build_ifile)
1359     self.build_ifile=NULL
1360     filename=self.filestem+'.build'
1361     db=IntervalDB() # CREATE EMPTY NL IN MEMORY
1362     if self.nbuild>0:
1363       db.buildFromUnsortedFile(filename,self.nbuild,**kwargs) # BUILD FROM .build
1364     db.write_binaries(self.filestem) # SAVE AS IntervalDBFile
1365     db.close() # DUMP NESTEDLIST FROM MEMORY
1366     import os
1367     os.remove(filename) # REMOVE OUR .build FILE, NO LONGER NEEDED
1368     self.db=IntervalFileDB(self.filestem) # NOW OPEN THE IntervalFileDB
1369     return self.nbuild # return count of intervals
1370
1371   def buildInMemory(self, **kwargs):
1372     try:
1373       n = len(self.buildList)
1374     except TypeError:
1375       return 0
1376     else:
1377       self.idb.save_tuples(self.buildList,**kwargs)
1378       self.buildList = None
1379       return n
1380
1381   cdef int saveInterval(self,IntervalMap im[],int n,int expand_self,FILE *ifile):
1382     cdef int i
1383     if ifile==NULL:
1384       raise IOError('not opened in write mode')
1385     if expand_self: # AN LPO THAT EXPANDS AS WE ADD TO IT...
1386       for i from 0 <= i < n:
1387         if im[i].start>=0:
1388           if im[i].end>self.length: # EXPAND IT...
1389             self.length=im[i].end
1390         elif -(im[i].start)>self.length:
1391           self.length= -(im[i].start) # THIS HANDLES NEGATIVE ORI CASE
1392     i=write_padded_binary(im,n,1,ifile)
1393     if i!=n:
1394       raise IOError('write_padded_binary failed???')
1395     return i
1396
1397   def __setitem__(self,k,t): # SAVE TO .build FILE
1398     'save mapping [k.start:k.stop] --> (id,start,stop)'
1399     cdef int i
1400     cdef IntervalMap im_tmp
1401     if self.build_ifile: # SAVE TO BUILD FILE
1402       im_tmp.start,im_tmp.end=(k.start,k.stop)
1403       im_tmp.target_id,im_tmp.target_start,im_tmp.target_end=t
1404       im_tmp.sublist= -1
1405       i=self.saveInterval(&im_tmp,1,self.is_lpo,self.build_ifile)
1406       #logger.debug('saveInterval: %s %s %s  %s %s %s' % (self.id, im_tmp.start, im_tmp.end,
1407       #             im_tmp.target_id, im_tmp.target_start, im_tmp.target_end))
1408       self.nbuild=self.nbuild+i # INCREMENT COUNTER OF INTERVALS SAVED
1409     elif self.nlmsaLetters.in_memory_mode:
1410       t=(k.start,k.stop)+t
1411       try:
1412         self.buildList.append(t)
1413       except AttributeError:
1414         self.buildList=[t]
1415     else:
1416       raise ValueError('not opened in write mode')
1417
1418   def __getitem__(self,k):
1419     try:
1420       if k.pathForward is self.seq:
1421         return NLMSASlice(self,k.start,k.stop)
1422     except AttributeError: pass
1423     raise KeyError('key must be a sequence interval of this sequence')
1424
1425   def __len__(self):
1426     'call len(self.seq) if we have a seq.  Otherwise self.length'
1427     if self.seq is None:
1428       return self.length
1429     else:
1430       return len(self.seq)
1431
1432   def __iadd__(self,seq):
1433     'add sequence to our union'
1434     try: # CHECK WHETHER THIS IS ALREADY IN THE INDEX
1435       x=self.nlmsaLetters.seqs[seq]
1436       return self # ALREADY IN THE INDEX, NO NEED TO ANYTHING
1437     except KeyError: # OK, WE REALLY DO NEED TO ADD IT...
1438       pass
1439     # CHECK FOR OVERFLOW... CREATE A NEW UNION IF NEEDED
1440     seq=seq.pathForward # GET THE ENTIRE SEQUENCE
1441     if self.length+len(seq)>self.nlmsaLetters.maxlen: # TOO BIG!
1442       if self.nlmsaLetters.pairwiseMode: # NEED TO CREATE CORRESPONDING LPO
1443         ns=self.nlmsaLetters.newSequence(None) # CREATE NEW LPO
1444       ns=self.nlmsaLetters.newSequence(None,is_union=1) # NEW UNION
1445       ns.__iadd__(seq) # ADD seq TO BRAND-NEW UNION
1446       return ns # RETURN THE NEW UNION COORDINATE SYSTEM
1447     # USE OUR EXISTING UNION
1448     self.nlmsaLetters.seqs.saveSeq(seq,self.id,self.length)
1449     self.length=self.length+len(seq) # EXPAND COORDINATE SYSTEM
1450     return self # iadd MUST ALWAYS RETURN self!!!
1451
1452
1453
1454
1455
1456
1457 cdef class NLMSA:
1458   'toplevel interface to NLMSA storage of an LPO alignment'
1459   def __init__(self,pathstem='',mode='r',seqDict=None,mafFiles=None,axtFiles=None,
1460                maxOpenFiles=1024,maxlen=None,nPad=1000000,maxint=41666666,
1461                trypath=None,bidirectional=True,pairwiseMode= -1,
1462                bidirectionalRule=nlmsa_utils.prune_self_mappings,
1463                use_virtual_lpo=None,maxLPOcoord=None,
1464                inverseDB=None, alignedIvals=None, **kwargs):
1465     try:
1466       import resource # WE MAY NEED TO OPEN A LOT OF FILES...
1467       resource.setrlimit(resource.RLIMIT_NOFILE,(maxOpenFiles,-1))
1468     except: # BUT THIS IS OPTIONAL...
1469       pass
1470     self.lpoList=[] # EMPTY LIST OF LPO
1471     self.seqs=nlmsa_utils.NLMSASeqDict(self,pathstem,mode,**kwargs)
1472     self.seqlist=self.seqs.seqlist
1473     self.pathstem=pathstem
1474     self.inverseDB = inverseDB
1475     if maxlen is None:
1476       maxlen=C_int_max-65536 # C_int_max MAXIMUM VALUE REPRESENTABLE BY int
1477       if axtFiles is not None:
1478         maxlen = maxlen/2
1479     self.maxlen=maxlen
1480     self.inlmsa=nPad
1481     self._ignoreShadowAttr={'sourceDB':None,'targetDB':None} # SCHEMA INFO
1482     self.seqDict=seqDict # SAVE FOR USER TO ACCESS...
1483     self.in_memory_mode=0
1484     if bidirectional:
1485       self.is_bidirectional=1
1486     else:
1487       self.is_bidirectional=0
1488     if use_virtual_lpo is not None: # DEPRECATED: MUST HAVE COME FROM PICKLE...
1489       pairwiseMode = use_virtual_lpo # USE SETTING FROM PICKLE...
1490     if pairwiseMode is True:
1491       self.pairwiseMode=1
1492     elif pairwiseMode is False:
1493       self.pairwiseMode=0
1494     else:
1495       self.pairwiseMode=pairwiseMode
1496     if maxLPOcoord is not None: # SIZE OF THE ENTIRE LPO COORD SYSTEM
1497       self.maxLPOcoord=maxLPOcoord
1498       if self.pairwiseMode==1:
1499         raise ValueError('maxLPOcoord and pairwiseMode options incompatible!')
1500       self.pairwiseMode=0
1501     else: # DEFAULT: RESTRICT USER TO A SINGLE LPO
1502       self.maxLPOcoord=self.maxlen
1503     if mode=='r': # OPEN FROM DISK FILES
1504       if self.seqDict is None:
1505         self.seqDict = nlmsa_utils.read_seq_dict(pathstem,trypath)
1506       self.read_indexes(self.seqDict)
1507       self.read_attrs()
1508     elif mode=='w': # WRITE TO DISK FILES
1509       self.do_build=1
1510       self.lpo_id=0
1511       if mafFiles is not None:
1512         self.newSequence() # CREATE INITIAL LPO
1513         self.readMAFfiles(mafFiles,maxint)
1514       elif axtFiles is not None:
1515         self.newSequence() # CREATE INITIAL LPO
1516         self.readAxtNet(axtFiles,bidirectionalRule)
1517       else: # USER WILL ADD INTERVAL MAPPINGS HIMSELF...
1518         if self.seqDict is None:
1519           import seqdb
1520           self.seqDict=seqdb.SeqPrefixUnionDict(addAll=True)
1521         self.initLPO() # CREATE AS MANY LPOs AS WE NEED
1522         self.newSequence(is_union=1) # SO HE NEEDS AN INITIAL UNION
1523       if alignedIvals is not None:
1524         self.add_aligned_intervals(alignedIvals)
1525         self.build()
1526     elif mode=='memory': # CONSTRUCT IN-MEMORY
1527       if self.seqDict is None:
1528         import seqdb
1529         self.seqDict=seqdb.SeqPrefixUnionDict(addAll=True)
1530       self.in_memory_mode=1
1531       self.do_build=1
1532       self.initLPO() # CREATE AS MANY LPOs AS WE NEED
1533       self.newSequence(is_union=1) # CREATE INITIAL UNION
1534       self.lpo_id=0
1535       if alignedIvals is not None:
1536         self.add_aligned_intervals(alignedIvals)
1537         self.build()
1538     elif mode!='xmlrpc':
1539       raise ValueError('unknown mode %s' % mode)
1540
1541   def close(self):
1542     'close our shelve index files'
1543     cdef NLMSASequence ns
1544     for ns in self.seqlist: # tell each seq to close its index files
1545       ns.close()
1546     self.seqs.close()
1547
1548   def __reduce__(self): ############################# SUPPORT FOR PICKLING
1549     import classutil
1550     return (classutil.ClassicUnpickler, (self.__class__,self.__getstate__()))
1551   def __getstate__(self):
1552     if self.in_memory_mode:
1553       raise ValueError("can't pickle NLMSA.in_memory_mode")
1554     return dict(pathstem=self.pathstem,seqDict=self.seqDict,
1555                 inverseDB=self.inverseDB)
1556   def __setstate__(self,state):
1557     self.__init__(**state) #JUST PASS KWARGS TO CONSTRUCTOR
1558
1559   def read_indexes(self,seqDict):
1560     'open all nestedlist indexes in this LPO database for immediate use'
1561     cdef NLMSASequence ns
1562     try:
1563       ifile=file(self.pathstem+'.NLMSAindex', 'rU') # text file
1564     except IOError:
1565       ifile=file(self.pathstem+'NLMSAindex', 'rU') # FOR BACKWARDS COMPATIBILITY
1566     try:
1567       for line in ifile:
1568         id,name,is_union,length=line.strip().split('\t')
1569         id=int(id)
1570         is_union=int(is_union)
1571         if id!=len(self.seqlist):
1572           raise IOError('corrupted NLMSAIndex???')
1573         filestem=self.pathstem+str(id)
1574         seq=None # DEFAULT: NO ACTUAL SEQUENCE ASSOCIATED WITH LPO OR UNION
1575         if name=='NLMSA_LPO_Internal': # AN LPO REFERENCE
1576           self.lpo_id=id
1577         elif not is_union: # REGULAR SEQUENCE
1578           try:
1579             seq=seqDict[name]
1580           except KeyError:
1581             raise KeyError('unable to find sequence %s in seqDict!' % name)
1582         # CREATE THE SEQ INTERFACE, BUT DELAY OPENING THE IntervalDBFile
1583         ns=NLMSASequence(self,filestem,seq,'onDemand',is_union) # UNTIL NEEDED
1584         ns.length=int(length) # SAVE STORED LENGTH
1585         self.addToSeqlist(ns,seq)
1586     finally:
1587       ifile.close()
1588   def read_attrs(self):
1589     'read pickled attribute dictionary from file and apply to self'
1590     import pickle
1591     try:
1592       ifile = file(self.pathstem+'.attrDict', 'rb') # pickle is binary file!
1593     except IOError: # BACKWARDS COMPATIBILITY: OLD NLMSA HAS NOT ATTRDICT
1594       return
1595     try:
1596       d = pickle.load(ifile)
1597       for k,v in d.items():
1598         if k=='is_bidirectional':
1599           self.is_bidirectional = v
1600         elif k=='pairwiseMode':
1601           self.pairwiseMode = v
1602         else:
1603           setattr(self,k,v)
1604     finally:
1605       ifile.close()
1606   def addToSeqlist(self,NLMSASequence ns,seq=None):
1607     'add an NLMSASequence to our seqlist, and set its id'
1608     ns.id=self.seqlist.nextID()
1609     self.seqs[seq]=ns # SAVE TO OUR INDEX
1610
1611   def newSequence(self,seq=None,is_union=0):
1612     'create a new nestedlist index for sequence seq'
1613     cdef NLMSASequence ns
1614     id=self.seqlist.nextID() # USE ITS INDEX AS UNIQUE ID FOR THIS SEQUENCE
1615     if self.pairwiseMode==1 and is_union and id>1: # NEED A VIRTUAL_LPO FOR THIS UNION
1616       self.newSequence() # CREATE AN LPO
1617       id=self.seqlist.nextID() # NOW GET ID FOR THE UNION
1618     filestem=self.pathstem+str(id)
1619     if self.in_memory_mode:
1620       mode='memory'
1621     else:
1622       mode='w'
1623     ns=NLMSASequence(self,filestem,seq,mode,is_union=is_union) #OPEN FOR WRITING
1624     if is_union: # RECORD THIS AS OUR CURRENT UNION OBJECT
1625       self.currentUnion=ns
1626     self.addToSeqlist(ns,seq) # SAVE TO OUR INDEX
1627     #logger.debug('Opened build file for ns_id %s, is_union %s' % (ns.id,
1628     #                                                              ns.is_union))
1629     return ns
1630
1631   def nextID(self):
1632     'get a unique nlmsaID and advance counter'
1633     nlmsaID=self.inlmsa # USE THE NEXT FREE ID
1634     self.inlmsa=self.inlmsa+1 # ADVANCE THE COUNTER
1635     return nlmsaID
1636
1637   def initLPO(self):
1638     'create LPOs for this alignment'
1639     cdef NLMSASequence ns
1640     offset=0
1641     while offset<self.maxLPOcoord:
1642       ns=self.newSequence() # CREATE AN LPO
1643       ns.offset=offset # FORCE OUR DESIRED OFFSET... EVEN THOUGH ALL LPOs EMPTY
1644       offset=offset+self.maxlen
1645   def init_pairwise_mode(self):
1646     'turn on use of virtual LPO mapping (i.e. no actual LPO is present!)'
1647     if self.pairwiseMode==0:
1648       raise ValueError('this alignment is already using an LPO!')
1649     elif self.pairwiseMode!=1: # NOT ALREADY SET TO PAIRWISE MODE
1650       logger.info('''
1651 Because you are aligning a pair of sequence intervals,
1652 the pairwiseMode=True option is automatically being applied.
1653 To avoid this message in the future, pass the pairwiseMode=True
1654 option to the NLMSA constructor.
1655 See the NLMSA documentation for more details.\n''')
1656     self.pairwiseMode=1 # TURN ON THE VIRTUAL LPO FEATURE
1657
1658
1659   def __setitem__(self,k,v): # THIS METHOD EXISTS ONLY FOR nlmsa[s1]+=s2
1660     if isinstance(v,nlmsa_utils.BuildMSASlice):
1661       if v.seq is not None and v.seq==k: # CASE WHERE k IS A SEQ INTERVAL
1662         return # MATCHES += USAGE: NO NEED TO DO ANYTHING!
1663       elif v.is_lpo and isinstance(k,slice) \
1664                and k.start==v.start and k.stop==v.stop: # CASE: k IS A slice
1665         return # MATCHES += USAGE: NO NEED TO DO ANYTHING!
1666     raise KeyError('Usage only nlmsa[s1]+=s2 allowed. nlmsa[s1]=s2 forbidden!')
1667   def __getitem__(self,k):
1668     'return a slice of the LPO'
1669     if isinstance(k,sequence.SeqPath): # TREAT k AS A SEQUENCE INTERVAL
1670       id,ns,offset=self.seqs[k] # GET UNION INFO FOR THIS SEQ
1671       if self.do_build:
1672         return nlmsa_utils.BuildMSASlice(ns,k.start,k.stop,id,offset,0,k)
1673       else: # QUERY THE ALIGNMENT
1674         try:
1675           return NLMSASlice(ns,k.start,k.stop,id,offset,k)
1676         except nlmsa_utils.EmptySliceError:
1677           return nlmsa_utils.EmptySlice(k)
1678     try: # TREAT k AS A PYTHON SLICE OBJECT
1679       i=k.start
1680     except AttributeError:
1681       raise KeyError('key must be a sequence interval or python slice object')
1682     if self.do_build:
1683       return nlmsa_utils.BuildMSASlice(self.lpoList[0],k.start,k.stop,-1,0,1,None)
1684     else: # QUERY THE ALIGNMENT
1685       l=nlmsa_utils.splitLPOintervals(self.lpoList,k) # MAP TO LPO(S)
1686       if len(l)>1:
1687         raise ValueError('Sorry!  Query interval spans multiple LPOs!')
1688       for ns,myslice in l: # ONLY RETURN ONE SLICE OBJECT
1689           return NLMSASlice(ns,myslice.start,myslice.stop)
1690
1691   def __iter__(self):
1692     raise NotImplementedError, 'you cannot iterate over NLMSAs'
1693
1694   def edges(self, *args, **kwargs):
1695     return nlmsa_utils.generate_nlmsa_edges(self, *args, **kwargs)
1696
1697   def __iadd__(self,seq):
1698     'add seq to our union'
1699     self.seqs.saveSeq(seq)
1700     return self  # iadd MUST ALWAYS RETURN self!
1701   def addAnnotation(self,a):
1702     'save alignment of sequence interval --> an annotation object'
1703     ival = a.sequence # GET PURE SEQUENCE INTERVAL
1704     self.__iadd__(ival) # ADD SEQ AS A NODE IN OUR ALIGNMENT
1705     self[ival].__iadd__(a) # ADD ALIGNMENT BETWEEN ival AND ANNOTATION
1706   def add_aligned_intervals(self, alignedIvals):
1707     'add alignedIvals to this alignment'
1708     nlmsa_utils.add_aligned_intervals(self, alignedIvals)
1709
1710   cdef void free_seqidmap(self,int nseq0,SeqIDMap *seqidmap):
1711     cdef int i
1712     for i from 0 <= i <nseq0: # DUMP STRING STORAGE FOR SEQUENCE IDENTIFIERS
1713       free(seqidmap[i].id)
1714     free(seqidmap) # WE CAN NOW FREE THE SEQUENCE LOOKUP ARRAY
1715
1716   cdef void save_nbuild(self,int nbuild[]):
1717     cdef NLMSASequence ns
1718     for ns in self.seqlist: # SAVE INTERVAL COUNTS BACK TO EACH SEQUENCE
1719       if not ns.is_lpo or self.pairwiseMode==1:
1720         ns.nbuild=nbuild[ns.id]  # SAVE INTERVAL COUNTS BACK TO REGULAR SEQUENCES
1721         #logger.debug('nbuild[%d] = %s' % (i, ns.nbuild))
1722
1723
1724   def readMAFfiles(self,mafFiles,maxint):
1725     'read alignment from a set of MAF files'
1726     cdef int i,j,nseq0,nseq1,n,nseq,block_len
1727     cdef SeqIDMap *seqidmap
1728     cdef char tmp[32768],*p,a_header[4]
1729     cdef FILE *ifile
1730     cdef IntervalMap im[4096],im_tmp
1731     cdef NLMSASequence ns_lpo,ns # ns IS OUR CURRENT UNION
1732     cdef FILE *build_ifile[4096]
1733     cdef int nbuild[4096],has_continuation
1734     cdef long long linecode_count[256]
1735
1736     ns_lpo=self.seqlist[self.lpo_id] # OUR INITIAL LPO
1737     self.pairwiseMode=0 # WE ARE USING A REAL LPO!
1738     memset(<void *>linecode_count,0,sizeof(linecode_count))
1739     has_continuation=0
1740
1741     nseq0=len(self.seqDict) # GET TOTAL #SEQUENCES IN ALL DATABASES
1742     seqidmap=<SeqIDMap *>calloc(nseq0,sizeof(SeqIDMap)) # ALLOCATE ARRAY
1743     i=0
1744     for pythonStr,seqInfo in self.seqDict.seqInfoDict.iteritems():
1745       seqidmap[i].id=strdup(pythonStr)
1746       try:
1747         seqidmap[i].length = seqInfo.length
1748       except OverflowError:
1749         raise OverflowError('''Sequence too long for 32 bit int: %s, %d
1750 Something is probably wrong with creation / reading of this sequence.
1751 Check the input!''' % (pythonStr, seqInfo.length))
1752       i=i+1
1753     qsort(seqidmap,nseq0,sizeof(SeqIDMap),seqidmap_qsort_cmp) # SORT BY id
1754     ns=None
1755
1756     im_tmp.sublist= -1 # DEFAULT
1757     strcpy(a_header,"a ") # MAKE C STRING
1758     for filename in mafFiles:
1759       logger.info('Processing MAF file: ' + filename)
1760       ifile=fopen(filename,'r') # text file
1761       if ifile==NULL:
1762         self.free_seqidmap(nseq0,seqidmap)
1763         self.save_nbuild(nbuild)
1764         raise IOError('unable to open file %s' % filename)
1765       if fgets(tmp,32767,ifile)==NULL or strncmp(tmp,"##maf",4): # HEADER LINE
1766         self.free_seqidmap(nseq0,seqidmap)
1767         self.save_nbuild(nbuild)
1768         raise IOError('%s: not a MAF file? Bad format.' % filename)
1769       p=fgets(tmp,32767,ifile) # READ 1ST DATA LINE OF THE MAF FILE
1770       while p: # GOT ANOTHER LINE TO PROCESS
1771         if has_continuation or 0==strncmp(tmp,a_header,2): # ALIGNMENT HEADER: READ ALIGNMENT
1772           n=readMAFrecord(im,0,seqidmap,nseq0,ns_lpo.length, # READ ONE MAF BLOCK
1773                           &block_len,ifile,4096,linecode_count,&has_continuation)
1774           if n<0: # UNRECOVERABLE ERROR OCCURRED...
1775             self.free_seqidmap(nseq0,seqidmap)
1776             self.save_nbuild(nbuild)
1777             raise ValueError('MAF block too long!  Increase max size')
1778           elif n==0:
1779             continue
1780
1781           if self.maxlen-ns_lpo.length<=block_len \
1782                  or ns_lpo.nbuild>maxint: # TOO BIG! MUST CREATE A NEW LPO
1783             j=ns_lpo.length # RECORD THE OLD OFFSET
1784             ns_lpo=self.newSequence() # CREATE A NEW LPO SEQUENCE
1785             for i from 0<= i < n: # TRANSLATE THESE INTERVALS BACK TO ZERO OFFSET
1786               if im[i].start>=0: # FORWARD INTERVAL
1787                 im[i].start = im[i].start - j
1788                 im[i].end = im[i].end - j
1789               else: # REVERSE INTERVAL
1790                 im[i].start = im[i].start + j
1791                 im[i].end = im[i].end + j
1792
1793           for i from 0 <= i < n: # SAVE EACH INTERVAL IN UNION -> LPO MAP
1794             j=im[i].target_id
1795             if seqidmap[j].nlmsa_id<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1796               if ns is None or self.maxlen-ns.length<=seqidmap[j].length:
1797                 ns=self.newSequence(None,is_union=1) # CREATE NEW UNION TO HOLD IT
1798                 build_ifile[ns.id]=ns.build_ifile # KEEP PTR SO WE CAN WRITE DIRECTLY!
1799                 nbuild[ns.id]=0
1800               seqidmap[j].ns_id=ns.id # SET IDs TO ADD THIS SEQ TO THE UNION
1801               seqidmap[j].nlmsa_id=self.inlmsa
1802               seqidmap[j].offset=ns.length
1803               self.inlmsa=self.inlmsa+1 # ADVANCE SEQUENCE ID COUNTER
1804               ns.length=ns.length+seqidmap[j].length # EXPAND UNION SIZE
1805
1806             im[i].target_id=seqidmap[j].nlmsa_id # USE THE CORRECT ID
1807             if im[i].target_start<0: # OFFSET REVERSE ORI
1808               im_tmp.start= -seqidmap[j].offset+im[i].target_start
1809               im_tmp.end=   -seqidmap[j].offset+im[i].target_end
1810             else: # OFFSET FORWARD ORI
1811               im_tmp.start= seqidmap[j].offset+im[i].target_start
1812               im_tmp.end=   seqidmap[j].offset+im[i].target_end
1813             im_tmp.target_id=ns_lpo.id
1814             im_tmp.target_start=im[i].start
1815             im_tmp.target_end=im[i].end
1816             j=seqidmap[j].ns_id # USE NLMSA ID OF THE UNION
1817             ns_lpo.saveInterval(&im_tmp,1,0,build_ifile[j]) # SAVE SEQ -> LPO
1818             nbuild[j]=nbuild[j]+1
1819
1820           ns_lpo.saveInterval(im,n,1,ns_lpo.build_ifile) # SAVE LPO -> SEQ
1821           ns_lpo.nbuild=ns_lpo.nbuild+n # INCREMENT COUNT OF SAVED INTERVALS
1822         if not has_continuation:
1823           p=fgets(tmp,32767,ifile) # TRY TO READ ANOTHER LINE...
1824       fclose(ifile) # CLOSE THIS MAF FILE
1825       #logger.debug('nbuild[0] = ' + ns_lpo.nbuild)
1826     for i from 0 <= i <256: # PRINT WARNINGS ABOUT NON-ALIGNMENT LINES
1827       if linecode_count[i]>0:
1828         logger.warn("Non-alignment text lines ignored: prefix %s, count %d" %
1829                     (chr(i), linecode_count[i]))
1830     for i from 0 <= i <nseq0: # INDEX SEQUENCES THAT WERE ALIGNED
1831       if seqidmap[i].nlmsa_id>0: # ALIGNED, SO RECORD IT
1832         self.seqs.saveSeq(seqidmap[i].id,seqidmap[i].ns_id,seqidmap[i].offset,
1833                           seqidmap[i].nlmsa_id)
1834     self.free_seqidmap(nseq0,seqidmap)
1835     self.save_nbuild(nbuild)
1836     self.build() # WILL TAKE CARE OF CLOSING ALL build_ifile STREAMS
1837
1838   cdef NLMSASequence add_seqidmap_to_union(self,int j,SeqIDMap seqidmap[],
1839                                            NLMSASequence ns,FILE *build_ifile[],
1840                                            int nbuild[]):
1841     cdef NLMSASequence ns_lpo
1842     if ns is None or self.maxlen-ns.length<=seqidmap[j].length:
1843       ns = self.newSequence(None,is_union=1) # CREATE NEW UNION TO HOLD IT
1844       build_ifile[ns.id] = ns.build_ifile # KEEP PTR SO WE CAN WRITE DIRECTLY!
1845       nbuild[ns.id] = 0
1846       if self.pairwiseMode==1: # ALSO BIND INFO FOR VIRTUAL LPO FOR THIS UNION
1847         ns_lpo = self.seqs.seqlist[ns.id - 1]
1848         build_ifile[ns_lpo.id] = ns_lpo.build_ifile
1849         nbuild[ns_lpo.id] = 0
1850     seqidmap[j].ns_id = ns.id # SET IDs TO ADD THIS SEQ TO THE UNION
1851     seqidmap[j].nlmsa_id = self.inlmsa
1852     seqidmap[j].offset = ns.length
1853     self.inlmsa = self.inlmsa+1 # ADVANCE SEQUENCE ID COUNTER
1854     ns.length = ns.length+seqidmap[j].length # EXPAND UNION SIZE
1855     return ns
1856
1857   def readAxtNet(self,axtFiles,bidirectionalRule):
1858     'read alignment from a set of axtnet files'
1859     cdef int i,j,nseq0,n,isrc,is_bidirectional
1860     cdef SeqIDMap *seqidmap
1861     cdef char tmp[32768],*p,comment[4],src_prefix[64],dest_prefix[64]
1862     cdef FILE *ifile
1863     cdef IntervalMap im[4096],im_tmp
1864     cdef NLMSASequence ns_src # SOURCE UNION VS DEST UNION
1865     cdef FILE *build_ifile[4096]
1866     cdef int nbuild[4096],has_continuation
1867
1868     self.pairwiseMode = 1 # WE ARE USING pairwiseMode
1869
1870     nseq0=len(self.seqDict) # GET TOTAL #SEQUENCES IN ALL DATABASES
1871     seqidmap=<SeqIDMap *>calloc(nseq0,sizeof(SeqIDMap)) # ALLOCATE ARRAY
1872     i=0
1873     for pythonStr,seqInfo in self.seqDict.seqInfoDict.iteritems():
1874       seqidmap[i].id=strdup(pythonStr)
1875       seqidmap[i].length = seqInfo.length
1876       i=i+1
1877     qsort(seqidmap,nseq0,sizeof(SeqIDMap),seqidmap_qsort_cmp) # SORT BY id
1878     ns_src = None
1879
1880     im_tmp.sublist= -1 # DEFAULT
1881     strcpy(comment,"#") # MAKE C STRING
1882     import string
1883     import os.path
1884     for filename in axtFiles:
1885       logger.info('Processing axtnet file: ' + filename)
1886       try:
1887         if filename[-8:] == '.net.axt':
1888           t = string.split(os.path.basename(filename)[:-8], '.')[-2:]
1889         elif filename[-4:] == '.axt':
1890           t = string.split(os.path.basename(filename)[:-4], '.')[-2:]
1891       except:
1892         raise IOError('%s is not correct axtNet file name. Correct name is (chrid.)source.target.net.axt.' % filename)
1893       #t = prefix_fun(filename) # CALL PYTHON FUNCTION TO OBTAIN PREFIXES
1894       if bidirectionalRule is None: # DETERMINE IF UNI- VS. BI-DIRECTIONAL
1895         is_bidirectional = self.is_bidirectional # JUST USE GLOBAL SETTING
1896       else: # GET SETTING FROM USER-SUPPLIED FUNCTION
1897         is_bidirectional = bidirectionalRule(t[0],t[1],self.is_bidirectional)
1898       strcpy(src_prefix,t[0]) # KEEP THEM IN STATIC C STRINGS FOR SPEED
1899       strcpy(dest_prefix,t[1])
1900       ifile=fopen(filename,'r') # text file
1901       if ifile==NULL:
1902         self.free_seqidmap(nseq0,seqidmap)
1903         self.save_nbuild(nbuild)
1904         raise IOError('unable to open file %s' % filename)
1905       while True:
1906         n = read_axtnet(im,seqidmap,nseq0,ifile,4096,&isrc,src_prefix,dest_prefix)
1907         if n<0: # UNRECOVERABLE ERROR OCCURRED...
1908           self.free_seqidmap(nseq0,seqidmap)
1909           self.save_nbuild(nbuild)
1910           raise ValueError('axtNet block too long!  Increase max size')
1911         elif n==0: # NO MORE DATA TO READ
1912           break
1913
1914         if seqidmap[isrc].nlmsa_id<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1915           ns_src = self.add_seqidmap_to_union(isrc,seqidmap,ns_src,build_ifile,nbuild)
1916
1917         for i from 0 <= i < n: # SAVE EACH INTERVAL IN SRC -> DEST MAP
1918           j=im[i].target_id
1919           #logger.debug('A: %s %s %s %s %s %s' % (im[i].start, im[i].end,
1920           #                                       im[i].target_id,
1921           #                                       im[i].target_start,
1922           #                                       im[i].target_end,
1923           #                                       im_tmp.sublist))
1924           #logger.debug('B: %s %s %s %s %s %s %s' % (seqidmap[isrc].nlmsa_id, i,
1925           #                                          j, seqidmap[j].id,
1926           #                                          seqidmap[j].ns_id,
1927           #                                          seqidmap[j].offset,
1928           #                                          seqidmap[j].nlmsa_id))
1929           if seqidmap[j].nlmsa_id<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1930             ns_src = self.add_seqidmap_to_union(j,seqidmap,ns_src,build_ifile,nbuild)
1931           im[i].target_id = seqidmap[j].nlmsa_id # USE THE CORRECT ID
1932           if is_bidirectional: # SAVE DEST -> SRC ALIGNMENT MAPPING
1933             if im[i].target_start<0: # OFFSET REVERSE ORI
1934               im_tmp.start = -seqidmap[j].offset+im[i].target_start
1935               im_tmp.end = -seqidmap[j].offset+im[i].target_end
1936             else: # OFFSET FORWARD ORI
1937               im_tmp.start = seqidmap[j].offset+im[i].target_start
1938               im_tmp.end = seqidmap[j].offset+im[i].target_end
1939             im_tmp.target_id = seqidmap[isrc].nlmsa_id
1940             im_tmp.target_start = im[i].start
1941             im_tmp.target_end = im[i].end
1942             #logger.debug('C: %s %s %s %s %s' % (im_tmp.target_id,
1943             #                                    im_tmp.target_start,
1944             #                                    im_tmp.target_end,
1945             #                                    seqidmap[j].ns_id, j))
1946             j=seqidmap[j].ns_id-1 # SAVE ALL ALIGNMENTS TO THE VIRTUAL LPO
1947             ns_src.saveInterval(&im_tmp,1,0,build_ifile[j]) # SAVE DEST -> SRC
1948             nbuild[j]=nbuild[j]+1
1949           if im[i].start < 0: # OFFSET FORWARD ORI
1950             im[i].start = -seqidmap[isrc].offset + im[i].start
1951             im[i].end = -seqidmap[isrc].offset + im[i].end
1952           else: # OFFSET FORWARD ORI
1953             im[i].start = seqidmap[isrc].offset + im[i].start
1954             im[i].end = seqidmap[isrc].offset + im[i].end
1955           #logger.debug('D: %s %s %s %s %s %s' % (im_tmp.start, im_tmp.end,
1956           #                                       im_tmp.target_id,
1957           #                                       im_tmp.target_start,
1958           #                                       im_tmp.target_end,
1959           #                                       im_tmp.sublist))
1960
1961         # SAVE THE RECORD. read_axtnet FUNCTION READS SRC/DEST AT THE SAME TIME
1962         j = seqidmap[isrc].ns_id-1 # SAVE ALL ALIGNMENTS TO THE VIRTUAL LPO
1963         ns_src.saveInterval(im,n,0,build_ifile[j]) # SAVE SRC -> DEST
1964         nbuild[j]=nbuild[j]+n # INCREMENT COUNT OF SAVED INTERVALS
1965
1966       fclose(ifile) # CLOSE THIS AXTNET FILE
1967
1968     for i from 0 <= i <nseq0: # INDEX SEQUENCES THAT WERE ALIGNED
1969       if seqidmap[i].nlmsa_id>0: # ALIGNED, SO RECORD IT
1970         self.seqs.saveSeq(seqidmap[i].id,seqidmap[i].ns_id,seqidmap[i].offset,
1971                           seqidmap[i].nlmsa_id)
1972     self.free_seqidmap(nseq0,seqidmap)
1973     self.save_nbuild(nbuild)
1974     self.build() # WILL TAKE CARE OF CLOSING ALL build_ifile STREAMS
1975
1976
1977   def buildFiles(self, saveSeqDict=False, **kwargs):
1978     'build nestedlist databases on-disk, and .seqDict index if desired'
1979     cdef NLMSASequence ns
1980     self.seqs.reopenReadOnly() # SAVE INDEXES AND OPEN READ-ONLY
1981     ntotal = 0
1982     ifile=file(self.pathstem+'.NLMSAindex','w') # text file
1983     try:
1984       for ns in self.seqlist: # BUILD EACH IntervalFileDB ONE BY ONE
1985         ntotal = ntotal + ns.buildFiles(**kwargs)
1986         if ns.is_lpo:
1987           ifile.write('%d\t%s\t%d\t%d\n' %(ns.id,'NLMSA_LPO_Internal',0,ns.length))
1988         elif ns.is_union:
1989           ifile.write('%d\t%s\t%d\t%d\n' %(ns.id,'NLMSA_UNION_Internal',1,ns.length))
1990         else:
1991           ifile.write('%d\t%s\t%d\t%d\n' %(ns.id,ns.name,0,ns.length))
1992     finally:
1993       ifile.close()
1994     if ntotal==0:
1995       raise nlmsa_utils.EmptyAlignmentError('empty alignment!')
1996     import sys,pickle
1997     ifile = file(self.pathstem+'.attrDict','wb') # pickle is binary file!
1998     try:
1999       pickle.dump(dict(is_bidirectional=self.is_bidirectional,
2000                        pairwiseMode=self.pairwiseMode),ifile)
2001     finally:
2002       ifile.close()
2003     logger.info('Index files saved.')
2004     if saveSeqDict:
2005       self.save_seq_dict()
2006     else:
2007       logger.info('''Note: the NLMSA.seqDict was not saved to a file.
2008 This is not necessary if you intend to save the NLMSA to worldbase.
2009 But if you wish to open this NLMSA independently of worldbase,
2010 you should call NLMSA.save_seq_dict() to save the seqDict info to a file,
2011 or in the future pass the saveSeqDict=True option to NLMSA.build().''')
2012
2013   def save_seq_dict(self):
2014     'save seqDict to a worldbase-aware pickle file'
2015     nlmsa_utils.save_seq_dict(self.pathstem,self.seqDict)
2016
2017   def build(self,**kwargs):
2018     'build nestedlist databases from saved mappings and initialize for use'
2019     if self.do_build==0:
2020       raise ValueError('not opened in write mode')
2021     try: # TURN OFF AUTOMATIC ADDING OF SEQUENCES TO OUR SEQDICT...
2022       self.seqDict.addAll=False
2023     except AttributeError: # THAT WAS PURELY OPTIONAL...
2024       pass
2025     if self.in_memory_mode:
2026       ntotal = 0
2027       for ns in self.seqlist: # BUILD EACH IntervalDB ONE BY ONE
2028         ntotal = ntotal + ns.buildInMemory(**kwargs)
2029       if ntotal==0:
2030         raise nlmsa_utils.EmptyAlignmentError('empty alignment!')
2031     else:
2032       self.buildFiles(**kwargs)
2033     self.do_build=0
2034
2035   def seqInterval(self,int iseq,int istart,int istop):
2036     'get specified interval in the target sequence'
2037     seq=self.seqlist.getSeq(iseq) # JUST THE SEQ OBJECT
2038     return sequence.relativeSlice(seq,istart,istop)
2039   def __invert__(self):
2040     if self.inverseDB is not None: # use the specified inverseDB
2041       return self.inverseDB
2042     elif self.is_bidirectional: # provides mapping both directions
2043       return self
2044     else:
2045       raise ValueError('this mapping is not invertible')
2046
2047
2048
2049
2050 def dump_textfile(pathstem, outfilename=None):
2051   'dump NLMSA binary files to a text file'
2052   cdef int n,nlmsaID,nsID,offset,is_bidirectional,pairwiseMode,nprefix
2053   cdef FILE *outfile
2054   cdef char err_msg[2048],tmp[2048],seqDictID[256]
2055   err_msg[0] = 0 # ENSURE STRING IS EMPTY
2056   if outfilename is None:
2057     outfilename = pathstem+'.txt' # DEFAULT TEXTFILE NAME
2058   import classutil,pickle,sys # NEED TO COPY THE WHOLE seqIDdict
2059   seqIDdict = classutil.open_shelve(pathstem+'.seqIDdict','r')
2060   n = len(seqIDdict)
2061   seqDict = nlmsa_utils.read_seq_dict(pathstem)
2062   try: # OBTAIN PREFIX INFO FOR SEQDICT
2063     prefixDict = seqDict.prefixDict
2064     nprefix = len(prefixDict)
2065     strcpy(seqDictID,"None")
2066   except AttributeError: # NO PREFIXUNION.  TRY TO GET ID OF seqDict
2067     nprefix = 0
2068     prefixDict = {}
2069     try:
2070       strcpy(seqDictID,seqDict._persistent_id)
2071     except AttributeError:
2072       strcpy(seqDictID,"unknown")
2073       logger.info('''Warning: Because your seqDict has no worldbase ID, there
2074 is no host-independent way to save it to a textfile for transfer
2075 to another machine.  Therefore, when loading this textfile
2076 on the destination machine, you will have to provide the
2077 seqDict argument to textfile_to_binaries() on the destination machine.''')
2078   try:
2079     ifile = file(pathstem+'.attrDict', 'rb') # pickle is binary file!
2080     d = pickle.load(ifile)
2081     ifile.close()
2082   except IOError:
2083     d = {}
2084   is_bidirectional = d.get('is_bidirectional',-1)
2085   pairwiseMode = d.get('pairwiseMode',-1)
2086   outfile=fopen(outfilename,"w") # text file
2087   import os.path
2088   basestem=os.path.basename(pathstem) # GET RID OF PATH INFO
2089   strcpy(tmp,basestem) # COPY TO C STRING SO WE CAN fprintf
2090   if outfile==NULL:
2091     raise IOError('unable to open file %s' %outfilename)
2092   try:
2093     if fprintf(outfile,"PATHSTEM\t%s\t%d\t%d\t%d\t%d\t%s\n",tmp,n,
2094                is_bidirectional,pairwiseMode,nprefix,seqDictID)<0:
2095       raise IOError('error writing to file %s' %outfilename)
2096     pleaseWarn = True
2097     for id,d in prefixDict.items(): # SAVE seqDict PREFIX ENTRIES
2098       strcpy(tmp,id) # CONVERT TO C DATA TYPES FOR fprintf
2099       try:
2100         strcpy(seqDictID,d._persistent_id) # try to get worldbase ID
2101       except AttributeError:
2102         strcpy(seqDictID,"None")
2103         if pleaseWarn:
2104           pleaseWarn = False
2105           logger.info('''Warning: Because one or more of the sequence
2106 databases in the seqDict have no worldbase ID, there is no
2107 host-independent way to save it to a textfile for transfer
2108 to another machine.  Therefore, when loading this textfile
2109 on the destination machine, you will have to provide a dictionary
2110 for these sequence database(s) as the prefixDict argument
2111 to textfile_to_binaries() on the destination machine.''')
2112       if fprintf(outfile,"PREFIXUNION\t%s\t%s\n",tmp,seqDictID)<0:
2113         raise IOError('error writing to file %s' %outfilename)
2114     for id,t in seqIDdict.iteritems(): # SAVE seqIDdict
2115       strcpy(tmp,id) # CONVERT TO C DATA TYPES FOR fprintf
2116       nlmsaID=t[0]
2117       nsID=t[1]
2118       offset=t[2]
2119       if fprintf(outfile,"SEQID\t%s\t%d\t%d\t%d\n",tmp,
2120                  nlmsaID,nsID,offset)<0:
2121         raise IOError('error writing to file %s' %outfilename)
2122     try:
2123       ifile = file(pathstem+'.NLMSAindex', 'rU') # text file
2124     except IOError:
2125       ifile = file(pathstem+'NLMSAindex', 'rU')
2126   except:
2127     fclose(outfile)
2128     raise
2129   try:
2130     for line in ifile:  # NOW SAVE THE NLMSA DATA
2131       id,name,is_union,length=line.strip().split('\t')
2132       strcpy(tmp,line) # COPY TO C STRING SO WE CAN fprintf
2133       if fprintf(outfile,"NLMSASequence\t%s",tmp)<0:
2134         raise IOError('error writing file %s'%outfilename)
2135       mypath=pathstem+id
2136       mybase=basestem+id
2137       if save_text_file(mypath,mybase,err_msg,outfile)!=0:
2138         raise IOError(err_msg)
2139   finally:
2140     fclose(outfile)
2141     ifile.close()
2142
2143
2144 def textfile_to_binaries(filename,seqDict=None,prefixDict=None,buildpath=''):
2145   'convert pathstem.txt textfile to NLMSA binary files'
2146   cdef int i,n,nlmsaID,nsID,offset,is_bidirectional,pairwiseMode,nprefix
2147   cdef FILE *infile
2148   cdef char err_msg[2048],line[32768],tmp[2048],basestem[2048],seqDictID[2048]
2149   if seqDict is not None:
2150     ignorePrefix = True
2151   else:
2152     ignorePrefix = False
2153   err_msg[0]=0 # ENSURE STRING IS EMPTY
2154   infile=fopen(filename,"r") # text file
2155   if infile==NULL:
2156     raise IOError('unable to open file %s' %filename)
2157   try:
2158     if fgets(line,32767,infile)==NULL:
2159       raise IOError('error or EOF reading %s'%filename)
2160     is_bidirectional = -1 # INVALID INITIAL SETTING
2161     pairwiseMode = -1
2162     nprefix = 0
2163     strcpy(tmp,"None")
2164     if 2>sscanf(line,"PATHSTEM\t%s\t%d\t%d\t%d\t%d\t%s",basestem,&n,
2165                 &is_bidirectional,&pairwiseMode,&nprefix,tmp):
2166       raise IOError('bad format in %s'%filename)
2167     if buildpath!='': # USER-SPECIFIED PATH FOR BINARIES
2168       import os
2169       buildpath1 = os.path.join(buildpath,basestem) # CONSTRUCT FILE PATH
2170       strcpy(basestem,buildpath1) # COPY BACK TO C STRING USABLE IN C FUNCTIONS
2171     else: # JUST USE PATH IN CURRENT DIRECTORY
2172       buildpath1 = basestem
2173     if 0==strcmp(tmp,"unknown"):
2174       if seqDict is None:
2175         raise ValueError('You must provide a seqDict for this NLMSA!')
2176     elif 0!=strcmp(tmp,"None"): # try obtaining as worldbase ID
2177       from pygr import worldbase
2178       seqDict = worldbase(tmp)
2179     import classutil,pickle # CREATE THE seqIDdict
2180     seqIDdict = classutil.open_shelve(basestem+'.seqIDdict','n')
2181     IDdict = classutil.open_shelve(basestem+'.idDict','n')
2182     d = {}
2183     if is_bidirectional != -1:
2184       d['is_bidirectional'] = is_bidirectional
2185     if pairwiseMode != -1:
2186       d['pairwiseMode'] = pairwiseMode
2187     ifile = file(basestem+'.attrDict',"wb") # pickle is binary file!
2188     try:
2189       pickle.dump(d,ifile)
2190     finally:
2191       ifile.close()
2192     if prefixDict is None or ignorePrefix:
2193       prefixDict = {}
2194     missing = []
2195     for i from 0 <= i < nprefix: # READ seqDICT PREFIX ENTRIES
2196       if fgets(line,32767,infile)==NULL:
2197         raise IOError('error or EOF reading %s'%filename)
2198       if 2!=sscanf(line,"PREFIXUNION\t%s\t%s",tmp,seqDictID):
2199         raise IOError('bad format in %s'%filename)
2200       if ignorePrefix: # JUST IGNORE THE PREFIX INFO WE READ
2201         continue
2202       if 0==strcmp(seqDictID,"None"):
2203         if tmp not in prefixDict:
2204           missing.append(tmp) # MISSING A SEQDICT DICTIONARY ENTRY!
2205       else: # load it from worldbase
2206         from pygr import worldbase
2207         prefixDict[tmp] = worldbase(seqDictID)
2208     if len(missing)>0:
2209       raise KeyError('''You must supply sequence database(s) for the
2210 following prefixes, by passing them in the prefixDict optional
2211 dictionary argument: %s''' % missing)
2212     if len(prefixDict)>0: # CREATE A PREFIX UNION
2213       import seqdb
2214       seqDict = seqdb.PrefixUnionDict(prefixDict)
2215     nlmsa_utils.save_seq_dict(basestem,seqDict) # SAVE SEQDICT
2216     for i from 0 <= i <n: # seqIDDict READING
2217       if fgets(line,32767,infile)==NULL:
2218         raise IOError('error or EOF reading %s'%filename)
2219       if 4!=sscanf(line,"SEQID\t%s\t%d %d %d",tmp,
2220                    &nlmsaID,&nsID,&offset):
2221         raise IOError('bad format in %s'%filename)
2222       seqIDdict[tmp]=(nlmsaID,nsID,offset) # SAVE THIS ENTRY
2223       IDdict[str(nlmsaID)]=(tmp,nsID)
2224     seqIDdict.close() # DONE WRITING THE seqIDdict
2225     IDdict.close() # DONE WRITING THE seqIDdict
2226
2227     NLMSAindexText = ''
2228     if buildpath!='': # USER-SPECIFIED PATH FOR BINARIES
2229       import os
2230       buildpath2 = os.path.join(buildpath,'') # ENSURE THIS ENDS IN DIRECTORY SEPARATOR
2231       strcpy(basestem,buildpath2) # COPY BACK TO C STRING USABLE IN C FUNCTIONS
2232     else:
2233       strcpy(basestem,'') # JUST USE BLANK STRING TO SAVE IN CURRENT DIRECTORY
2234     import sys
2235     while fgets(line,32767,infile)!=NULL:
2236       s=line # CONVERT STRING TO PYTHON OBJECT
2237       if not s.startswith('NLMSASequence'):
2238         raise IOError('bad format in file %s'%filename)
2239       NLMSAindexText = NLMSAindexText + s[14:] # JUST SAVE THE DATA FIELDS
2240       logger.info('Saving NLMSA binary index: ' + s[14:] + '...')
2241       if text_file_to_binaries(infile,basestem,err_msg)<0:
2242         raise IOError(err_msg)
2243     ifile = file(buildpath1+'.NLMSAindex',"w") # text file
2244     ifile.write(NLMSAindexText) # LAST, WRITE TOP INDEX FILE
2245     ifile.close()
2246   finally:
2247     fclose(infile)
2248   return buildpath1 # ACTUAL PATH TO NLMSA INDEX FILESET