5 cdef class IntervalDBIterator
:
6 def __new__
(self,int start
,int end
,IntervalDB db
not None
):
7 self.it
=interval_iterator_alloc
()
16 cdef int cnext
(self): # C VERSION OF ITERATOR next METHOD RETURNS INDEX
18 if self.ihit
>=self.nhit
: # TRY TO GET ONE MORE BUFFER CHUNK OF HITS
19 if self.it
==NULL
: # ITERATOR IS EXHAUSTED
21 find_intervals
(self.it
,self.start
,self.end
,self.db
.im
,self.db
.ntop
,
22 self.db
.subheader
,self.db
.nlists
,self.im_buf
,1024,
23 &(self.nhit
),&(self.it
)) # GET NEXT BUFFER CHUNK
24 self.ihit
=0 # START ITERATING FROM START OF BUFFER
25 if self.ihit
<self.nhit
: # RETURN NEXT ITEM FROM BUFFER
27 self.ihit
= self.ihit
+1 # ADVANCE THE BUFFER COUNTER
29 else: # BUFFER WAS EMPTY, NO HITS TO ITERATE OVER...
32 # PYTHON VERSION OF next RETURNS HIT AS A TUPLE
33 def __next__
(self): # PYREX USES THIS NON-STANDARD NAME INSTEAD OF next()!!!
37 return (self.im_buf
[i
].start
,self.im_buf
[i
].end
,self.im_buf
[i
].target_id
,
38 self.im_buf
[i
].target_start
,self.im_buf
[i
].target_end
)
42 def __dealloc__
(self):
43 'remember: dealloc cannot call other methods!'
44 free_interval_iterator
(self.it_alloc
)
49 cdef class IntervalDB
:
50 def __new__
(self,filename
='noname',nsize
=0,**kwargs
):
55 ifile
=fopen
(filename
,"r") # text file, one interval per line
57 self.im
=read_intervals
(self.n
,ifile
)
60 self.runBuildMethod
(**kwargs
)
62 msg
='could not open file %s' % filename
65 def save_tuples
(self,l
,**kwargs
):
66 'build in-memory NLMSA from list of alignment tuples'
68 self.close
() # DUMP OUR EXISTING MEMORY
70 self.im
=interval_map_alloc
(self.n
)
72 raise MemoryError('unable to allocate IntervalMap[%d]' % self.n
)
77 self.im
[i
].target_id
=t
[2]
78 self.im
[i
].target_start
=t
[3]
79 self.im
[i
].target_end
=t
[4]
80 self.im
[i
].sublist
= -1
82 self.runBuildMethod
(**kwargs
)
84 def runBuildMethod
(self,buildInPlace
=True
):
85 'build either in-place or using older build method'
87 self.subheader
=build_nested_list_inplace
(self.im
,self.n
,&(self.ntop
),&(self.nlists
))
89 self.subheader
=build_nested_list
(self.im
,self.n
,&(self.ntop
),&(self.nlists
))
91 def buildFromUnsortedFile
(self,filename
,int n
,**kwargs
):
92 'load unsorted binary data, and build nested list'
95 cdef IntervalMap
*im_new
97 ifile
=fopen
(filename
,'rb') # binary file
99 raise IOError('unable to open '+filename
)
100 im_new
=interval_map_alloc
(n
)
102 raise MemoryError('unable to allocate IntervalMap[%d]' % n
)
103 i
=read_imdiv
(ifile
,im_new
,n
,0,n
)
106 raise IOError('IntervalMap file corrupted?')
109 self.runBuildMethod
(**kwargs
)
111 def find_overlap
(self,int start
,int end
):
112 self.check_nonempty
() # RAISE EXCEPTION IF NO DATA
113 return IntervalDBIterator
(start
,end
,self)
115 def find_overlap_list
(self,int start
,int end
):
117 cdef IntervalIterator
*it
,*it_alloc
118 cdef IntervalMap im_buf
[1024]
119 self.check_nonempty
() # RAISE EXCEPTION IF NO DATA
120 it
=interval_iterator_alloc
()
122 l
=[] # LIST OF RESULTS TO HAND BACK
124 find_intervals
(it
,start
,end
,self.im
,self.ntop
,
125 self.subheader
,self.nlists
,im_buf
,1024,
126 &(nhit
),&(it
)) # GET NEXT BUFFER CHUNK
127 for i
from 0 <= i
< nhit
:
128 l
.append
((im_buf
[i
].start
,im_buf
[i
].end
,im_buf
[i
].target_id
,im_buf
[i
].target_start
,im_buf
[i
].target_end
))
129 free_interval_iterator
(it_alloc
)
132 def check_nonempty
(self):
136 msg
='empty IntervalDB, not searchable!'
137 raise IndexError(msg
)
139 def write_binaries
(self,filestem
,div
=256):
141 err_msg
=write_binary_files
(self.im
,self.n
,self.ntop
,div
,
142 self.subheader
,self.nlists
,filestem
)
144 raise IOError(err_msg
)
146 def __dealloc__
(self):
147 'remember: dealloc cannot call other methods!'
165 cdef class IntervalFileDBIterator
:
166 def __new__
(self,int start
,int end
,IntervalFileDB db
=None
,
167 NLMSASequence ns
=None
,
168 int nbuffer
=1024,rawIvals
=None
):
170 self.it_alloc
=interval_iterator_alloc
()
171 self.restart
(start
, end
, db
, ns
)
172 if rawIvals
is not None
and len(rawIvals
)>nbuffer
:
173 nbuffer
=len(rawIvals
)
174 self.im_buf
=interval_map_alloc
(nbuffer
)
176 if rawIvals
is not None
:
178 for ival
in rawIvals
:
179 self.im_buf
[i
].start
=ival
[0] # SAVE INTERVAL INFO
180 self.im_buf
[i
].end
=ival
[1]
181 self.im_buf
[i
].target_id
=ival
[2]
182 self.im_buf
[i
].target_start
=ival
[3]
183 self.im_buf
[i
].target_end
=ival
[4]
185 self.nhit
=i
# TOTAL NUMBER OF INTERVALS STORED
187 cdef int restart
(self,int start
,int end
,IntervalFileDB db
,
188 NLMSASequence ns
) except -2:
189 'reuse this iterator for another search without reallocing memory'
190 self.nhit
=0 # FLUSH ANY EXISTING DATA
195 if ns
.idb
is not None
:
200 self.it
=self.it_alloc
# REUSE OUR CURRENT ITERATOR
201 reset_interval_iterator
(self.it
) # RESET IT FOR REUSE
204 cdef int reset
(self) except -2:
205 'flush the buffer so we can reuse this iterator'
209 cdef int extend
(self,int ikeep
):
210 'expand the buffer if necessary, keeping elements [ikeep:nbuf]'
211 cdef int length
,istart
212 cdef IntervalMap
*new_buf
213 istart
=self.nbuf
-ikeep
214 length
=sizeof(IntervalMap
)*istart
# #BYTES WE MUST KEEP
215 if ikeep
>0 and length
>0: # SHIFT [ikeep:] SLICE OF BUFFER TO [0:istart]
216 memmove
(self.im_buf
,self.im_buf
+ikeep
,length
)
217 if ikeep
<8: # RUNNING OUT OF ROOM, SO DOUBLE OUR BUFFER
218 new_buf
=<IntervalMap
*>realloc
(self.im_buf
,
219 sizeof(IntervalMap
)*2*self.nbuf
)
221 raise MemoryError('out of memory')
223 self.nbuf
=2*self.nbuf
224 return istart
# RETURN START OF EMPTY BLOCK WHERE WE CAN ADD NEW DATA
226 cdef int saveInterval
(self,int start
,int end
,int target_id
,
227 int target_start
,int target_end
):
228 'save an interval, expanding array if necessary'
230 if self.nhit
>=self.nbuf
: # EXPAND ARRAY IF NECESSARY
233 self.im_buf
[i
].start
=start
# SAVE INTERVAL INFO
234 self.im_buf
[i
].end
=end
235 self.im_buf
[i
].target_id
=target_id
236 self.im_buf
[i
].target_start
=target_start
237 self.im_buf
[i
].target_end
=target_end
241 cdef int nextBlock
(self,int *pkeep
) except -2:
242 'load one more block of overlapping intervals'
244 if self.it
==NULL
: # ITERATOR IS EXHAUSTED
246 if pkeep
and pkeep
[0]>=0 and pkeep
[0]<self.nhit
: #MUST KEEP [ikeep:] SLICE
247 i
=self.extend
(pkeep
[0]) # MOVE SLICE TO THE FRONT
248 else: # WE CAN USE THE WHOLE BUFFER
250 if self.db
is not None
: # ON-DISK DATABASE
251 find_file_intervals
(self.it
,self.start
,self.end
,
252 self.db
.db
[0].ii
,self.db
.db
[0].nii
,
253 self.db
.db
[0].subheader
,self.db
.db
[0].nlists
,
254 &(self.db
.db
[0].subheader_file
),
255 self.db
.db
[0].ntop
,self.db
.db
[0].div
,
256 self.db
.db
[0].ifile_idb
,
257 self.im_buf
+i
,self.nbuf
-i
,
258 &(self.nhit
),&(self.it
)) # GET NEXT BUFFER CHUNK
259 elif self.idb
is not None
: # IN-MEMORY DATABASE
260 find_intervals
(self.it
,self.start
,self.end
,self.idb
.im
,self.idb
.ntop
,
261 self.idb
.subheader
,self.idb
.nlists
,self.im_buf
+i
,self.nbuf
-i
,
262 &(self.nhit
),&(self.it
)) # GET NEXT BUFFER CHUNK
264 raise IOError('Iterator has no database! Please provide a db argument.')
265 self.nhit
=self.nhit
+i
# TOTAL #HITS IN THE BUFFER
266 self.ihit
=i
# START ITERATING FROM START OF NEW HITS
267 if pkeep
and pkeep
[0]>=0: # RESET ikeep INDEX TO START OF BUFFER
269 return self.nhit
-self.ihit
# RETURN #NEW HITS IN NEXT BLOCK
271 cdef IntervalMap
*getIntervalMap
(self):
272 '''return the IntervalMap array loaded by iterator,
273 and release it from iterator. User must free the array!'''
276 if self.nhit
==0: # NO HITS
278 elif self.nhit
<self.nbuf
: # LARGER BUFFER THAN WE ACTUALLY NEED
279 len=sizeof(IntervalMap
)*self.nhit
# COMPUTE FINAL SIZE
280 im
=<IntervalMap
*>realloc
(self.im_buf
,len) # COMPACT TO FINAL SIZE
281 else: # JUST HAND BACK OUR FULL BUFFER
283 self.im_buf
=NULL
# RELEASE THIS STORAGE FROM ITERATOR; USER MUST FREE IT!
284 return im
# HAND BACK THE STORAGE
286 cdef int loadAll
(self) except -1:
287 'load all overlapping interval hits, return count of hits'
290 ikeep
=0 # DON'T LET extend DISCARD ANY HITS, KEEP THEM ALL!
291 while len>0: # LOAD BLOCKS UNTIL NO MORE...
292 len=self.nextBlock
(&ikeep
) # LOAD ANOTHER BLOCK OF INTERVALS
295 cdef int cnext
(self,int *pkeep
): # C VERSION OF ITERATOR next METHOD
296 'get one more overlapping interval'
298 if self.ihit
>=self.nhit
: # TRY TO GET ONE MORE BUFFER CHUNK OF HITS
299 self.nextBlock
(pkeep
) # LOAD THE NEXT BLOCK IF ANY
300 if self.ihit
<self.nhit
: # RETURN NEXT ITEM FROM BUFFER
302 self.ihit
= self.ihit
+1 # ADVANCE THE BUFFER COUNTER
304 else: # BUFFER WAS EMPTY, NO HITS TO ITERATE OVER...
307 cdef int copy
(self,IntervalFileDBIterator src
):
308 'copy items from src to this iterator buffer'
309 cdef IntervalMap
*new_buf
311 raise ValueError('src is None! Debug!!')
312 if src
.nhit
>self.nbuf
: # NEED TO EXPAND OUR BUFFER
313 new_buf
=<IntervalMap
*>realloc
(self.im_buf
,src
.nhit
*sizeof(IntervalMap
))
315 raise MemoryError('out of memory')
316 self.im_buf
=new_buf
# RECORD NEW BUFFER LOCATION AND SIZE
318 self.nhit
=src
.nhit
# COPY ARRAY AND SET CORRECT SIZE
319 if src
.nhit
>0: # ONLY COPY IF NON-EMPTY
320 memcpy
(self.im_buf
,src
.im_buf
,src
.nhit
*sizeof(IntervalMap
))
324 'merge intervals into single interval per sequence orientation'
326 if self.nhit
<=0: # NOTHING TO MERGE, SO JUST RETURN
328 qsort
(self.im_buf
,self.nhit
,sizeof(IntervalMap
),target_qsort_cmp
) # ORDER BY id,start
331 for i
from 0 <= i
< self.nhit
:
332 if self.im_buf
[i
].target_id
!=id or (self.im_buf
[j
].target_start
<0 and
333 self.im_buf
[i
].target_start
>=0):
334 if id>=0: # WE NEED TO SAVE PREVIOUS INTERVAL
335 if n
<j
: # COPY MERGED INTERVAL TO COMPACTED LOCATION
336 memcpy
(self.im_buf
+n
,self.im_buf
+j
,sizeof(IntervalMap
))
338 j
=i
# RECORD THIS AS START OF THE NEW SEQUENCE / ORIENTATION
339 id=self.im_buf
[i
].target_id
340 elif self.im_buf
[i
].target_end
>self.im_buf
[j
].target_end
:
341 self.im_buf
[j
].target_end
=self.im_buf
[i
].target_end
# EXPAND THIS INTERVAL
342 self.im_buf
[j
].end
=self.im_buf
[i
].end
# COPY SOURCE SEQ COORDS AS WELL
343 if n
<j
: # COPY LAST MERGED INTERVAL TO COMPACTED LOCATION
344 memcpy
(self.im_buf
+n
,self.im_buf
+j
,sizeof(IntervalMap
))
345 self.nhit
=n
+1 # TOTAL #MERGED INTERVALS
351 # PYTHON VERSION OF next RETURNS HIT AS A TUPLE
352 def __next__
(self): # PYREX USES THIS NON-STANDARD NAME INSTEAD OF next()!!!
356 return (self.im_buf
[i
].start
,self.im_buf
[i
].end
,self.im_buf
[i
].target_id
,
357 self.im_buf
[i
].target_start
,self.im_buf
[i
].target_end
)
361 def __dealloc__
(self):
362 'remember: dealloc cannot call other methods!'
363 free_interval_iterator
(self.it_alloc
)
371 cdef class IntervalFileDB
:
372 def __new__
(self,filestem
=None
,mode
='r'):
373 if filestem
is not None
and mode
=='r':
376 def open(self,filestem
):
377 cdef char err_msg
[1024]
378 self.db
=read_binary_files
(filestem
,err_msg
,1024)
380 raise IOError(err_msg
)
382 def find_overlap
(self,int start
,int end
):
383 self.check_nonempty
() # RAISE EXCEPTION IF NO DATA
384 return IntervalFileDBIterator
(start
,end
,self)
386 def find_overlap_list
(self,int start
,int end
):
388 cdef IntervalIterator
*it
,*it_alloc
389 cdef IntervalMap im_buf
[1024]
390 self.check_nonempty
() # RAISE EXCEPTION IF NO DATA
391 it
=interval_iterator_alloc
()
393 l
=[] # LIST OF RESULTS TO HAND BACK
395 find_file_intervals
(it
,start
,end
,self.db
[0].ii
,self.db
[0].nii
,
396 self.db
[0].subheader
,self.db
[0].nlists
,
397 &(self.db
[0].subheader_file
),
398 self.db
[0].ntop
,self.db
[0].div
,
399 self.db
[0].ifile_idb
,im_buf
,1024,
400 &(nhit
),&(it
)) # GET NEXT BUFFER CHUNK
401 for i
from 0 <= i
< nhit
:
402 l
.append
((im_buf
[i
].start
,im_buf
[i
].end
,im_buf
[i
].target_id
,
403 im_buf
[i
].target_start
,im_buf
[i
].target_end
))
404 free_interval_iterator
(it_alloc
)
407 def check_nonempty
(self):
409 raise IndexError('empty IntervalFileDB, not searchable!')
413 free_interval_dbfile
(self.db
)
416 def __dealloc__
(self):
417 'remember: dealloc cannot call other methods!'
419 free_interval_dbfile
(self.db
)
424 cdef class NLMSASliceLetters
:
425 'graph interface to letter graph within this region'
426 def __new__
(self,NLMSASlice nlmsaSlice
):
427 self.nlmsaSlice
=nlmsaSlice
429 return NLMSASliceIterator
(self.nlmsaSlice
)
430 def __getitem__
(self,NLMSANode node
):
431 return node
.nodeEdges
()
433 'list of tuples (node,{target_node:edge})'
436 l
.append
((node
,node
.nodeEdges
()))
439 return iter
(self.items
())
444 cdef class NLMSASlice
:
445 def __new__
(self,NLMSASequence ns
not None
,int start
,int stop
,
446 int id= -1,int offset
=0,seq
=None
):
447 cdef int i
,j
,n
,start_max
,end_min
,start2
,stop2
,nseq
,istart
,istop
,localQuery
448 cdef NLMSASequence ns_lpo
449 cdef IntervalFileDBIterator it
,it2
450 cdef IntervalMap
*im
,*im2
453 if seq
is None
: # GET FROM NLMSASequence
455 self.nlmsaSequence
=ns
# SAVE BASIC INFO
456 self.nlmsa
= ns
.nlmsaLetters
459 self.offset
=offset
# ALWAYS STORE offset IN POSITIVE ORIENTATION
461 try: # USE PYTHON METHOD TO DO QUERY
462 id,ivals
=ns
.nlmsaLetters
.doSlice
(seq
) # doSlice() RETURNS RAW INTERVALS
463 self.id=id # SAVE OUR SEQUENCE'S nlmsa_id
464 it
=IntervalFileDBIterator
(start
,stop
,rawIvals
=ivals
) # STORE IN BINARY FMT
465 it2
=IntervalFileDBIterator
(start
,stop
) # HOLDER FOR SUBSEQUENT MERGE
466 localQuery
=0 # DO NOT PERFORM LOCAL QUERY CODE BELOW!!
467 except AttributeError:
469 if localQuery
: ################################## PERFORM LOCAL QUERY
474 if start
<0: # NEED TO TRANSLATE OFFSETS TO MINUS ORIENTATION
476 if ns
.nlmsaLetters
.pairwiseMode
==1: # TRANSLATE SEQ DIRECTLY TO LPO
477 it
=IntervalFileDBIterator
(start
,stop
,rawIvals
=((start
,stop
,ns
.id-1,
478 start
+offset
,stop
+offset
),))
479 n
=1 # JUST THE SINGLE IDENTITY MAPPING FROM SEQ TO LPO
480 else: # PERFORM NORMAL SEQ --> LPO QUERY
481 it
=IntervalFileDBIterator
(start
+offset
,stop
+offset
,ns
=ns
)
482 n
=it
.loadAll
() # GET ALL OVERLAPPING INTERVALS
484 raise nlmsa_utils
.EmptySliceError
('this interval is not aligned!')
485 for i
from 0 <= i
< n
: # CLIP INTERVALS TO FIT [start:stop]
486 it
.im_buf
[i
].start
=it
.im_buf
[i
].start
-offset
# XLATE TO SRC SEQ COORDS
487 it
.im_buf
[i
].end
=it
.im_buf
[i
].end
-offset
488 if stop
<it
.im_buf
[i
].end
: # TRUNCATE TO FIT WITHIN [start:stop]
489 it
.im_buf
[i
].target_end
= it
.im_buf
[i
].target_end \
490 +stop
-it
.im_buf
[i
].end
# CALCULATE NEW ENDPOINT
491 it
.im_buf
[i
].end
=stop
492 if start
>it
.im_buf
[i
].start
: # CALCULATE NEW STARTPOINT
493 it
.im_buf
[i
].target_start
=it
.im_buf
[i
].target_start \
494 +start
-it
.im_buf
[i
].start
495 it
.im_buf
[i
].start
=start
497 if ns
.is_lpo
: # TARGET INTERVALS MUST BE LPO, MUST MAP TO REAL SEQUENCES
498 it2
=IntervalFileDBIterator
(start
,stop
) # HOLDER FOR SUBSEQUENT MERGE
500 ns_lpo
=ns
.nlmsaLetters
.seqlist
[ns
.nlmsaLetters
.lpo_id
] # DEFAULT LPO
501 for i
from 0 <= i
< n
:
502 if it
.im_buf
[i
].target_id
!= ns_lpo
.id: # SWITCHING TO A DIFFERENT LPO?
503 ns_lpo
=ns
.nlmsaLetters
.seqlist
[it
.im_buf
[i
].target_id
]
504 if not ns_lpo
.is_lpo
:
505 raise ValueError('sequence mapped to non-LPO target??')
506 if it2
is None
: # NEED TO ALLOCATE NEW ITERATOR
507 it2
=IntervalFileDBIterator
(it
.im_buf
[i
].target_start
,
508 it
.im_buf
[i
].target_end
,ns
=ns_lpo
)
509 else: # JUST REUSE THIS ITERATOR WITHOUT REALLOCING MEMORY
510 it2
.restart
(it
.im_buf
[i
].target_start
,
511 it
.im_buf
[i
].target_end
,None
,ns_lpo
)
512 it2
.loadAll
() # GET ALL OVERLAPPING INTERVALS
513 if it2
.nhit
<=0: # NO HITS, SO TRY THE NEXT INTERVAL???
515 im2
=it2
.im_buf
# ARRAY FROM THIS ITERATOR
516 for j
from 0 <= j
< it2
.nhit
: # MAP EACH INTERVAL BACK TO ns
517 if it
.im_buf
[i
].target_start
>im2
[j
].start
: # GET INTERSECTION INTERVAL
518 start_max
=it
.im_buf
[i
].target_start
520 start_max
=im2
[j
].start
521 if it
.im_buf
[i
].target_end
<im2
[j
].end
:
522 end_min
=it
.im_buf
[i
].target_end
525 istart
=it
.im_buf
[i
].start
+start_max
-it
.im_buf
[i
].target_start
# SRC COORDS
526 istop
=it
.im_buf
[i
].start
+end_min
-it
.im_buf
[i
].target_start
527 start2
=im2
[j
].target_start
+start_max
-im2
[j
].start
# COORDS IN TARGET
528 stop2
=im2
[j
].target_start
+end_min
-im2
[j
].start
529 if im2
[j
].target_id
!=id or istart
!=start2 \
530 or ns
.nlmsaLetters
.pairwiseMode
==1: # DISCARD SELF-MATCH
531 it
.saveInterval
(istart
,istop
,im2
[j
].target_id
,start2
,stop2
) # SAVE IT!
532 assert ns_lpo
.id!=im2
[j
].target_id
535 raise nlmsa_utils
.EmptySliceError
('this interval is not aligned!')
536 it2
.copy
(it
) # COPY FULL SET OF SAVED INTERVALS
537 self.nseqBounds
=it2
.mergeSeq
() # MERGE TO ONE INTERVAL PER SEQUENCE ORIENTATION
538 self.seqBounds
=it2
.getIntervalMap
() # SAVE SORTED ARRAY & DETACH FROM ITERATOR
540 self.im
=it
.getIntervalMap
() # RELEASE THIS ARRAY FROM THE ITERATOR
541 self.n
=it
.nhit
# TOTAL #INTERVALS SAVED FROM JOIN
542 qsort
(self.im
,self.n
,sizeof(IntervalMap
),imstart_qsort_cmp
) # ORDER BY start
545 for i
from 0 <= i
< self.nseqBounds
: # COUNT NON-LPO SEQUENCES
546 if not ns
.nlmsaLetters
.seqlist
.is_lpo
(self.seqBounds
[i
].target_id
):
548 self.nrealseq
=n
# SAVE THE COUNT
550 try: # _cache_max=0 TURNS OFF CACHING...
551 cacheMax
=ns
.nlmsaLetters
.seqDict
._cache_max
552 except AttributeError:
553 cacheMax
=1 # ALLOW CACHING...
554 try: # SAVE OUR COVERING INTERVALS AS CACHE HINTS IF POSSIBLE...
555 saveCache
=ns
.nlmsaLetters
.seqDict
.cacheHint
556 except AttributeError:
557 cacheMax
=0 # TURN OFF CACHING
558 if cacheMax
>0: # CONSTRUCT & SAVE DICT OF CACHE HINTS: COVERING INTERVALS
561 try: # ADD A CACHE HINT FOR QUERY SEQ IVAL
562 seqID
=ns
.nlmsaLetters
.seqs
.getSeqID
(seq
) # GET FULL-LENGTH ID
563 cacheDict
[seqID
]=(self.start
,self.stop
)
566 for i
from 0 <= i
< self.nseqBounds
: # ONLY SAVE NON-LPO SEQUENCES
567 if not ns
.nlmsaLetters
.seqlist
.is_lpo
(self.seqBounds
[i
].target_id
):
568 cacheDict
[ns
.nlmsaLetters
.seqlist
.getSeqID
(self.seqBounds
[i
].target_id
)]=(self.seqBounds
[i
].target_start
,self.seqBounds
[i
].target_end
)
571 self.weakestLink
= nlmsa_utils
.SeqCacheOwner
()
572 saveCache
(cacheDict
, self.weakestLink
) # SAVE COVERING IVALS AS CACHE HINT
578 return "<NLMSASlice object at 0x%x (seq=%s)>" % (id(self), self.seq
.id,)
580 def __dealloc__
(self):
581 'remember: dealloc cannot call other methods!'
589 cdef object get_seq_interval
(self, NLMSA nl
, int targetID
,
590 int start
, int stop
):
591 'get seq interval and ensure cache owner keeps it in the cache'
593 ival
= nl
.seqInterval
(targetID
, start
, stop
)
594 else: # GET THE SEQUENCE OBJECT
595 ival
= nl
.seqlist
.getSeq
(targetID
)
596 try: # tell owner to keep it in the cache
597 self.weakestLink
.cache_reference
(ival
.pathForward
)
598 except AttributeError:
602 ########################################### ITERATOR METHODS
603 def edges
(self,mergeAll
=False
,**kwargs
):
604 'get list of tuples (srcIval,destIval,edge) aligned in this slice'
605 seqIntervals
=self.groupByIntervals
(mergeAll
=mergeAll
,**kwargs
)
606 ivals
=self.groupBySequences
(seqIntervals
,**kwargs
)
608 for ival1
,ival2
,mergeIntervals
in ivals
:
609 l
.append
((ival1
,ival2
,sequence
.Seq2SeqEdge
(self,ival2
,ival1
,mergeIntervals
)))
611 def items
(self,**kwargs
):
612 'get list of tuples (ival2,edge) aligned to this slice'
614 for ival1
,ival2
,edge
in self.edges
(**kwargs
):
615 l
.append
((ival2
,edge
))
617 def iteritems
(self,**kwargs
):
618 return iter
(self.items
(**kwargs
))
619 def keys
(self,mergeAll
=False
,**kwargs
):
620 'get list of intervals aligned to this slice according to groupBy options'
621 seqIntervals
=self.groupByIntervals
(mergeAll
=mergeAll
,**kwargs
)
622 ivals
=self.groupBySequences
(seqIntervals
,**kwargs
)
624 for ival1
,ival2
,mergeIntervals
in ivals
:
627 def __iter__
(self): # PYREX DOESNT ALLOW ARGS TO __iter__ !
628 return iter
(self.keys
())
629 def __getitem__
(self,k
):
630 return sequence
.Seq2SeqEdge
(self,k
)
631 def __setitem__
(self,k
,v
):
632 raise ValueError('''this NLMSA is read-only! Currently, you cannot add new
633 alignment intervals to an NLMSA after calling its build() method.''')
635 return self.nrealseq
# NUMBER OF NON-LPO SEQUENCE/ORIS ALIGNED HERE
638 ##################################### 1:1 INTERVAL METHODS
639 def matchIntervals
(self,seq
=None
):
640 '''get all 1:1 match intervals in this region of alignment
641 as list of tuples. if seq argument not None, only match intervals
642 for that sequence will be included. No clipping is performed.'''
645 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
647 target_id
=nl
.seqs
.getID
(seq
) # CHECK IF IN OUR ALIGNMENT
651 for i
from 0 <= i
<self.n
: # GET ALL STORED INTERVALS
652 if not nl
.seqlist
.is_lpo
(self.im
[i
].target_id
) and \
653 (target_id
<0 or self.im
[i
].target_id
==target_id
):
654 ival2
=nl
.seqInterval
(self.im
[i
].target_id
,self.im
[i
].target_start
,
655 self.im
[i
].target_end
)
656 if seq
is None
or ival2
.orientation
==seq
.orientation
:
657 ival1
=sequence
.absoluteSlice
(self.seq
,
658 self.im
[i
].start
,self.im
[i
].end
)
659 l
.append
((ival1
,ival2
)) # SAVE THE INTERVAL MATCH
662 ############################## MAXIMUM INTERVAL METHODS
663 cdef int findSeqBounds
(self,int id,int ori
):
664 'find the specified sequence / orientation using binary search'
665 cdef int left
,right
,mid
667 right
=self.nseqBounds
670 if self.seqBounds
[mid
].target_id
<id:
672 elif self.seqBounds
[mid
].target_id
>id:
674 elif ori
>0 and seqBounds
[mid
].target_start
<0:
676 elif ori
<0 and seqBounds
[mid
].target_start
>=0:
678 else: # MATCHES BOTH id AND ori
680 return -1 # FAILED TO FIND id,ori MATCH
682 def findSeqEnds
(self,seq
):
683 'get maximum interval of seq aligned in this interval'
686 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
687 id=nl
.seqs
.getID
(seq
) # CHECK IF IN OUR ALIGNMENT
688 i
=self.findSeqBounds
(id,seq
.orientation
) # FIND THIS id,ORIENTATION
690 raise KeyError('seq not aligned in this interval')
691 return self.get_seq_interval
(nl
, self.seqBounds
[i
].target_id
,
692 self.seqBounds
[i
].target_start
,
693 self.seqBounds
[i
].target_end
)
695 def generateSeqEnds
(self):
696 'get list of tuples (ival1,ival2,edge)'
699 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
701 for i
from 0 <= i
<self.nseqBounds
:
702 if nl
.seqlist
.is_lpo
(self.seqBounds
[i
].target_id
):
703 continue # DON'T RETURN EDGES TO LPO
704 #ival1=sequence.absoluteSlice(self.seq,self.seqBounds[i].start,
705 # self.seqBounds[i].end)
706 ival2
= self.get_seq_interval
(nl
, self.seqBounds
[i
].target_id
,
707 self.seqBounds
[i
].target_start
,
708 self.seqBounds
[i
].target_end
)
709 #l.append((ival1,ival2,sequence.Seq2SeqEdge(self,ival2,ival1)))
710 edge
= self[ival2
] # LET edge FIGURE OUT sourcePath FOR US
711 l
.append
((edge
.sourcePath
,ival2
,edge
))
714 ############################################## GROUP-BY METHODS
715 def groupByIntervals
(self,int maxgap
=0,int maxinsert
=0,
716 int mininsert
= 0,filterSeqs
=None
,filterList
=None
,
717 mergeMost
=False
,maxsize
=500000000,
718 mergeAll
=True
,ivalMethod
=None
,**kwargs
):
719 '''merge alignment intervals using "horizontal" group-by rules:
720 - maxgap (=0): longest gap allowed within a region
721 - maxinsert (=0): longest insert allowed within a region
722 - mininsert (=0): should be 0, to prevent cycles within a region
723 use negative values to allow some overlap / cycles.
724 - maxsize: upper bound on maximum size for interval merging
725 - mergeMost: merge, but with limits (10000,10000,-10,50000)
726 - mergeAll: merge everything without any limits
727 - filterSeqs (=None): dict of sequences to apply these rules to;
728 other sequences alignment will be ignored if
729 filterSeqs not None. Slower than the filterList option.
730 - filterList (=None): list of sequence intervals to mask the
731 result set by. Note: a single sequence should not be
732 represented by more than one interval in filterList, as only
733 one interval for each sequence will be used as the clipping region.
734 Significantly faster than filterSeqs.
735 - ivalMethod: a function to process the list of intervals
736 for each sequence (it can merge or split them in any way
738 - pAlignedMin: a fractional minimum alignment threshold e.g. (0.9)
739 - pIdentityMin: a fractional minimum identity threshold e.g. (0.9)
741 cdef int i
,j
,n
,gap
,insert
,targetStart
,targetEnd
,start
,end
,maskStart
,maskEnd
743 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
744 if mergeMost
: # BE REASONABLE: DON'T MERGE A WHOLE CHROMOSOME
747 mininsert
=-10 # ALLOW SOME OVERLAP IN INTERVAL ALIGNMENTS
749 if filterList
is not None
:
751 for seq
in filterList
: # CREATE AN INDEX OF SEQUENCE IDs TO KEEP
752 t
= nl
.seqs
.getIDcoords
(seq
)
753 targetDict
[t
[0]] = t
[1:] # SAVE START,STOP
755 for i
from 0 <= i
< self.n
: # LIST INTERVALS FOR EACH TARGET
756 if nl
.seqlist
.is_lpo
(self.im
[i
].target_id
):
757 continue # IT IS AN LPO, SO SKIP IT
758 start
=self.im
[i
].start
760 targetStart
=self.im
[i
].target_start
761 targetEnd
=self.im
[i
].target_end
762 if filterList
is not None
:
763 try: # CHECK IF SEQUENCE IS IN MASKING DICTIONARY
764 maskStart
,maskEnd
= targetDict
[self.im
[i
].target_id
]
766 continue # FILTER THIS SEQUENCE OUT OF THE RESULT SET
767 if start
>=maskEnd
or end
<=maskStart
: # NO OVERLAP
769 if start
<maskStart
: # CLIP START TO MASKED REGION
770 targetStart
= targetStart
+maskStart
-start
772 if end
>maskEnd
:# CLIP END TO MASKED REGION
773 targetEnd
= targetEnd
+maskEnd
-end
775 elif filterSeqs
is not None
: # CLIP TARGET SEQ INTERVAL
776 target
= self.get_seq_interval
(nl
, self.im
[i
].target_id
,
777 targetStart
, targetEnd
)
779 target
=filterSeqs
[target
] # PERFORM CLIPPING
780 except KeyError: # NO OVERLAP IN filterSeqs, SO SKIP
782 start
=start
+target
.start
-targetStart
# CLIP SOURCE SEQUENCE
783 end
=end
+target
.stop
-targetEnd
784 targetStart
=target
.start
# GET COORDS OF CLIPPED TARGET
785 targetEnd
=target
.stop
786 try: # ADD INTERVAL TO EXISTING LIST
787 seqIntervals
[self.im
[i
].target_id
] \
788 .append
([start
,end
,targetStart
,targetEnd
,None
])
789 except KeyError: # CREATE A NEW LIST FOR THIS TARGET
790 seqIntervals
[self.im
[i
].target_id
]= \
791 [[start
,end
,targetStart
,targetEnd
,None
]]
793 for i
,l
in seqIntervals
.iteritems
(): # MERGE INTERVALS FOR EACH SEQ
794 if ivalMethod
is not None
: # USER-SUPPLIED GROUPING FUNCTION
795 ivalMethod
(l
,nl
.seqlist
.getSeq
(i
),msaSlice
=self,maxgap
=maxgap
,
796 maxinsert
=maxinsert
,mininsert
=mininsert
,
797 filterSeqs
=filterSeqs
,mergeAll
=mergeAll
,**kwargs
)
798 continue # NO NEED TO APPLY GENERIC MERGING OPERATION BELOW
800 for j
from 1 <= j
< len(l
): # MERGE BY INDEL LENGTH RULES
801 gap
=l
[j
][0]-l
[n
][1] # current.start - last.end
802 insert
=l
[j
][2]-l
[n
][3] # current.target_start - last.target_end
803 if not mergeAll
and \
804 (gap
>maxgap
or insert
>maxinsert
or insert
<mininsert
805 or l
[j
][1]-l
[n
][0]>maxsize
806 or l
[j
][3]-l
[n
][2]>maxsize
):
807 n
=n
+1 # SPLIT, SO START A NEW INTERVAL
808 if n
<j
: # COPY START COORDS TO NEW SLOT
811 else: # INTERVALS MERGED: SAVE ORIGINAL 1:1 INTERVAL LIST
813 lastIval
= l
[n
][4][-1] # GET LAST 1:1 INTERVAL
814 except TypeError: # EMPTY LIST: CREATE ONE
815 if l
[n
][1]==l
[j
][0] and l
[n
][3]==l
[j
][2]: # NO GAP, SO MERGE
816 l
[n
][4] = [(l
[n
][0],l
[j
][1],l
[n
][2],l
[j
][3])]
817 else: # TWO SEPARATE 1:1 INTERVALS
818 l
[n
][4] = [tuple(l
[n
][:4]),tuple(l
[j
][:4])]
819 else: # SEE IF WE CAN FUSE TO LAST 1:1 INTERVAL
820 if lastIval
[1]==l
[j
][0] and lastIval
[3]==l
[j
][2]:
821 l
[n
][4][-1] = (lastIval
[0],l
[j
][1],lastIval
[2],l
[j
][3])
822 else: # GAP, SO JUST APPEND THIS 1:1 INTERVAL
823 l
[n
][4].append
(tuple(l
[j
][:4]))
824 if n
<j
: # COPY END COORDS TO CURRENT SLOT
827 del l
[n
+1:] # DELETE REMAINING UNMERGED INTERVALS
828 for m
in l
: # CULL SINGLETON 1:1 INTERVAL LISTS (DUE TO FUSION)
830 if len(m
[4])==1: # TWO INTERVALS MUST HAVE BEEN FUSED
831 m
[4] = None
# NO NEED TO KEEP SINGLETON!
834 # SEQUENCE MASKING BY CONSERVATION OR %ALIGNED CONSTRAINT
835 if 'pAlignedMin' in kwargs
or 'pIdentityMin' in kwargs
or \
836 'minAlignSize' in kwargs
or 'maxAlignSize' in kwargs
:
837 self.filterIvalConservation
(seqIntervals
,**kwargs
)
840 def conservationFilter
(self,seq
,m
,pIdentityMin
=None
,
841 minAlignSize
=None
,maxAlignSize
=None
,**kwargs
):
842 if minAlignSize
is not None
and m
[1]-m
[0]<minAlignSize
:
844 if maxAlignSize
is not None
and m
[1]-m
[0]>maxAlignSize
:
846 if pIdentityMin
is not None
:
847 seqEdge
=sequence
.Seq2SeqEdge
(self,sequence
.relativeSlice
(seq
,m
[2],m
[3]),
848 sequence
.absoluteSlice
(self.seq
,m
[0],m
[1]),m
[4])
849 t
= seqEdge
.conservedSegment
(pIdentityMin
=pIdentityMin
, # GET CLIPPED INTERVAL
850 minAlignSize
=minAlignSize
,**kwargs
)
853 mergeIntervals
= self.clip_interval_list
(t
[0],t
[1],m
[4]) # CLIP mergeIntervals
854 return list(t
)+[mergeIntervals
] # RECOMBINE
857 ## if pAlignedMin is not None and seqEdge.pAligned()<pAlignedMin:
858 ## return False # INTERVAL FAILED ALIGNMENT THRESHOLD, SO REMOVE IT
859 ## if pIdentityMin is not None and seqEdge.pIdentity()<pIdentityMin:
860 ## return False # INTERVAL FAILED CONSERVATION THRESHOLD, SO REMOVE IT
863 def filterIvalConservation
(self,seqIntervals
,pIdentityMin
=None
,
864 filterFun
=None
,**kwargs
):
868 if filterFun
is None
:
869 filterFun
=self.conservationFilter
870 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
871 pIdentityMin0
=pIdentityMin
872 for targetID
,l
in seqIntervals
.items
(): # MERGE INTERVALS FOR EACH SEQ
873 seq
= self.get_seq_interval
(nl
, targetID
, 0, 0) # GET THE SEQUENCE OBJECT
874 if pIdentityMin0
is not None
and not isinstance(pIdentityMin0
,types
.FloatType
):
876 pIdentityMin
=pIdentityMin0
[seq
] # LOOK UP DESIRED IDENTITY FOR THIS SEQ
878 del seqIntervals
[targetID
] # SO REMOVE TARGET ENTIRELY
879 continue # NO NEED TO PROCESS THIS TARGET ANY FURTHER
881 for i
from 0 <= i
<len(l
): # CHECK EACH INTERVAL FOR CONSERVATION THRESHOLD
882 newIval
=filterFun
(seq
,l
[i
],pIdentityMin
=pIdentityMin
,**kwargs
)
884 continue # l[i] FAILED FILTER CRITERIA, SO SKIP IT
885 l
[j
]=newIval
# COMPACT THE ARRAY: KEEP newIval IN LOCATION j
886 j
=j
+1 # KEEP THIS ARRAY ENTRY, SO INCREMENT COUNT OF ENTRIES
887 if j
==0: # NO INTERVALS FOR THIS SEQUENCE SURVIVED MASKING
888 del seqIntervals
[targetID
] # SO REMOVE TARGET ENTIRELY
889 elif j
<i
: # SOME INTERVALS REMOVED, SO SHRINK ITS LIST
890 del l
[j
:] # JUST TRUNCATE THE LIST TO ENTRIES THAT PASSED
892 def groupBySequences
(self,seqIntervals
,sourceOnly
=False
,
893 indelCut
=False
,seqGroups
=None
,minAligned
=1,
894 pMinAligned
=0.,seqMethod
=None
,**kwargs
):
895 '''merge groups of sequences using "vertical" group-by rules:
896 - seqGroups: a list of one or more lists of sequences to group.
897 Each group will be analyzed separately, as follows:
898 - sourceOnly: output intervals will be reported giving only
899 the corresponding interval on the source sequence; redundant
900 output intervals (mapping to the same source interval) are
901 culled. Has the effect of giving a single interval traversal
903 - indelCut: for sourceOnly mode, do not merge separate
904 intervals as reported by seqIntervals (in other words,
905 that the groupByIntervals analysis separated due to an indel).
906 - minAligned: the minimum #sequences that must be aligned to
907 the source sequence for masking the output. Regions below
908 this threshold are masked out; no intervals will be reported
910 - pMinAligned: the minimum fraction of sequences (out of the
911 total in the group) that must be aligned to the source
912 sequence for masking the output.
913 - seqMethod: you may supply your own function for grouping.
914 Called as seqMethod(bounds,seqs,**kwargs), where
915 bounds is a sorted list of
916 (ipos,isStart,i,ns,isIndel,(start,end,targetStart,targetEnd))
917 seqs is a list of sequences in the group.
918 Must return a list of (sourceIval,targetIval). See the docs.
920 cdef int i
,j
,start
,end
,targetStart
,targetEnd
,ipos
,id
923 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
924 if seqGroups
is None
:
925 seqGroups
=[seqIntervals
] # JUST USE THE WHOLE SET
927 import mapping
# GET ACCESS TO DictQueue CLASS
928 for seqs
in seqGroups
: # PROCESS EACH SEQ GROUP
931 for seq
in seqs
: # CONSTRUCT INTERVAL BOUNDS LIST
932 if isinstance(seq
,int): # seqIntervals USES INT INDEX VALUES
934 seq
= self.get_seq_interval
(nl
, id, 0, 0) # GET THE SEQUENCE OBJECT
935 else: # EXPECT USER TO SUPPLY ACTUAL SEQUENCE OBJECTS
936 id=nl
.seqs
.getID
(seq
)
937 seq
=seq
.pathForward
# ENSURE WE HAVE TOP-LEVEL SEQ OBJECT
939 ivals
=seqIntervals
[id]
940 except KeyError: # SEQUENCE NOT IN THIS ALIGNMENT REGION, SO SKIP
944 bounds
.append
((ival
[1],False
,j
,seq
,isIndel
,ival
))
945 bounds
.append
((ival
[0],True
,j
,seq
,isIndel
,ival
))
947 j
=j
+1 # SEQUENCE COUNTER ENSURES ORDER OF SEQS IN SORTED LIST
948 bounds
.sort
() # ASCENDING ORDER OF source_pos, SORT stop B4 start
949 if seqMethod
is not None
:
950 result
=result
+seqMethod
(bounds
,seqs
,sourceOnly
=sourceOnly
,
951 msaSlice
=self,minAligned
=minAligned
,
952 pMinAligned
=pMinAligned
,
953 indelCut
=indelCut
,**kwargs
)
954 continue # DON'T USE GENERIC GROUPING METHOD BELOW
955 seqStart
=mapping
.DictQueue
() # setitem PUSHES, delitem POPS
957 for bound
in bounds
: # GENERIC GROUPING: APPLY MASKING, sourceOnly
958 ipos
,isStart
,j
,seq
,isIndel
=bound
[0:5]
959 if isStart
: # INTERVAL START
960 seqStart
[seq
]=bound
[5] # JUST RECORD START OF INTERVAL
961 else: # INTERVAL STOP
962 start
,end
,targetStart
,targetEnd
,mergeIntervals
=bound
[5]
963 if maskStart
is not None
and not sourceOnly
: # SAVE TARGET IVAL
964 if maskStart
>start
: # TRUNCATE TARGET IVAL START
965 targetStart
=targetStart
+maskStart
-start
967 mergeIntervals
= self.clip_interval_list
(maskStart
,None
,mergeIntervals
)
968 result
.append
((sequence
.absoluteSlice
(self.seq
,start
,end
),
969 sequence
.relativeSlice
(seq
,targetStart
,
970 targetEnd
),mergeIntervals
))
971 del seqStart
[seq
] # POP THIS SEQ FROM START DICT
973 f
=len(seqStart
) # #ALIGNED SEQS IN THIS REGION
974 if f
<minAligned
or f
/len(seqs
)<pMinAligned
: # APPLY MASKING
975 if maskStart
is not None
:
976 if sourceOnly
: # JUST SAVE MERGED SOURCE INTERVAL
977 result
.append
(sequence
.absoluteSlice
(self.seq
,maskStart
,end
))
978 else: # REPORT TARGET IVALS WITHIN (maskStart,end) REGION
979 for seq
in seqStart
: # CANNOT USE items() BECAUSE THIS IS A QUEUE!
980 (start
,i
,targetStart
,targetEnd
,mergeIntervals
)=seqStart
[seq
]
982 if maskStart
>start
: # TRUNCATE TARGET IVAL START
983 targetStart
=targetStart
+maskStart
-start
986 if end
<i
: # TRUNCATE TARGET IVAL END
987 targetEnd
=targetEnd
+end
-i
990 mergeIntervals
= self.clip_interval_list
(maskStart
,end
,mergeIntervals
)
991 result
.append
((sequence
.absoluteSlice
(self.seq
,start
,end
),
992 sequence
.relativeSlice
(seq
,targetStart
,
993 targetEnd
),mergeIntervals
))
994 maskStart
=None
# REGION NOW BELOW THRESHOLD
995 elif maskStart
is None
:
996 maskStart
=ipos
# START OF REGION ABOVE THRESHOLD
997 if maskStart
is not None
and sourceOnly
and indelCut \
998 and isIndel
and maskStart
<ipos
:
999 result
.append
(sequence
.absoluteSlice
(self.seq
,maskStart
,ipos
))
1002 def clip_interval_list
(self,start
,end
,l
):
1003 'truncate list of 1:1 intervals using start,end'
1007 for srcStart
,srcEnd
,destStart
,destEnd
in l
:
1008 if (start
is not None
and start
>=srcEnd
) or (end
is not None
and end
<=srcStart
):
1010 if start
is not None
and start
>srcStart
:
1011 destStart
= destStart
+start
-srcStart
1013 if end
is not None
and end
<srcEnd
:
1014 destEnd
= destEnd
+end
-srcEnd
1016 result
.append
((srcStart
,srcEnd
,destStart
,destEnd
))
1022 ############################################## LPO REGION METHODS
1023 def split
(self,minAligned
=0,**kwargs
):
1024 '''Use groupByIntervals() and groupBySequences() methods to
1025 divide this slice into subslices using indel rules etc.'''
1026 seqIntervals
=self.groupByIntervals
(**kwargs
)
1027 kwargs
['sourceOnly']=True
1028 kwargs
['indelCut']=True
1029 ivals
=self.groupBySequences
(seqIntervals
,minAligned
=minAligned
,
1033 if ival
.start
==self.start
and ival
.stop
==self.stop
:
1034 l
.append
(self) # SAME INTERVAL, SO JUST RETURN self
1036 subslice
=NLMSASlice
(self.nlmsaSequence
,ival
.start
,ival
.stop
,
1037 self.id,self.offset
,self.seq
)
1042 def regions
(self,dummyArg
=None
,**kwargs
):
1043 '''get LPO region(s) corresponding to this interval
1044 Same group-by rules apply here as for the split() method.'''
1047 cdef NLMSASequence ns_lpo
1048 if self.nlmsaSequence
.is_lpo
: # ALREADY AN LPO REGION!
1049 return self.split
(**kwargs
) # JUST APPLY GROUP-BY RULES TO self
1050 nl
=self.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
1052 for i
from 0 <= i
<self.nseqBounds
:
1053 ns_lpo
=nl
.seqlist
[self.seqBounds
[i
].target_id
]
1054 if ns_lpo
.is_lpo
: # ADD ALL LPO INTERVALS ALIGNED HERE
1055 subslice
=NLMSASlice
(ns_lpo
,self.seqBounds
[i
].target_start
,
1056 self.seqBounds
[i
].target_end
)
1057 l
=l
+subslice
.split
(**kwargs
) # APPLY GROUP-BY RULES
1060 raise ValueError('no LPO in nlmsaSlice.seqBounds? Debug!')
1063 ########################################### LETTER METHODS
1065 'interface to individual LPO letters in this interval'
1067 return NLMSASliceLetters
(self)
1069 def __cmp__
(self,other
):
1070 if isinstance(other
,NLMSASlice
):
1071 return cmp(self.nlmsaSequence
,other
.nlmsaSequence
)
1076 'return list of raw numeric intervals in this slice'
1079 for i
from 0 <= i
< self.n
:
1080 l
.append
((self.im
[i
].start
,self.im
[i
].end
,self.im
[i
].target_id
,
1081 self.im
[i
].target_start
,self.im
[i
].target_end
))
1088 def advanceStartStop
(int ipos
,NLMSASlice nlmsaSlice
not None
,
1089 int istart
,int istop
):
1091 if istop
>=nlmsaSlice
.n
:
1092 raise IndexError('out of bounds')
1093 for i
from istop
<= i
< nlmsaSlice
.n
:
1094 if ipos
>=nlmsaSlice
.im
[i
].start
: # ENTERS THIS INTERVAL
1095 istop
= i
+ 1 # ADVANCE THE END MARKER
1097 break # BEYOND ipos, SEARCH NO FURTHR
1098 for i
from istart
<= i
< istop
: # FIND 1ST OVERLAP
1099 if ipos
<nlmsaSlice
.im
[i
].end
:
1108 cdef class NLMSASliceIterator
:
1109 'generate letters (nodes) in this LPO slice'
1110 def __new__
(self,NLMSASlice nlmsaSlice
not None
):
1111 self.nlmsaSlice
=nlmsaSlice
1112 self.ipos
= nlmsaSlice
.start
- 1
1118 self.ipos
= self.ipos
+ 1
1119 # ADJUST istart,istop TO OVERLAP ipos
1120 self.istart
,self.istop
= \
1121 advanceStartStop
(self.ipos
,self.nlmsaSlice
,self.istart
,self.istop
)
1122 if self.istart
>=self.istop
: # HMM, NO OVERLAPS TO ipos
1123 if self.istop
<self.nlmsaSlice
.n
: # ANY MORE INTERVALS?
1124 self.ipos
=self.nlmsaSlice
.im
[self.istop
].start
# START OF NEXT INTERVAL
1125 # ADJUST istart,istop TO OVERLAP ipos
1126 self.istart
,self.istop
= \
1127 advanceStartStop
(self.ipos
,self.nlmsaSlice
,self.istart
,self.istop
)
1129 raise StopIteration
# NO MORE POSITIONS IN THIS SLICE
1130 return NLMSANode
(self.ipos
,self.nlmsaSlice
,self.istart
,self.istop
)
1138 cdef class NLMSANode
:
1139 'interface to a node in NLMSA storage of LPO alignment'
1140 def __new__
(self,int ipos
,NLMSASlice nlmsaSlice
not None
,
1141 int istart
=0,int istop
= -1):
1144 self.nlmsaSlice
=nlmsaSlice
1145 nl
=nlmsaSlice
.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
1149 advanceStartStop
(ipos
,nlmsaSlice
,istart
,istart
) # COMPUTE PROPER BOUNDS
1150 self.istart
=istart
# SAVE BOUNDS
1152 self.id=ipos
# DEFAULT: ASSUME SLICE IS IN LPO...
1153 for i
from istart
<= i
< istop
:
1154 if nlmsaSlice
.im
[i
].start
<=ipos
and ipos
<nlmsaSlice
.im
[i
].end
:
1155 if nl
.seqlist
.is_lpo
(nlmsaSlice
.im
[i
].target_id
):
1156 self.id=nlmsaSlice
.im
[i
].target_start
+ipos
-nlmsaSlice
.im
[i
].start
#LPO ipos
1157 else: # DON'T COUNT THE LPO SEQUENCE
1160 ############################################# ALIGNED LETTER METHODS
1166 nl
=self.nlmsaSlice
.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
1168 for i
from self.istart
<= i
< self.istop
:
1169 if self.nlmsaSlice
.im
[i
].start
<=self.ipos \
1170 and self.ipos
<self.nlmsaSlice
.im
[i
].end \
1171 and not nl
.seqlist
.is_lpo
(self.nlmsaSlice
.im
[i
].target_id
):
1172 j
=self.nlmsaSlice
.im
[i
].target_start
+self.ipos
-self.nlmsaSlice
.im
[i
].start
1173 l
.append
(nl
.seqInterval
(self.nlmsaSlice
.im
[i
].target_id
,j
,j
+1))
1176 def getSeqPos
(self,seq
):
1177 'return seqpos for this seq at this node'
1180 id=self.nlmsaSlice
.nlmsaSequence
.nlmsaLetters
.seqs
.getID
(seq
)
1182 raise KeyError('seq not in this alignment')
1183 for i
from self.istart
<= i
< self.istop
:
1184 if self.nlmsaSlice
.im
[i
].target_id
==id: # RETURN THE SEQUENCE INTERVAL
1185 j
=self.nlmsaSlice
.im
[i
].target_start
+self.ipos
-self.nlmsaSlice
.im
[i
].start
1186 return sequence
.absoluteSlice
(seq
,j
,j
+1)
1187 raise KeyError('seq not in node')
1190 "get this node's edges as list of tuples (ival1,ival2,edge)"
1193 ival1
=sequence
.absoluteSlice
(self.nlmsaSlice
.seq
,
1194 self.ipos
,self.ipos
+1) # OUR SEQ INTERVAL
1196 l
.append
((ival1
,ival2
,None
)) # GET EDGE INFO!!!
1199 def __getitem__
(self,seq
):
1200 raise NotImplementedError('hey! write some code here!')
1201 ## from lpo import POMSANodeRef # PROBABLY WONT NEED THIS AFTER RENAMING!!!
1203 ## s=self.getSeqPos(seq)
1204 ## return POMSANodeRef(self,seq.path)
1206 ## raise KeyError('seq not in node')
1208 def __cmp__
(self,other
):
1209 if isinstance(other
,NLMSANode
):
1210 return cmp((self.nlmsaSlice
,ipos
),(other
.nlmsaSlice
,ipos
))
1214 ########################################## NODE-TO-NODE EDGE METHODS
1215 cdef int check_edge
(self,int iseq
,int ipos
):
1217 for i
from self.istart
<= i
< self.istop
:
1218 if self.nlmsaSlice
.im
[i
].start
<=self.ipos \
1219 and self.ipos
<self.nlmsaSlice
.im
[i
].end \
1220 and self.nlmsaSlice
.im
[i
].target_id
==iseq \
1221 and self.nlmsaSlice
.im
[i
].target_start
==ipos
:
1223 return 0 # NO MATCH!
1225 def getEdgeSeqs
(self,NLMSANode other
):
1226 "return dict of sequences that traverse edge from self -> other"
1229 nl
=self.nlmsaSlice
.nlmsaSequence
.nlmsaLetters
# GET TOPLEVEL LETTERS OBJECT
1231 if self.id+1==other
.id: #other ADJACENT IN LPO
1232 for i
from self.istart
<= i
< self.istop
:
1233 if self.nlmsaSlice
.im
[i
].start
<=self.ipos
+1 \
1234 and self.ipos
+1<self.nlmsaSlice
.im
[i
].end
: # INTERVAL CONTAINS other
1235 seq
=nl
.seqlist
.getSeq
(self.nlmsaSlice
.im
[i
].target_id
)
1236 d
[seq
]=self.nlmsaSlice
.im
[i
].target_start
+self.ipos
-self.nlmsaSlice
.im
[i
].start
1237 elif self.ipos
+1==self.im
[i
].end \
1238 and other
.check_edge
(self.nlmsaSlice
.im
[i
].target_id
,
1239 self.nlmsaSlice
.im
[i
].target_end
):
1240 seq
=nl
.seqlist
.getSeq
(self.nlmsaSlice
.im
[i
].target_id
) # BRIDGE TO NEXT INTERVAL
1241 d
[seq
]=self.nlmsaSlice
.im
[i
].target_start
+self.ipos
-self.nlmsaSlice
.im
[i
].start
1242 else: # other NOT ADJACENT, SO INTERVALS THAT END HERE MIGHT JUMP TO other
1243 for i
from self.istart
<= i
< self.istop
:
1244 if self.ipos
+1==self.nlmsaSlice
.im
[i
].end \
1245 and other
.check_edge
(self.nlmsaSlice
.im
[i
].target_id
,
1246 self.nlmsaSlice
.im
[i
].target_end
):
1247 seq
=nl
.seqlist
.getSeq
(self.nlmsaSlice
.im
[i
].target_id
) # BRIDGE TO NEXT INTERVAL
1248 d
[seq
]=self.nlmsaSlice
.im
[i
].target_start
+self.ipos
-self.nlmsaSlice
.im
[i
].start
1251 def nodeEdges
(self):
1252 'get all outgoing edges from this node'
1253 cdef int i
,has_continuation
1254 has_continuation
= -1
1256 nodes
={} # LIST OF TARGET NODES WE HAVE AN EDGE TO
1257 for i
from self.istart
<= i
< self.nlmsaSlice
.n
:
1258 if i
>=self.istop
and len(d
)==0: # NO FURTHER CHECKS TO DO
1260 if self.nlmsaSlice
.im
[i
].start
<=self.ipos
and \
1261 self.ipos
<self.nlmsaSlice
.im
[i
].end
-1:
1262 has_continuation
=i
# ipos INSIDE THIS INTERVAL, SO HAS EDGE TO ipos+1
1263 elif self.ipos
==self.nlmsaSlice
.im
[i
].end
-1: # END OF THIS INTERVAL
1264 d
[self.nlmsaSlice
.im
[i
].target_id
]=self.nlmsaSlice
.im
[i
].target_end
1266 try: # CHECK FOR START OF AN "ADJACENT" INTERVAL
1267 if d
[self.nlmsaSlice
.im
[i
].target_id
]==self.nlmsaSlice
.im
[i
].target_start
:
1268 nodes
[self.nlmsaSlice
.im
[i
].start
]=i
1269 del d
[self.nlmsaSlice
.im
[i
].target_id
] # REMOVE FROM ADJACENCY LIST
1272 if has_continuation
>=0:
1273 nodes
[self.ipos
+1]=has_continuation
1276 node
=NLMSANode
(i
,self.nlmsaSlice
,self.istart
)
1277 result
[node
]=sequence
.LetterEdge
(self,node
) # EDGE OBJECT
1285 cdef class NLMSASequence
:
1286 'sequence interface to NLMSA storage of an LPO alignment'
1287 def __init__
(self,NLMSA nl
not None
,filestem
,seq
,mode
='r',is_union
=0,
1289 self.nlmsaLetters
=nl
1290 self.filestem
=filestem
1291 self.is_union
=is_union
1292 self.is_lpo
=0 # DEFAULT: NOT AN LPO
1294 if length
is not None
: # ALLOW USER TO SUPPLY A LENGTH FOR THIS COORD SYSTEM
1297 if isinstance(seq
,types
.StringType
):
1298 self.name
=seq
# ALLOW USER TO BUILD INDEXES WITH A STRING NAME
1299 elif seq
is not None
: # REGULAR SEQUENCE
1300 seq
= seq
.pathForward
# GET THE WHOLE SEQUENCE, IN FORWARD ORIENTATION
1301 try: # MAKE SURE seq HAS A UNIQUE NAME FOR INDEXING IT...
1302 self.name
=str(seq
.path
.name
)
1303 except AttributeError:
1305 self.name
=str(seq
.path
.id)
1306 except AttributeError:
1307 raise AttributeError('NLMSASequence: seq must have name or id attribute')
1309 self.length
=0 # LPO AND UNION SEQUENCES EXPAND AUTOMATICALLY
1312 if len(nl
.lpoList
)>0: # CALCULATE OFFSET OF NEW LPO, BASED ON LAST LPO
1313 lastLPO
=nl
.lpoList
[-1]
1314 self.offset
=lastLPO
.offset
+lastLPO
.length
1317 nl
.lpoList
.append
(self) # ADD TO THE LPO LIST
1318 self.idb
=None
# DEFAULT: NOT USING IN-MEMORY DATABASE.
1319 self.db
=None
# DEFAULT: WAIT TO OPEN DB UNTIL ACTUALLY NEEDED
1320 if mode
=='r': # IMMEDIATELY OPEN DATABASE, UNLIKE onDemand MODE
1321 self.db
=IntervalFileDB
(filestem
,mode
)
1322 elif mode
=='memory': # OPEN IN-MEMORY DATABASE
1323 self.idb
=IntervalDB
()
1324 elif mode
=='w': # WRITE .build FILE
1325 filename
=filestem
+'.build'
1326 self.build_ifile
=fopen
(filename
,'wb') # binary file
1327 if self.build_ifile
==NULL
:
1328 errmsg
='unable to open in write mode: '+filename
1329 raise IOError(errmsg
)
1332 def __dealloc__
(self):
1333 'remember: dealloc cannot call other methods!'
1334 if self.build_ifile
:
1335 fclose
(self.build_ifile
)
1337 def forceLoad
(self):
1338 'force database to be initialized, if not already open'
1339 self.db
=IntervalFileDB
(self.filestem
,'r')
1342 'free memory and close files associated with this sequence index'
1343 if self.db
is not None
:
1344 self.db
.close
() # CLOSE THE DATABASE, RELEASE MEMORY
1345 self.db
=None
# DISCONNECT FROM DATABASE
1346 if self.idb
is not None
:
1347 self.idb
.close
() # CLOSE THE DATABASE, RELEASE MEMORY
1348 self.idb
=None
# DISCONNECT FROM DATABASE
1349 if self.build_ifile
:
1350 fclose
(self.build_ifile
)
1351 self.build_ifile
=NULL
1353 def buildFiles
(self,**kwargs
):
1354 'build nested list from saved unsorted alignment data'
1356 if self.build_ifile
==NULL
:
1357 raise IOError('not opened in write mode')
1358 fclose
(self.build_ifile
)
1359 self.build_ifile
=NULL
1360 filename
=self.filestem
+'.build'
1361 db
=IntervalDB
() # CREATE EMPTY NL IN MEMORY
1363 db
.buildFromUnsortedFile
(filename
,self.nbuild
,**kwargs
) # BUILD FROM .build
1364 db
.write_binaries
(self.filestem
) # SAVE AS IntervalDBFile
1365 db
.close
() # DUMP NESTEDLIST FROM MEMORY
1367 os
.remove
(filename
) # REMOVE OUR .build FILE, NO LONGER NEEDED
1368 self.db
=IntervalFileDB
(self.filestem
) # NOW OPEN THE IntervalFileDB
1369 return self.nbuild
# return count of intervals
1371 def buildInMemory
(self, **kwargs
):
1373 n
= len(self.buildList
)
1377 self.idb
.save_tuples
(self.buildList
,**kwargs
)
1378 self.buildList
= None
1381 cdef int saveInterval
(self,IntervalMap im
[],int n
,int expand_self
,FILE
*ifile
):
1384 raise IOError('not opened in write mode')
1385 if expand_self
: # AN LPO THAT EXPANDS AS WE ADD TO IT...
1386 for i
from 0 <= i
< n
:
1388 if im
[i
].end
>self.length
: # EXPAND IT...
1389 self.length
=im
[i
].end
1390 elif -(im
[i
].start
)>self.length
:
1391 self.length
= -(im
[i
].start
) # THIS HANDLES NEGATIVE ORI CASE
1392 i
=write_padded_binary
(im
,n
,1,ifile
)
1394 raise IOError('write_padded_binary failed???')
1397 def __setitem__
(self,k
,t
): # SAVE TO .build FILE
1398 'save mapping [k.start:k.stop] --> (id,start,stop)'
1400 cdef IntervalMap im_tmp
1401 if self.build_ifile
: # SAVE TO BUILD FILE
1402 im_tmp
.start
,im_tmp
.end
=(k
.start
,k
.stop
)
1403 im_tmp
.target_id
,im_tmp
.target_start
,im_tmp
.target_end
=t
1405 i
=self.saveInterval
(&im_tmp
,1,self.is_lpo
,self.build_ifile
)
1406 #logger.debug('saveInterval: %s %s %s %s %s %s' % (self.id, im_tmp.start, im_tmp.end,
1407 # im_tmp.target_id, im_tmp.target_start, im_tmp.target_end))
1408 self.nbuild
=self.nbuild
+i
# INCREMENT COUNTER OF INTERVALS SAVED
1409 elif self.nlmsaLetters
.in_memory_mode
:
1410 t
=(k
.start
,k
.stop
)+t
1412 self.buildList
.append
(t
)
1413 except AttributeError:
1416 raise ValueError('not opened in write mode')
1418 def __getitem__
(self,k
):
1420 if k
.pathForward
is self.seq
:
1421 return NLMSASlice
(self,k
.start
,k
.stop
)
1422 except AttributeError: pass
1423 raise KeyError('key must be a sequence interval of this sequence')
1426 'call len(self.seq) if we have a seq. Otherwise self.length'
1427 if self.seq
is None
:
1430 return len(self.seq
)
1432 def __iadd__
(self,seq
):
1433 'add sequence to our union'
1434 try: # CHECK WHETHER THIS IS ALREADY IN THE INDEX
1435 x
=self.nlmsaLetters
.seqs
[seq
]
1436 return self # ALREADY IN THE INDEX, NO NEED TO ANYTHING
1437 except KeyError: # OK, WE REALLY DO NEED TO ADD IT...
1439 # CHECK FOR OVERFLOW... CREATE A NEW UNION IF NEEDED
1440 seq
=seq
.pathForward
# GET THE ENTIRE SEQUENCE
1441 if self.length
+len(seq
)>self.nlmsaLetters
.maxlen
: # TOO BIG!
1442 if self.nlmsaLetters
.pairwiseMode
: # NEED TO CREATE CORRESPONDING LPO
1443 ns
=self.nlmsaLetters
.newSequence
(None
) # CREATE NEW LPO
1444 ns
=self.nlmsaLetters
.newSequence
(None
,is_union
=1) # NEW UNION
1445 ns
.__iadd__
(seq
) # ADD seq TO BRAND-NEW UNION
1446 return ns
# RETURN THE NEW UNION COORDINATE SYSTEM
1447 # USE OUR EXISTING UNION
1448 self.nlmsaLetters
.seqs
.saveSeq
(seq
,self.id,self.length
)
1449 self.length
=self.length
+len(seq
) # EXPAND COORDINATE SYSTEM
1450 return self # iadd MUST ALWAYS RETURN self!!!
1458 'toplevel interface to NLMSA storage of an LPO alignment'
1459 def __init__
(self,pathstem
='',mode
='r',seqDict
=None
,mafFiles
=None
,axtFiles
=None
,
1460 maxOpenFiles
=1024,maxlen
=None
,nPad
=1000000,maxint
=41666666,
1461 trypath
=None
,bidirectional
=True
,pairwiseMode
= -1,
1462 bidirectionalRule
=nlmsa_utils
.prune_self_mappings
,
1463 use_virtual_lpo
=None
,maxLPOcoord
=None
,
1464 inverseDB
=None
, alignedIvals
=None
, **kwargs
):
1466 import resource
# WE MAY NEED TO OPEN A LOT OF FILES...
1467 resource
.setrlimit
(resource
.RLIMIT_NOFILE
,(maxOpenFiles
,-1))
1468 except: # BUT THIS IS OPTIONAL...
1470 self.lpoList
=[] # EMPTY LIST OF LPO
1471 self.seqs
=nlmsa_utils
.NLMSASeqDict
(self,pathstem
,mode
,**kwargs
)
1472 self.seqlist
=self.seqs
.seqlist
1473 self.pathstem
=pathstem
1474 self.inverseDB
= inverseDB
1476 maxlen
=C_int_max
-65536 # C_int_max MAXIMUM VALUE REPRESENTABLE BY int
1477 if axtFiles
is not None
:
1481 self._ignoreShadowAttr
={'sourceDB':None
,'targetDB':None
} # SCHEMA INFO
1482 self.seqDict
=seqDict
# SAVE FOR USER TO ACCESS...
1483 self.in_memory_mode
=0
1485 self.is_bidirectional
=1
1487 self.is_bidirectional
=0
1488 if use_virtual_lpo
is not None
: # DEPRECATED: MUST HAVE COME FROM PICKLE...
1489 pairwiseMode
= use_virtual_lpo
# USE SETTING FROM PICKLE...
1490 if pairwiseMode
is True
:
1492 elif pairwiseMode
is False
:
1495 self.pairwiseMode
=pairwiseMode
1496 if maxLPOcoord
is not None
: # SIZE OF THE ENTIRE LPO COORD SYSTEM
1497 self.maxLPOcoord
=maxLPOcoord
1498 if self.pairwiseMode
==1:
1499 raise ValueError('maxLPOcoord and pairwiseMode options incompatible!')
1501 else: # DEFAULT: RESTRICT USER TO A SINGLE LPO
1502 self.maxLPOcoord
=self.maxlen
1503 if mode
=='r': # OPEN FROM DISK FILES
1504 if self.seqDict
is None
:
1505 self.seqDict
= nlmsa_utils
.read_seq_dict
(pathstem
,trypath
)
1506 self.read_indexes
(self.seqDict
)
1508 elif mode
=='w': # WRITE TO DISK FILES
1511 if mafFiles
is not None
:
1512 self.newSequence
() # CREATE INITIAL LPO
1513 self.readMAFfiles
(mafFiles
,maxint
)
1514 elif axtFiles
is not None
:
1515 self.newSequence
() # CREATE INITIAL LPO
1516 self.readAxtNet
(axtFiles
,bidirectionalRule
)
1517 else: # USER WILL ADD INTERVAL MAPPINGS HIMSELF...
1518 if self.seqDict
is None
:
1520 self.seqDict
=seqdb
.SeqPrefixUnionDict
(addAll
=True
)
1521 self.initLPO
() # CREATE AS MANY LPOs AS WE NEED
1522 self.newSequence
(is_union
=1) # SO HE NEEDS AN INITIAL UNION
1523 if alignedIvals
is not None
:
1524 self.add_aligned_intervals
(alignedIvals
)
1526 elif mode
=='memory': # CONSTRUCT IN-MEMORY
1527 if self.seqDict
is None
:
1529 self.seqDict
=seqdb
.SeqPrefixUnionDict
(addAll
=True
)
1530 self.in_memory_mode
=1
1532 self.initLPO
() # CREATE AS MANY LPOs AS WE NEED
1533 self.newSequence
(is_union
=1) # CREATE INITIAL UNION
1535 if alignedIvals
is not None
:
1536 self.add_aligned_intervals
(alignedIvals
)
1538 elif mode
!='xmlrpc':
1539 raise ValueError('unknown mode %s' % mode
)
1542 'close our shelve index files'
1543 cdef NLMSASequence ns
1544 for ns
in self.seqlist
: # tell each seq to close its index files
1548 def __reduce__
(self): ############################# SUPPORT FOR PICKLING
1550 return (classutil
.ClassicUnpickler
, (self.__class__
,self.__getstate__
()))
1551 def __getstate__
(self):
1552 if self.in_memory_mode
:
1553 raise ValueError("can't pickle NLMSA.in_memory_mode")
1554 return dict
(pathstem
=self.pathstem
,seqDict
=self.seqDict
,
1555 inverseDB
=self.inverseDB
)
1556 def __setstate__
(self,state
):
1557 self.__init__
(**state
) #JUST PASS KWARGS TO CONSTRUCTOR
1559 def read_indexes
(self,seqDict
):
1560 'open all nestedlist indexes in this LPO database for immediate use'
1561 cdef NLMSASequence ns
1563 ifile
=file
(self.pathstem
+'.NLMSAindex', 'rU') # text file
1565 ifile
=file
(self.pathstem
+'NLMSAindex', 'rU') # FOR BACKWARDS COMPATIBILITY
1568 id,name
,is_union
,length
=line
.strip
().split
('\t')
1570 is_union
=int(is_union
)
1571 if id!=len(self.seqlist
):
1572 raise IOError('corrupted NLMSAIndex???')
1573 filestem
=self.pathstem
+str(id)
1574 seq
=None
# DEFAULT: NO ACTUAL SEQUENCE ASSOCIATED WITH LPO OR UNION
1575 if name
=='NLMSA_LPO_Internal': # AN LPO REFERENCE
1577 elif not is_union
: # REGULAR SEQUENCE
1581 raise KeyError('unable to find sequence %s in seqDict!' % name
)
1582 # CREATE THE SEQ INTERFACE, BUT DELAY OPENING THE IntervalDBFile
1583 ns
=NLMSASequence
(self,filestem
,seq
,'onDemand',is_union
) # UNTIL NEEDED
1584 ns
.length
=int(length
) # SAVE STORED LENGTH
1585 self.addToSeqlist
(ns
,seq
)
1588 def read_attrs
(self):
1589 'read pickled attribute dictionary from file and apply to self'
1592 ifile
= file
(self.pathstem
+'.attrDict', 'rb') # pickle is binary file!
1593 except IOError: # BACKWARDS COMPATIBILITY: OLD NLMSA HAS NOT ATTRDICT
1596 d
= pickle
.load
(ifile
)
1597 for k
,v
in d
.items
():
1598 if k
=='is_bidirectional':
1599 self.is_bidirectional
= v
1600 elif k
=='pairwiseMode':
1601 self.pairwiseMode
= v
1606 def addToSeqlist
(self,NLMSASequence ns
,seq
=None
):
1607 'add an NLMSASequence to our seqlist, and set its id'
1608 ns
.id=self.seqlist
.nextID
()
1609 self.seqs
[seq
]=ns
# SAVE TO OUR INDEX
1611 def newSequence
(self,seq
=None
,is_union
=0):
1612 'create a new nestedlist index for sequence seq'
1613 cdef NLMSASequence ns
1614 id=self.seqlist
.nextID
() # USE ITS INDEX AS UNIQUE ID FOR THIS SEQUENCE
1615 if self.pairwiseMode
==1 and is_union
and id>1: # NEED A VIRTUAL_LPO FOR THIS UNION
1616 self.newSequence
() # CREATE AN LPO
1617 id=self.seqlist
.nextID
() # NOW GET ID FOR THE UNION
1618 filestem
=self.pathstem
+str(id)
1619 if self.in_memory_mode
:
1623 ns
=NLMSASequence
(self,filestem
,seq
,mode
,is_union
=is_union
) #OPEN FOR WRITING
1624 if is_union
: # RECORD THIS AS OUR CURRENT UNION OBJECT
1625 self.currentUnion
=ns
1626 self.addToSeqlist
(ns
,seq
) # SAVE TO OUR INDEX
1627 #logger.debug('Opened build file for ns_id %s, is_union %s' % (ns.id,
1632 'get a unique nlmsaID and advance counter'
1633 nlmsaID
=self.inlmsa
# USE THE NEXT FREE ID
1634 self.inlmsa
=self.inlmsa
+1 # ADVANCE THE COUNTER
1638 'create LPOs for this alignment'
1639 cdef NLMSASequence ns
1641 while offset
<self.maxLPOcoord
:
1642 ns
=self.newSequence
() # CREATE AN LPO
1643 ns
.offset
=offset
# FORCE OUR DESIRED OFFSET... EVEN THOUGH ALL LPOs EMPTY
1644 offset
=offset
+self.maxlen
1645 def init_pairwise_mode
(self):
1646 'turn on use of virtual LPO mapping (i.e. no actual LPO is present!)'
1647 if self.pairwiseMode
==0:
1648 raise ValueError('this alignment is already using an LPO!')
1649 elif self.pairwiseMode
!=1: # NOT ALREADY SET TO PAIRWISE MODE
1651 Because you are aligning a pair of sequence intervals,
1652 the pairwiseMode=True option is automatically being applied.
1653 To avoid this message in the future, pass the pairwiseMode=True
1654 option to the NLMSA constructor.
1655 See the NLMSA documentation for more details.\n''')
1656 self.pairwiseMode
=1 # TURN ON THE VIRTUAL LPO FEATURE
1659 def __setitem__
(self,k
,v
): # THIS METHOD EXISTS ONLY FOR nlmsa[s1]+=s2
1660 if isinstance(v
,nlmsa_utils
.BuildMSASlice
):
1661 if v
.seq
is not None
and v
.seq
==k
: # CASE WHERE k IS A SEQ INTERVAL
1662 return # MATCHES += USAGE: NO NEED TO DO ANYTHING!
1663 elif v
.is_lpo
and isinstance(k
,slice) \
1664 and k
.start
==v
.start
and k
.stop
==v
.stop
: # CASE: k IS A slice
1665 return # MATCHES += USAGE: NO NEED TO DO ANYTHING!
1666 raise KeyError('Usage only nlmsa[s1]+=s2 allowed. nlmsa[s1]=s2 forbidden!')
1667 def __getitem__
(self,k
):
1668 'return a slice of the LPO'
1669 if isinstance(k
,sequence
.SeqPath
): # TREAT k AS A SEQUENCE INTERVAL
1670 id,ns
,offset
=self.seqs
[k
] # GET UNION INFO FOR THIS SEQ
1672 return nlmsa_utils
.BuildMSASlice
(ns
,k
.start
,k
.stop
,id,offset
,0,k
)
1673 else: # QUERY THE ALIGNMENT
1675 return NLMSASlice
(ns
,k
.start
,k
.stop
,id,offset
,k
)
1676 except nlmsa_utils
.EmptySliceError
:
1677 return nlmsa_utils
.EmptySlice
(k
)
1678 try: # TREAT k AS A PYTHON SLICE OBJECT
1680 except AttributeError:
1681 raise KeyError('key must be a sequence interval or python slice object')
1683 return nlmsa_utils
.BuildMSASlice
(self.lpoList
[0],k
.start
,k
.stop
,-1,0,1,None
)
1684 else: # QUERY THE ALIGNMENT
1685 l
=nlmsa_utils
.splitLPOintervals
(self.lpoList
,k
) # MAP TO LPO(S)
1687 raise ValueError('Sorry! Query interval spans multiple LPOs!')
1688 for ns
,myslice
in l
: # ONLY RETURN ONE SLICE OBJECT
1689 return NLMSASlice
(ns
,myslice
.start
,myslice
.stop
)
1692 raise NotImplementedError, 'you cannot iterate over NLMSAs'
1694 def edges
(self, *args
, **kwargs
):
1695 return nlmsa_utils
.generate_nlmsa_edges
(self, *args
, **kwargs
)
1697 def __iadd__
(self,seq
):
1698 'add seq to our union'
1699 self.seqs
.saveSeq
(seq
)
1700 return self # iadd MUST ALWAYS RETURN self!
1701 def addAnnotation
(self,a
):
1702 'save alignment of sequence interval --> an annotation object'
1703 ival
= a
.sequence
# GET PURE SEQUENCE INTERVAL
1704 self.__iadd__
(ival
) # ADD SEQ AS A NODE IN OUR ALIGNMENT
1705 self[ival
].__iadd__
(a
) # ADD ALIGNMENT BETWEEN ival AND ANNOTATION
1706 def add_aligned_intervals
(self, alignedIvals
):
1707 'add alignedIvals to this alignment'
1708 nlmsa_utils
.add_aligned_intervals
(self, alignedIvals
)
1710 cdef void free_seqidmap
(self,int nseq0
,SeqIDMap
*seqidmap
):
1712 for i
from 0 <= i
<nseq0
: # DUMP STRING STORAGE FOR SEQUENCE IDENTIFIERS
1713 free
(seqidmap
[i
].id)
1714 free
(seqidmap
) # WE CAN NOW FREE THE SEQUENCE LOOKUP ARRAY
1716 cdef void save_nbuild
(self,int nbuild
[]):
1717 cdef NLMSASequence ns
1718 for ns
in self.seqlist
: # SAVE INTERVAL COUNTS BACK TO EACH SEQUENCE
1719 if not ns
.is_lpo
or self.pairwiseMode
==1:
1720 ns
.nbuild
=nbuild
[ns
.id] # SAVE INTERVAL COUNTS BACK TO REGULAR SEQUENCES
1721 #logger.debug('nbuild[%d] = %s' % (i, ns.nbuild))
1724 def readMAFfiles
(self,mafFiles
,maxint
):
1725 'read alignment from a set of MAF files'
1726 cdef int i
,j
,nseq0
,nseq1
,n
,nseq
,block_len
1727 cdef SeqIDMap
*seqidmap
1728 cdef char tmp
[32768],*p
,a_header
[4]
1730 cdef IntervalMap im
[4096],im_tmp
1731 cdef NLMSASequence ns_lpo
,ns
# ns IS OUR CURRENT UNION
1732 cdef FILE
*build_ifile
[4096]
1733 cdef int nbuild
[4096],has_continuation
1734 cdef long long linecode_count
[256]
1736 ns_lpo
=self.seqlist
[self.lpo_id
] # OUR INITIAL LPO
1737 self.pairwiseMode
=0 # WE ARE USING A REAL LPO!
1738 memset
(<void *>linecode_count
,0,sizeof(linecode_count
))
1741 nseq0
=len(self.seqDict
) # GET TOTAL #SEQUENCES IN ALL DATABASES
1742 seqidmap
=<SeqIDMap
*>calloc
(nseq0
,sizeof(SeqIDMap
)) # ALLOCATE ARRAY
1744 for pythonStr
,seqInfo
in self.seqDict
.seqInfoDict
.iteritems
():
1745 seqidmap
[i
].id=strdup
(pythonStr
)
1747 seqidmap
[i
].length
= seqInfo
.length
1748 except OverflowError:
1749 raise OverflowError('''Sequence too long for 32 bit int: %s, %d
1750 Something is probably wrong with creation / reading of this sequence.
1751 Check the input!''' % (pythonStr
, seqInfo
.length
))
1753 qsort
(seqidmap
,nseq0
,sizeof(SeqIDMap
),seqidmap_qsort_cmp
) # SORT BY id
1756 im_tmp
.sublist
= -1 # DEFAULT
1757 strcpy
(a_header
,"a ") # MAKE C STRING
1758 for filename
in mafFiles
:
1759 logger
.info
('Processing MAF file: ' + filename
)
1760 ifile
=fopen
(filename
,'r') # text file
1762 self.free_seqidmap
(nseq0
,seqidmap
)
1763 self.save_nbuild
(nbuild
)
1764 raise IOError('unable to open file %s' % filename
)
1765 if fgets
(tmp
,32767,ifile
)==NULL
or strncmp
(tmp
,"##maf",4): # HEADER LINE
1766 self.free_seqidmap
(nseq0
,seqidmap
)
1767 self.save_nbuild
(nbuild
)
1768 raise IOError('%s: not a MAF file? Bad format.' % filename
)
1769 p
=fgets
(tmp
,32767,ifile
) # READ 1ST DATA LINE OF THE MAF FILE
1770 while p
: # GOT ANOTHER LINE TO PROCESS
1771 if has_continuation
or 0==strncmp
(tmp
,a_header
,2): # ALIGNMENT HEADER: READ ALIGNMENT
1772 n
=readMAFrecord
(im
,0,seqidmap
,nseq0
,ns_lpo
.length
, # READ ONE MAF BLOCK
1773 &block_len
,ifile
,4096,linecode_count
,&has_continuation
)
1774 if n
<0: # UNRECOVERABLE ERROR OCCURRED...
1775 self.free_seqidmap
(nseq0
,seqidmap
)
1776 self.save_nbuild
(nbuild
)
1777 raise ValueError('MAF block too long! Increase max size')
1781 if self.maxlen
-ns_lpo
.length
<=block_len \
1782 or ns_lpo
.nbuild
>maxint
: # TOO BIG! MUST CREATE A NEW LPO
1783 j
=ns_lpo
.length
# RECORD THE OLD OFFSET
1784 ns_lpo
=self.newSequence
() # CREATE A NEW LPO SEQUENCE
1785 for i
from 0<= i
< n
: # TRANSLATE THESE INTERVALS BACK TO ZERO OFFSET
1786 if im
[i
].start
>=0: # FORWARD INTERVAL
1787 im
[i
].start
= im
[i
].start
- j
1788 im
[i
].end
= im
[i
].end
- j
1789 else: # REVERSE INTERVAL
1790 im
[i
].start
= im
[i
].start
+ j
1791 im
[i
].end
= im
[i
].end
+ j
1793 for i
from 0 <= i
< n
: # SAVE EACH INTERVAL IN UNION -> LPO MAP
1795 if seqidmap
[j
].nlmsa_id
<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1796 if ns
is None
or self.maxlen
-ns
.length
<=seqidmap
[j
].length
:
1797 ns
=self.newSequence
(None
,is_union
=1) # CREATE NEW UNION TO HOLD IT
1798 build_ifile
[ns
.id]=ns
.build_ifile
# KEEP PTR SO WE CAN WRITE DIRECTLY!
1800 seqidmap
[j
].ns_id
=ns
.id # SET IDs TO ADD THIS SEQ TO THE UNION
1801 seqidmap
[j
].nlmsa_id
=self.inlmsa
1802 seqidmap
[j
].offset
=ns
.length
1803 self.inlmsa
=self.inlmsa
+1 # ADVANCE SEQUENCE ID COUNTER
1804 ns
.length
=ns
.length
+seqidmap
[j
].length
# EXPAND UNION SIZE
1806 im
[i
].target_id
=seqidmap
[j
].nlmsa_id
# USE THE CORRECT ID
1807 if im
[i
].target_start
<0: # OFFSET REVERSE ORI
1808 im_tmp
.start
= -seqidmap
[j
].offset
+im
[i
].target_start
1809 im_tmp
.end
= -seqidmap
[j
].offset
+im
[i
].target_end
1810 else: # OFFSET FORWARD ORI
1811 im_tmp
.start
= seqidmap
[j
].offset
+im
[i
].target_start
1812 im_tmp
.end
= seqidmap
[j
].offset
+im
[i
].target_end
1813 im_tmp
.target_id
=ns_lpo
.id
1814 im_tmp
.target_start
=im
[i
].start
1815 im_tmp
.target_end
=im
[i
].end
1816 j
=seqidmap
[j
].ns_id
# USE NLMSA ID OF THE UNION
1817 ns_lpo
.saveInterval
(&im_tmp
,1,0,build_ifile
[j
]) # SAVE SEQ -> LPO
1818 nbuild
[j
]=nbuild
[j
]+1
1820 ns_lpo
.saveInterval
(im
,n
,1,ns_lpo
.build_ifile
) # SAVE LPO -> SEQ
1821 ns_lpo
.nbuild
=ns_lpo
.nbuild
+n
# INCREMENT COUNT OF SAVED INTERVALS
1822 if not has_continuation
:
1823 p
=fgets
(tmp
,32767,ifile
) # TRY TO READ ANOTHER LINE...
1824 fclose
(ifile
) # CLOSE THIS MAF FILE
1825 #logger.debug('nbuild[0] = ' + ns_lpo.nbuild)
1826 for i
from 0 <= i
<256: # PRINT WARNINGS ABOUT NON-ALIGNMENT LINES
1827 if linecode_count
[i
]>0:
1828 logger
.warn
("Non-alignment text lines ignored: prefix %s, count %d" %
1829 (chr(i
), linecode_count
[i
]))
1830 for i
from 0 <= i
<nseq0
: # INDEX SEQUENCES THAT WERE ALIGNED
1831 if seqidmap
[i
].nlmsa_id
>0: # ALIGNED, SO RECORD IT
1832 self.seqs
.saveSeq
(seqidmap
[i
].id,seqidmap
[i
].ns_id
,seqidmap
[i
].offset
,
1833 seqidmap
[i
].nlmsa_id
)
1834 self.free_seqidmap
(nseq0
,seqidmap
)
1835 self.save_nbuild
(nbuild
)
1836 self.build
() # WILL TAKE CARE OF CLOSING ALL build_ifile STREAMS
1838 cdef NLMSASequence add_seqidmap_to_union
(self,int j
,SeqIDMap seqidmap
[],
1839 NLMSASequence ns
,FILE
*build_ifile
[],
1841 cdef NLMSASequence ns_lpo
1842 if ns
is None
or self.maxlen
-ns
.length
<=seqidmap
[j
].length
:
1843 ns
= self.newSequence
(None
,is_union
=1) # CREATE NEW UNION TO HOLD IT
1844 build_ifile
[ns
.id] = ns
.build_ifile
# KEEP PTR SO WE CAN WRITE DIRECTLY!
1846 if self.pairwiseMode
==1: # ALSO BIND INFO FOR VIRTUAL LPO FOR THIS UNION
1847 ns_lpo
= self.seqs
.seqlist
[ns
.id - 1]
1848 build_ifile
[ns_lpo
.id] = ns_lpo
.build_ifile
1849 nbuild
[ns_lpo
.id] = 0
1850 seqidmap
[j
].ns_id
= ns
.id # SET IDs TO ADD THIS SEQ TO THE UNION
1851 seqidmap
[j
].nlmsa_id
= self.inlmsa
1852 seqidmap
[j
].offset
= ns
.length
1853 self.inlmsa
= self.inlmsa
+1 # ADVANCE SEQUENCE ID COUNTER
1854 ns
.length
= ns
.length
+seqidmap
[j
].length
# EXPAND UNION SIZE
1857 def readAxtNet
(self,axtFiles
,bidirectionalRule
):
1858 'read alignment from a set of axtnet files'
1859 cdef int i
,j
,nseq0
,n
,isrc
,is_bidirectional
1860 cdef SeqIDMap
*seqidmap
1861 cdef char tmp
[32768],*p
,comment
[4],src_prefix
[64],dest_prefix
[64]
1863 cdef IntervalMap im
[4096],im_tmp
1864 cdef NLMSASequence ns_src
# SOURCE UNION VS DEST UNION
1865 cdef FILE
*build_ifile
[4096]
1866 cdef int nbuild
[4096],has_continuation
1868 self.pairwiseMode
= 1 # WE ARE USING pairwiseMode
1870 nseq0
=len(self.seqDict
) # GET TOTAL #SEQUENCES IN ALL DATABASES
1871 seqidmap
=<SeqIDMap
*>calloc
(nseq0
,sizeof(SeqIDMap
)) # ALLOCATE ARRAY
1873 for pythonStr
,seqInfo
in self.seqDict
.seqInfoDict
.iteritems
():
1874 seqidmap
[i
].id=strdup
(pythonStr
)
1875 seqidmap
[i
].length
= seqInfo
.length
1877 qsort
(seqidmap
,nseq0
,sizeof(SeqIDMap
),seqidmap_qsort_cmp
) # SORT BY id
1880 im_tmp
.sublist
= -1 # DEFAULT
1881 strcpy
(comment
,"#") # MAKE C STRING
1884 for filename
in axtFiles
:
1885 logger
.info
('Processing axtnet file: ' + filename
)
1887 if filename
[-8:] == '.net.axt':
1888 t
= string
.split
(os
.path
.basename
(filename
)[:-8], '.')[-2:]
1889 elif filename
[-4:] == '.axt':
1890 t
= string
.split
(os
.path
.basename
(filename
)[:-4], '.')[-2:]
1892 raise IOError('%s is not correct axtNet file name. Correct name is (chrid.)source.target.net.axt.' % filename
)
1893 #t = prefix_fun(filename) # CALL PYTHON FUNCTION TO OBTAIN PREFIXES
1894 if bidirectionalRule
is None
: # DETERMINE IF UNI- VS. BI-DIRECTIONAL
1895 is_bidirectional
= self.is_bidirectional
# JUST USE GLOBAL SETTING
1896 else: # GET SETTING FROM USER-SUPPLIED FUNCTION
1897 is_bidirectional
= bidirectionalRule
(t
[0],t
[1],self.is_bidirectional
)
1898 strcpy
(src_prefix
,t
[0]) # KEEP THEM IN STATIC C STRINGS FOR SPEED
1899 strcpy
(dest_prefix
,t
[1])
1900 ifile
=fopen
(filename
,'r') # text file
1902 self.free_seqidmap
(nseq0
,seqidmap
)
1903 self.save_nbuild
(nbuild
)
1904 raise IOError('unable to open file %s' % filename
)
1906 n
= read_axtnet
(im
,seqidmap
,nseq0
,ifile
,4096,&isrc
,src_prefix
,dest_prefix
)
1907 if n
<0: # UNRECOVERABLE ERROR OCCURRED...
1908 self.free_seqidmap
(nseq0
,seqidmap
)
1909 self.save_nbuild
(nbuild
)
1910 raise ValueError('axtNet block too long! Increase max size')
1911 elif n
==0: # NO MORE DATA TO READ
1914 if seqidmap
[isrc
].nlmsa_id
<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1915 ns_src
= self.add_seqidmap_to_union
(isrc
,seqidmap
,ns_src
,build_ifile
,nbuild
)
1917 for i
from 0 <= i
< n
: # SAVE EACH INTERVAL IN SRC -> DEST MAP
1919 #logger.debug('A: %s %s %s %s %s %s' % (im[i].start, im[i].end,
1921 # im[i].target_start,
1924 #logger.debug('B: %s %s %s %s %s %s %s' % (seqidmap[isrc].nlmsa_id, i,
1925 # j, seqidmap[j].id,
1926 # seqidmap[j].ns_id,
1927 # seqidmap[j].offset,
1928 # seqidmap[j].nlmsa_id))
1929 if seqidmap
[j
].nlmsa_id
<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1930 ns_src
= self.add_seqidmap_to_union
(j
,seqidmap
,ns_src
,build_ifile
,nbuild
)
1931 im
[i
].target_id
= seqidmap
[j
].nlmsa_id
# USE THE CORRECT ID
1932 if is_bidirectional
: # SAVE DEST -> SRC ALIGNMENT MAPPING
1933 if im
[i
].target_start
<0: # OFFSET REVERSE ORI
1934 im_tmp
.start
= -seqidmap
[j
].offset
+im
[i
].target_start
1935 im_tmp
.end
= -seqidmap
[j
].offset
+im
[i
].target_end
1936 else: # OFFSET FORWARD ORI
1937 im_tmp
.start
= seqidmap
[j
].offset
+im
[i
].target_start
1938 im_tmp
.end
= seqidmap
[j
].offset
+im
[i
].target_end
1939 im_tmp
.target_id
= seqidmap
[isrc
].nlmsa_id
1940 im_tmp
.target_start
= im
[i
].start
1941 im_tmp
.target_end
= im
[i
].end
1942 #logger.debug('C: %s %s %s %s %s' % (im_tmp.target_id,
1943 # im_tmp.target_start,
1944 # im_tmp.target_end,
1945 # seqidmap[j].ns_id, j))
1946 j
=seqidmap
[j
].ns_id
-1 # SAVE ALL ALIGNMENTS TO THE VIRTUAL LPO
1947 ns_src
.saveInterval
(&im_tmp
,1,0,build_ifile
[j
]) # SAVE DEST -> SRC
1948 nbuild
[j
]=nbuild
[j
]+1
1949 if im
[i
].start
< 0: # OFFSET FORWARD ORI
1950 im
[i
].start
= -seqidmap
[isrc
].offset
+ im
[i
].start
1951 im
[i
].end
= -seqidmap
[isrc
].offset
+ im
[i
].end
1952 else: # OFFSET FORWARD ORI
1953 im
[i
].start
= seqidmap
[isrc
].offset
+ im
[i
].start
1954 im
[i
].end
= seqidmap
[isrc
].offset
+ im
[i
].end
1955 #logger.debug('D: %s %s %s %s %s %s' % (im_tmp.start, im_tmp.end,
1957 # im_tmp.target_start,
1958 # im_tmp.target_end,
1961 # SAVE THE RECORD. read_axtnet FUNCTION READS SRC/DEST AT THE SAME TIME
1962 j
= seqidmap
[isrc
].ns_id
-1 # SAVE ALL ALIGNMENTS TO THE VIRTUAL LPO
1963 ns_src
.saveInterval
(im
,n
,0,build_ifile
[j
]) # SAVE SRC -> DEST
1964 nbuild
[j
]=nbuild
[j
]+n
# INCREMENT COUNT OF SAVED INTERVALS
1966 fclose
(ifile
) # CLOSE THIS AXTNET FILE
1968 for i
from 0 <= i
<nseq0
: # INDEX SEQUENCES THAT WERE ALIGNED
1969 if seqidmap
[i
].nlmsa_id
>0: # ALIGNED, SO RECORD IT
1970 self.seqs
.saveSeq
(seqidmap
[i
].id,seqidmap
[i
].ns_id
,seqidmap
[i
].offset
,
1971 seqidmap
[i
].nlmsa_id
)
1972 self.free_seqidmap
(nseq0
,seqidmap
)
1973 self.save_nbuild
(nbuild
)
1974 self.build
() # WILL TAKE CARE OF CLOSING ALL build_ifile STREAMS
1977 def buildFiles
(self, saveSeqDict
=False
, **kwargs
):
1978 'build nestedlist databases on-disk, and .seqDict index if desired'
1979 cdef NLMSASequence ns
1980 self.seqs
.reopenReadOnly
() # SAVE INDEXES AND OPEN READ-ONLY
1982 ifile
=file
(self.pathstem
+'.NLMSAindex','w') # text file
1984 for ns
in self.seqlist
: # BUILD EACH IntervalFileDB ONE BY ONE
1985 ntotal
= ntotal
+ ns
.buildFiles
(**kwargs
)
1987 ifile
.write
('%d\t%s\t%d\t%d\n' %(ns
.id,'NLMSA_LPO_Internal',0,ns
.length
))
1989 ifile
.write
('%d\t%s\t%d\t%d\n' %(ns
.id,'NLMSA_UNION_Internal',1,ns
.length
))
1991 ifile
.write
('%d\t%s\t%d\t%d\n' %(ns
.id,ns
.name
,0,ns
.length
))
1995 raise nlmsa_utils
.EmptyAlignmentError
('empty alignment!')
1997 ifile
= file
(self.pathstem
+'.attrDict','wb') # pickle is binary file!
1999 pickle
.dump
(dict
(is_bidirectional
=self.is_bidirectional
,
2000 pairwiseMode
=self.pairwiseMode
),ifile
)
2003 logger
.info
('Index files saved.')
2005 self.save_seq_dict
()
2007 logger
.info
('''Note: the NLMSA.seqDict was not saved to a file.
2008 This is not necessary if you intend to save the NLMSA to worldbase.
2009 But if you wish to open this NLMSA independently of worldbase,
2010 you should call NLMSA.save_seq_dict() to save the seqDict info to a file,
2011 or in the future pass the saveSeqDict=True option to NLMSA.build().''')
2013 def save_seq_dict
(self):
2014 'save seqDict to a worldbase-aware pickle file'
2015 nlmsa_utils
.save_seq_dict
(self.pathstem
,self.seqDict
)
2017 def build
(self,**kwargs
):
2018 'build nestedlist databases from saved mappings and initialize for use'
2019 if self.do_build
==0:
2020 raise ValueError('not opened in write mode')
2021 try: # TURN OFF AUTOMATIC ADDING OF SEQUENCES TO OUR SEQDICT...
2022 self.seqDict
.addAll
=False
2023 except AttributeError: # THAT WAS PURELY OPTIONAL...
2025 if self.in_memory_mode
:
2027 for ns
in self.seqlist
: # BUILD EACH IntervalDB ONE BY ONE
2028 ntotal
= ntotal
+ ns
.buildInMemory
(**kwargs
)
2030 raise nlmsa_utils
.EmptyAlignmentError
('empty alignment!')
2032 self.buildFiles
(**kwargs
)
2035 def seqInterval
(self,int iseq
,int istart
,int istop
):
2036 'get specified interval in the target sequence'
2037 seq
=self.seqlist
.getSeq
(iseq
) # JUST THE SEQ OBJECT
2038 return sequence
.relativeSlice
(seq
,istart
,istop
)
2039 def __invert__
(self):
2040 if self.inverseDB
is not None
: # use the specified inverseDB
2041 return self.inverseDB
2042 elif self.is_bidirectional
: # provides mapping both directions
2045 raise ValueError('this mapping is not invertible')
2050 def dump_textfile
(pathstem
, outfilename
=None
):
2051 'dump NLMSA binary files to a text file'
2052 cdef int n
,nlmsaID
,nsID
,offset
,is_bidirectional
,pairwiseMode
,nprefix
2054 cdef char err_msg
[2048],tmp
[2048],seqDictID
[256]
2055 err_msg
[0] = 0 # ENSURE STRING IS EMPTY
2056 if outfilename
is None
:
2057 outfilename
= pathstem
+'.txt' # DEFAULT TEXTFILE NAME
2058 import classutil
,pickle
,sys
# NEED TO COPY THE WHOLE seqIDdict
2059 seqIDdict
= classutil
.open_shelve
(pathstem
+'.seqIDdict','r')
2061 seqDict
= nlmsa_utils
.read_seq_dict
(pathstem
)
2062 try: # OBTAIN PREFIX INFO FOR SEQDICT
2063 prefixDict
= seqDict
.prefixDict
2064 nprefix
= len(prefixDict
)
2065 strcpy
(seqDictID
,"None")
2066 except AttributeError: # NO PREFIXUNION. TRY TO GET ID OF seqDict
2070 strcpy
(seqDictID
,seqDict
._persistent_id
)
2071 except AttributeError:
2072 strcpy
(seqDictID
,"unknown")
2073 logger
.info
('''Warning: Because your seqDict has no worldbase ID, there
2074 is no host-independent way to save it to a textfile for transfer
2075 to another machine. Therefore, when loading this textfile
2076 on the destination machine, you will have to provide the
2077 seqDict argument to textfile_to_binaries() on the destination machine.''')
2079 ifile
= file
(pathstem
+'.attrDict', 'rb') # pickle is binary file!
2080 d
= pickle
.load
(ifile
)
2084 is_bidirectional
= d
.get
('is_bidirectional',-1)
2085 pairwiseMode
= d
.get
('pairwiseMode',-1)
2086 outfile
=fopen
(outfilename
,"w") # text file
2088 basestem
=os
.path
.basename
(pathstem
) # GET RID OF PATH INFO
2089 strcpy
(tmp
,basestem
) # COPY TO C STRING SO WE CAN fprintf
2091 raise IOError('unable to open file %s' %outfilename
)
2093 if fprintf
(outfile
,"PATHSTEM\t%s\t%d\t%d\t%d\t%d\t%s\n",tmp
,n
,
2094 is_bidirectional
,pairwiseMode
,nprefix
,seqDictID
)<0:
2095 raise IOError('error writing to file %s' %outfilename
)
2097 for id,d
in prefixDict
.items
(): # SAVE seqDict PREFIX ENTRIES
2098 strcpy
(tmp
,id) # CONVERT TO C DATA TYPES FOR fprintf
2100 strcpy
(seqDictID
,d
._persistent_id
) # try to get worldbase ID
2101 except AttributeError:
2102 strcpy
(seqDictID
,"None")
2105 logger
.info
('''Warning: Because one or more of the sequence
2106 databases in the seqDict have no worldbase ID, there is no
2107 host-independent way to save it to a textfile for transfer
2108 to another machine. Therefore, when loading this textfile
2109 on the destination machine, you will have to provide a dictionary
2110 for these sequence database(s) as the prefixDict argument
2111 to textfile_to_binaries() on the destination machine.''')
2112 if fprintf
(outfile
,"PREFIXUNION\t%s\t%s\n",tmp
,seqDictID
)<0:
2113 raise IOError('error writing to file %s' %outfilename
)
2114 for id,t
in seqIDdict
.iteritems
(): # SAVE seqIDdict
2115 strcpy
(tmp
,id) # CONVERT TO C DATA TYPES FOR fprintf
2119 if fprintf
(outfile
,"SEQID\t%s\t%d\t%d\t%d\n",tmp
,
2120 nlmsaID
,nsID
,offset
)<0:
2121 raise IOError('error writing to file %s' %outfilename
)
2123 ifile
= file
(pathstem
+'.NLMSAindex', 'rU') # text file
2125 ifile
= file
(pathstem
+'NLMSAindex', 'rU')
2130 for line
in ifile
: # NOW SAVE THE NLMSA DATA
2131 id,name
,is_union
,length
=line
.strip
().split
('\t')
2132 strcpy
(tmp
,line
) # COPY TO C STRING SO WE CAN fprintf
2133 if fprintf
(outfile
,"NLMSASequence\t%s",tmp
)<0:
2134 raise IOError('error writing file %s'%outfilename
)
2137 if save_text_file
(mypath
,mybase
,err_msg
,outfile
)!=0:
2138 raise IOError(err_msg
)
2144 def textfile_to_binaries
(filename
,seqDict
=None
,prefixDict
=None
,buildpath
=''):
2145 'convert pathstem.txt textfile to NLMSA binary files'
2146 cdef int i
,n
,nlmsaID
,nsID
,offset
,is_bidirectional
,pairwiseMode
,nprefix
2148 cdef char err_msg
[2048],line
[32768],tmp
[2048],basestem
[2048],seqDictID
[2048]
2149 if seqDict
is not None
:
2152 ignorePrefix
= False
2153 err_msg
[0]=0 # ENSURE STRING IS EMPTY
2154 infile
=fopen
(filename
,"r") # text file
2156 raise IOError('unable to open file %s' %filename
)
2158 if fgets
(line
,32767,infile
)==NULL
:
2159 raise IOError('error or EOF reading %s'%filename
)
2160 is_bidirectional
= -1 # INVALID INITIAL SETTING
2164 if 2>sscanf
(line
,"PATHSTEM\t%s\t%d\t%d\t%d\t%d\t%s",basestem
,&n
,
2165 &is_bidirectional
,&pairwiseMode
,&nprefix
,tmp
):
2166 raise IOError('bad format in %s'%filename
)
2167 if buildpath
!='': # USER-SPECIFIED PATH FOR BINARIES
2169 buildpath1
= os
.path
.join
(buildpath
,basestem
) # CONSTRUCT FILE PATH
2170 strcpy
(basestem
,buildpath1
) # COPY BACK TO C STRING USABLE IN C FUNCTIONS
2171 else: # JUST USE PATH IN CURRENT DIRECTORY
2172 buildpath1
= basestem
2173 if 0==strcmp
(tmp
,"unknown"):
2175 raise ValueError('You must provide a seqDict for this NLMSA!')
2176 elif 0!=strcmp
(tmp
,"None"): # try obtaining as worldbase ID
2177 from pygr
import worldbase
2178 seqDict
= worldbase
(tmp
)
2179 import classutil
,pickle
# CREATE THE seqIDdict
2180 seqIDdict
= classutil
.open_shelve
(basestem
+'.seqIDdict','n')
2181 IDdict
= classutil
.open_shelve
(basestem
+'.idDict','n')
2183 if is_bidirectional
!= -1:
2184 d
['is_bidirectional'] = is_bidirectional
2185 if pairwiseMode
!= -1:
2186 d
['pairwiseMode'] = pairwiseMode
2187 ifile
= file
(basestem
+'.attrDict',"wb") # pickle is binary file!
2189 pickle
.dump
(d
,ifile
)
2192 if prefixDict
is None
or ignorePrefix
:
2195 for i
from 0 <= i
< nprefix
: # READ seqDICT PREFIX ENTRIES
2196 if fgets
(line
,32767,infile
)==NULL
:
2197 raise IOError('error or EOF reading %s'%filename
)
2198 if 2!=sscanf
(line
,"PREFIXUNION\t%s\t%s",tmp
,seqDictID
):
2199 raise IOError('bad format in %s'%filename
)
2200 if ignorePrefix
: # JUST IGNORE THE PREFIX INFO WE READ
2202 if 0==strcmp
(seqDictID
,"None"):
2203 if tmp
not in prefixDict
:
2204 missing
.append
(tmp
) # MISSING A SEQDICT DICTIONARY ENTRY!
2205 else: # load it from worldbase
2206 from pygr
import worldbase
2207 prefixDict
[tmp
] = worldbase
(seqDictID
)
2209 raise KeyError('''You must supply sequence database(s) for the
2210 following prefixes, by passing them in the prefixDict optional
2211 dictionary argument: %s''' % missing
)
2212 if len(prefixDict
)>0: # CREATE A PREFIX UNION
2214 seqDict
= seqdb
.PrefixUnionDict
(prefixDict
)
2215 nlmsa_utils
.save_seq_dict
(basestem
,seqDict
) # SAVE SEQDICT
2216 for i
from 0 <= i
<n
: # seqIDDict READING
2217 if fgets
(line
,32767,infile
)==NULL
:
2218 raise IOError('error or EOF reading %s'%filename
)
2219 if 4!=sscanf
(line
,"SEQID\t%s\t%d %d %d",tmp
,
2220 &nlmsaID
,&nsID
,&offset
):
2221 raise IOError('bad format in %s'%filename
)
2222 seqIDdict
[tmp
]=(nlmsaID
,nsID
,offset
) # SAVE THIS ENTRY
2223 IDdict
[str(nlmsaID
)]=(tmp
,nsID
)
2224 seqIDdict
.close
() # DONE WRITING THE seqIDdict
2225 IDdict
.close
() # DONE WRITING THE seqIDdict
2228 if buildpath
!='': # USER-SPECIFIED PATH FOR BINARIES
2230 buildpath2
= os
.path
.join
(buildpath
,'') # ENSURE THIS ENDS IN DIRECTORY SEPARATOR
2231 strcpy
(basestem
,buildpath2
) # COPY BACK TO C STRING USABLE IN C FUNCTIONS
2233 strcpy
(basestem
,'') # JUST USE BLANK STRING TO SAVE IN CURRENT DIRECTORY
2235 while fgets
(line
,32767,infile
)!=NULL
:
2236 s
=line
# CONVERT STRING TO PYTHON OBJECT
2237 if not s
.startswith
('NLMSASequence'):
2238 raise IOError('bad format in file %s'%filename
)
2239 NLMSAindexText
= NLMSAindexText
+ s
[14:] # JUST SAVE THE DATA FIELDS
2240 logger
.info
('Saving NLMSA binary index: ' + s
[14:] + '...')
2241 if text_file_to_binaries
(infile
,basestem
,err_msg
)<0:
2242 raise IOError(err_msg
)
2243 ifile
= file
(buildpath1
+'.NLMSAindex',"w") # text file
2244 ifile
.write
(NLMSAindexText
) # LAST, WRITE TOP INDEX FILE
2248 return buildpath1
# ACTUAL PATH TO NLMSA INDEX FILESET