added SQLTable pickle test
[pygr.git] / pygr / cnestedlist.pyx
blob1e2d96e7daa65e54f614e04e0e3cde58fcb495e1
1 import sequence
2 import nlmsa_utils
3 import logger
5 cdef class IntervalDBIterator:
6 def __new__(self,int start,int end,IntervalDB db not None):
7 self.it=interval_iterator_alloc()
8 self.it_alloc=self.it
9 self.start=start
10 self.end=end
11 self.db=db
13 def __iter__(self):
14 return self
16 cdef int cnext(self): # C VERSION OF ITERATOR next METHOD RETURNS INDEX
17 cdef int i
18 if self.ihit>=self.nhit: # TRY TO GET ONE MORE BUFFER CHUNK OF HITS
19 if self.it==NULL: # ITERATOR IS EXHAUSTED
20 return -1
21 find_intervals(self.it,self.start,self.end,self.db.im,self.db.ntop,
22 self.db.subheader,self.db.nlists,self.im_buf,1024,
23 &(self.nhit),&(self.it)) # GET NEXT BUFFER CHUNK
24 self.ihit=0 # START ITERATING FROM START OF BUFFER
25 if self.ihit<self.nhit: # RETURN NEXT ITEM FROM BUFFER
26 i=self.ihit
27 self.ihit = self.ihit+1 # ADVANCE THE BUFFER COUNTER
28 return i
29 else: # BUFFER WAS EMPTY, NO HITS TO ITERATE OVER...
30 return -1
32 # PYTHON VERSION OF next RETURNS HIT AS A TUPLE
33 def __next__(self): # PYREX USES THIS NON-STANDARD NAME INSTEAD OF next()!!!
34 cdef int i
35 i=self.cnext()
36 if i>=0:
37 return (self.im_buf[i].start,self.im_buf[i].end,self.im_buf[i].target_id,
38 self.im_buf[i].target_start,self.im_buf[i].target_end)
39 else:
40 raise StopIteration
42 def __dealloc__(self):
43 'remember: dealloc cannot call other methods!'
44 free_interval_iterator(self.it_alloc)
49 cdef class IntervalDB:
50 def __new__(self,filename='noname',nsize=0,**kwargs):
51 cdef int i
52 cdef FILE *ifile
53 self.n=nsize
54 if nsize>0:
55 ifile=fopen(filename,"r") # text file, one interval per line
56 if ifile:
57 self.im=read_intervals(self.n,ifile)
58 fclose(ifile)
59 if self.im!=NULL:
60 self.runBuildMethod(**kwargs)
61 else:
62 msg='could not open file %s' % filename
63 raise IOError(msg)
65 def save_tuples(self,l,**kwargs):
66 'build in-memory NLMSA from list of alignment tuples'
67 cdef int i
68 self.close() # DUMP OUR EXISTING MEMORY
69 self.n=len(l)
70 self.im=interval_map_alloc(self.n)
71 if self.im==NULL:
72 raise MemoryError('unable to allocate IntervalMap[%d]' % self.n)
73 i=0
74 for t in l:
75 self.im[i].start=t[0]
76 self.im[i].end=t[1]
77 self.im[i].target_id=t[2]
78 self.im[i].target_start=t[3]
79 self.im[i].target_end=t[4]
80 self.im[i].sublist= -1
81 i=i+1
82 self.runBuildMethod(**kwargs)
84 def runBuildMethod(self,buildInPlace=True):
85 'build either in-place or using older build method'
86 if buildInPlace:
87 self.subheader=build_nested_list_inplace(self.im,self.n,&(self.ntop),&(self.nlists))
88 else:
89 self.subheader=build_nested_list(self.im,self.n,&(self.ntop),&(self.nlists))
91 def buildFromUnsortedFile(self,filename,int n,**kwargs):
92 'load unsorted binary data, and build nested list'
93 cdef FILE *ifile
94 cdef int i
95 cdef IntervalMap *im_new
96 self.close()
97 ifile=fopen(filename,'rb') # binary file
98 if ifile==NULL:
99 raise IOError('unable to open '+filename)
100 im_new=interval_map_alloc(n)
101 if im_new==NULL:
102 raise MemoryError('unable to allocate IntervalMap[%d]' % n)
103 i=read_imdiv(ifile,im_new,n,0,n)
104 fclose(ifile)
105 if i!=n:
106 raise IOError('IntervalMap file corrupted?')
107 self.n=n
108 self.im=im_new
109 self.runBuildMethod(**kwargs)
111 def find_overlap(self,int start,int end):
112 self.check_nonempty() # RAISE EXCEPTION IF NO DATA
113 return IntervalDBIterator(start,end,self)
115 def find_overlap_list(self,int start,int end):
116 cdef int i,nhit
117 cdef IntervalIterator *it,*it_alloc
118 cdef IntervalMap im_buf[1024]
119 self.check_nonempty() # RAISE EXCEPTION IF NO DATA
120 it=interval_iterator_alloc()
121 it_alloc=it
122 l=[] # LIST OF RESULTS TO HAND BACK
123 while it:
124 find_intervals(it,start,end,self.im,self.ntop,
125 self.subheader,self.nlists,im_buf,1024,
126 &(nhit),&(it)) # GET NEXT BUFFER CHUNK
127 for i from 0 <= i < nhit:
128 l.append((im_buf[i].start,im_buf[i].end,im_buf[i].target_id,im_buf[i].target_start,im_buf[i].target_end))
129 free_interval_iterator(it_alloc)
130 return l
132 def check_nonempty(self):
133 if self.im:
134 return True
135 else:
136 msg='empty IntervalDB, not searchable!'
137 raise IndexError(msg)
139 def write_binaries(self,filestem,div=256):
140 cdef char *err_msg
141 err_msg=write_binary_files(self.im,self.n,self.ntop,div,
142 self.subheader,self.nlists,filestem)
143 if err_msg:
144 raise IOError(err_msg)
146 def __dealloc__(self):
147 'remember: dealloc cannot call other methods!'
148 if self.subheader:
149 free(self.subheader)
150 if self.im:
151 free(self.im)
153 def close(self):
154 if self.subheader:
155 free(self.subheader)
156 if self.im:
157 free(self.im)
158 self.subheader=NULL
159 self.im=NULL
161 return None
165 cdef class IntervalFileDBIterator:
166 def __new__(self,int start,int end,IntervalFileDB db=None,
167 NLMSASequence ns=None,
168 int nbuffer=1024,rawIvals=None):
169 cdef int i
170 self.it_alloc=interval_iterator_alloc()
171 self.restart(start, end, db, ns)
172 if rawIvals is not None and len(rawIvals)>nbuffer:
173 nbuffer=len(rawIvals)
174 self.im_buf=interval_map_alloc(nbuffer)
175 self.nbuf=nbuffer
176 if rawIvals is not None:
178 for ival in rawIvals:
179 self.im_buf[i].start=ival[0] # SAVE INTERVAL INFO
180 self.im_buf[i].end=ival[1]
181 self.im_buf[i].target_id=ival[2]
182 self.im_buf[i].target_start=ival[3]
183 self.im_buf[i].target_end=ival[4]
184 i=i+1
185 self.nhit=i # TOTAL NUMBER OF INTERVALS STORED
187 cdef int restart(self,int start,int end,IntervalFileDB db,
188 NLMSASequence ns) except -2:
189 'reuse this iterator for another search without reallocing memory'
190 self.nhit=0 # FLUSH ANY EXISTING DATA
191 self.start=start
192 self.end=end
193 self.db=db
194 if ns is not None:
195 if ns.idb is not None:
196 self.idb=ns.idb
197 elif ns.db is None:
198 ns.forceLoad()
199 self.db=ns.db
200 self.it=self.it_alloc # REUSE OUR CURRENT ITERATOR
201 reset_interval_iterator(self.it) # RESET IT FOR REUSE
202 return 0
204 cdef int reset(self) except -2:
205 'flush the buffer so we can reuse this iterator'
206 self.nhit=0
207 return 0
209 cdef int extend(self,int ikeep):
210 'expand the buffer if necessary, keeping elements [ikeep:nbuf]'
211 cdef int length,istart
212 cdef IntervalMap *new_buf
213 istart=self.nbuf-ikeep
214 length=sizeof(IntervalMap)*istart # #BYTES WE MUST KEEP
215 if ikeep>0 and length>0: # SHIFT [ikeep:] SLICE OF BUFFER TO [0:istart]
216 memmove(self.im_buf,self.im_buf+ikeep,length)
217 if ikeep<8: # RUNNING OUT OF ROOM, SO DOUBLE OUR BUFFER
218 new_buf=<IntervalMap *>realloc(self.im_buf,
219 sizeof(IntervalMap)*2*self.nbuf)
220 if new_buf==NULL:
221 raise MemoryError('out of memory')
222 self.im_buf=new_buf
223 self.nbuf=2*self.nbuf
224 return istart # RETURN START OF EMPTY BLOCK WHERE WE CAN ADD NEW DATA
226 cdef int saveInterval(self,int start,int end,int target_id,
227 int target_start,int target_end):
228 'save an interval, expanding array if necessary'
229 cdef int i
230 if self.nhit>=self.nbuf: # EXPAND ARRAY IF NECESSARY
231 self.extend(0)
232 i=self.nhit
233 self.im_buf[i].start=start # SAVE INTERVAL INFO
234 self.im_buf[i].end=end
235 self.im_buf[i].target_id=target_id
236 self.im_buf[i].target_start=target_start
237 self.im_buf[i].target_end=target_end
238 self.nhit = i+1
239 return self.nhit
241 cdef int nextBlock(self,int *pkeep) except -2:
242 'load one more block of overlapping intervals'
243 cdef int i
244 if self.it==NULL: # ITERATOR IS EXHAUSTED
245 return -1
246 if pkeep and pkeep[0]>=0 and pkeep[0]<self.nhit: #MUST KEEP [ikeep:] SLICE
247 i=self.extend(pkeep[0]) # MOVE SLICE TO THE FRONT
248 else: # WE CAN USE THE WHOLE BUFFER
250 if self.db is not None: # ON-DISK DATABASE
251 find_file_intervals(self.it,self.start,self.end,
252 self.db.db[0].ii,self.db.db[0].nii,
253 self.db.db[0].subheader,self.db.db[0].nlists,
254 &(self.db.db[0].subheader_file),
255 self.db.db[0].ntop,self.db.db[0].div,
256 self.db.db[0].ifile_idb,
257 self.im_buf+i,self.nbuf-i,
258 &(self.nhit),&(self.it)) # GET NEXT BUFFER CHUNK
259 elif self.idb is not None: # IN-MEMORY DATABASE
260 find_intervals(self.it,self.start,self.end,self.idb.im,self.idb.ntop,
261 self.idb.subheader,self.idb.nlists,self.im_buf+i,self.nbuf-i,
262 &(self.nhit),&(self.it)) # GET NEXT BUFFER CHUNK
263 else:
264 raise IOError('Iterator has no database! Please provide a db argument.')
265 self.nhit=self.nhit+i # TOTAL #HITS IN THE BUFFER
266 self.ihit=i # START ITERATING FROM START OF NEW HITS
267 if pkeep and pkeep[0]>=0: # RESET ikeep INDEX TO START OF BUFFER
268 pkeep[0]=0
269 return self.nhit-self.ihit # RETURN #NEW HITS IN NEXT BLOCK
271 cdef IntervalMap *getIntervalMap(self):
272 '''return the IntervalMap array loaded by iterator,
273 and release it from iterator. User must free the array!'''
274 cdef int len
275 cdef IntervalMap *im
276 if self.nhit==0: # NO HITS
277 return NULL
278 elif self.nhit<self.nbuf: # LARGER BUFFER THAN WE ACTUALLY NEED
279 len=sizeof(IntervalMap)*self.nhit # COMPUTE FINAL SIZE
280 im=<IntervalMap *>realloc(self.im_buf,len) # COMPACT TO FINAL SIZE
281 else: # JUST HAND BACK OUR FULL BUFFER
282 im=self.im_buf
283 self.im_buf=NULL # RELEASE THIS STORAGE FROM ITERATOR; USER MUST FREE IT!
284 return im # HAND BACK THE STORAGE
286 cdef int loadAll(self) except -1:
287 'load all overlapping interval hits, return count of hits'
288 cdef int len,ikeep
289 len=1
290 ikeep=0 # DON'T LET extend DISCARD ANY HITS, KEEP THEM ALL!
291 while len>0: # LOAD BLOCKS UNTIL NO MORE...
292 len=self.nextBlock(&ikeep) # LOAD ANOTHER BLOCK OF INTERVALS
293 return self.nhit
295 cdef int cnext(self,int *pkeep): # C VERSION OF ITERATOR next METHOD
296 'get one more overlapping interval'
297 cdef int i
298 if self.ihit>=self.nhit: # TRY TO GET ONE MORE BUFFER CHUNK OF HITS
299 self.nextBlock(pkeep) # LOAD THE NEXT BLOCK IF ANY
300 if self.ihit<self.nhit: # RETURN NEXT ITEM FROM BUFFER
301 i=self.ihit
302 self.ihit = self.ihit+1 # ADVANCE THE BUFFER COUNTER
303 return i
304 else: # BUFFER WAS EMPTY, NO HITS TO ITERATE OVER...
305 return -1
307 cdef int copy(self,IntervalFileDBIterator src):
308 'copy items from src to this iterator buffer'
309 cdef IntervalMap *new_buf
310 if src is None:
311 raise ValueError('src is None! Debug!!')
312 if src.nhit>self.nbuf: # NEED TO EXPAND OUR BUFFER
313 new_buf=<IntervalMap *>realloc(self.im_buf,src.nhit*sizeof(IntervalMap))
314 if new_buf==NULL:
315 raise MemoryError('out of memory')
316 self.im_buf=new_buf # RECORD NEW BUFFER LOCATION AND SIZE
317 self.nbuf=src.nhit
318 self.nhit=src.nhit # COPY ARRAY AND SET CORRECT SIZE
319 if src.nhit>0: # ONLY COPY IF NON-EMPTY
320 memcpy(self.im_buf,src.im_buf,src.nhit*sizeof(IntervalMap))
321 return 0
323 def mergeSeq(self):
324 'merge intervals into single interval per sequence orientation'
325 cdef int i,j,n,id
326 if self.nhit<=0: # NOTHING TO MERGE, SO JUST RETURN
327 return 0
328 qsort(self.im_buf,self.nhit,sizeof(IntervalMap),target_qsort_cmp) # ORDER BY id,start
330 id= -1
331 for i from 0 <= i < self.nhit:
332 if self.im_buf[i].target_id!=id or (self.im_buf[j].target_start<0 and
333 self.im_buf[i].target_start>=0):
334 if id>=0: # WE NEED TO SAVE PREVIOUS INTERVAL
335 if n<j: # COPY MERGED INTERVAL TO COMPACTED LOCATION
336 memcpy(self.im_buf+n,self.im_buf+j,sizeof(IntervalMap))
337 n=n+1
338 j=i # RECORD THIS AS START OF THE NEW SEQUENCE / ORIENTATION
339 id=self.im_buf[i].target_id
340 elif self.im_buf[i].target_end>self.im_buf[j].target_end:
341 self.im_buf[j].target_end=self.im_buf[i].target_end # EXPAND THIS INTERVAL
342 self.im_buf[j].end=self.im_buf[i].end # COPY SOURCE SEQ COORDS AS WELL
343 if n<j: # COPY LAST MERGED INTERVAL TO COMPACTED LOCATION
344 memcpy(self.im_buf+n,self.im_buf+j,sizeof(IntervalMap))
345 self.nhit=n+1 # TOTAL #MERGED INTERVALS
346 return self.nhit
348 def __iter__(self):
349 return self
351 # PYTHON VERSION OF next RETURNS HIT AS A TUPLE
352 def __next__(self): # PYREX USES THIS NON-STANDARD NAME INSTEAD OF next()!!!
353 cdef int i
354 i=self.cnext(NULL)
355 if i>=0:
356 return (self.im_buf[i].start,self.im_buf[i].end,self.im_buf[i].target_id,
357 self.im_buf[i].target_start,self.im_buf[i].target_end)
358 else:
359 raise StopIteration
361 def __dealloc__(self):
362 'remember: dealloc cannot call other methods!'
363 free_interval_iterator(self.it_alloc)
364 if self.im_buf:
365 free(self.im_buf)
371 cdef class IntervalFileDB:
372 def __new__(self,filestem=None,mode='r'):
373 if filestem is not None and mode=='r':
374 self.open(filestem)
376 def open(self,filestem):
377 cdef char err_msg[1024]
378 self.db=read_binary_files(filestem,err_msg,1024)
379 if self.db==NULL:
380 raise IOError(err_msg)
382 def find_overlap(self,int start,int end):
383 self.check_nonempty() # RAISE EXCEPTION IF NO DATA
384 return IntervalFileDBIterator(start,end,self)
386 def find_overlap_list(self,int start,int end):
387 cdef int i,nhit
388 cdef IntervalIterator *it,*it_alloc
389 cdef IntervalMap im_buf[1024]
390 self.check_nonempty() # RAISE EXCEPTION IF NO DATA
391 it=interval_iterator_alloc()
392 it_alloc=it
393 l=[] # LIST OF RESULTS TO HAND BACK
394 while it:
395 find_file_intervals(it,start,end,self.db[0].ii,self.db[0].nii,
396 self.db[0].subheader,self.db[0].nlists,
397 &(self.db[0].subheader_file),
398 self.db[0].ntop,self.db[0].div,
399 self.db[0].ifile_idb,im_buf,1024,
400 &(nhit),&(it)) # GET NEXT BUFFER CHUNK
401 for i from 0 <= i < nhit:
402 l.append((im_buf[i].start,im_buf[i].end,im_buf[i].target_id,
403 im_buf[i].target_start,im_buf[i].target_end))
404 free_interval_iterator(it_alloc)
405 return l
407 def check_nonempty(self):
408 if self.db==NULL:
409 raise IndexError('empty IntervalFileDB, not searchable!')
411 def close(self):
412 if self.db:
413 free_interval_dbfile(self.db)
414 self.db=NULL
416 def __dealloc__(self):
417 'remember: dealloc cannot call other methods!'
418 if self.db:
419 free_interval_dbfile(self.db)
424 cdef class NLMSASliceLetters:
425 'graph interface to letter graph within this region'
426 def __new__(self,NLMSASlice nlmsaSlice):
427 self.nlmsaSlice=nlmsaSlice
428 def __iter__(self):
429 return NLMSASliceIterator(self.nlmsaSlice)
430 def __getitem__(self,NLMSANode node):
431 return node.nodeEdges()
432 def items(self):
433 'list of tuples (node,{target_node:edge})'
434 l=[]
435 for node in self:
436 l.append((node,node.nodeEdges()))
437 return l
438 def iteritems(self):
439 return iter(self.items())
444 cdef class NLMSASlice:
445 def __new__(self,NLMSASequence ns not None,int start,int stop,
446 int id= -1,int offset=0,seq=None):
447 cdef int i,j,n,start_max,end_min,start2,stop2,nseq,istart,istop,localQuery
448 cdef NLMSASequence ns_lpo
449 cdef IntervalFileDBIterator it,it2
450 cdef IntervalMap *im,*im2
451 cdef int cacheMax
453 if seq is None: # GET FROM NLMSASequence
454 seq=ns.seq
455 self.nlmsaSequence=ns # SAVE BASIC INFO
456 self.nlmsa = ns.nlmsaLetters
457 self.start=start
458 self.stop=stop
459 self.offset=offset # ALWAYS STORE offset IN POSITIVE ORIENTATION
460 self.seq=seq
461 try: # USE PYTHON METHOD TO DO QUERY
462 id,ivals=ns.nlmsaLetters.doSlice(seq) # doSlice() RETURNS RAW INTERVALS
463 self.id=id # SAVE OUR SEQUENCE'S nlmsa_id
464 it=IntervalFileDBIterator(start,stop,rawIvals=ivals) # STORE IN BINARY FMT
465 it2=IntervalFileDBIterator(start,stop) # HOLDER FOR SUBSEQUENT MERGE
466 localQuery=0 # DO NOT PERFORM LOCAL QUERY CODE BELOW!!
467 except AttributeError:
468 localQuery=1
469 if localQuery: ################################## PERFORM LOCAL QUERY
470 if id<0:
471 id=ns.id
472 self.id=id
473 it2=None
474 if start<0: # NEED TO TRANSLATE OFFSETS TO MINUS ORIENTATION
475 offset= -offset
476 if ns.nlmsaLetters.pairwiseMode==1: # TRANSLATE SEQ DIRECTLY TO LPO
477 it=IntervalFileDBIterator(start,stop,rawIvals=((start,stop,ns.id-1,
478 start+offset,stop+offset),))
479 n=1 # JUST THE SINGLE IDENTITY MAPPING FROM SEQ TO LPO
480 else: # PERFORM NORMAL SEQ --> LPO QUERY
481 it=IntervalFileDBIterator(start+offset,stop+offset,ns=ns)
482 n=it.loadAll() # GET ALL OVERLAPPING INTERVALS
483 if n<=0:
484 raise nlmsa_utils.EmptySliceError('this interval is not aligned!')
485 for i from 0 <= i < n: # CLIP INTERVALS TO FIT [start:stop]
486 it.im_buf[i].start=it.im_buf[i].start-offset # XLATE TO SRC SEQ COORDS
487 it.im_buf[i].end=it.im_buf[i].end-offset
488 if stop<it.im_buf[i].end: # TRUNCATE TO FIT WITHIN [start:stop]
489 it.im_buf[i].target_end= it.im_buf[i].target_end \
490 +stop-it.im_buf[i].end # CALCULATE NEW ENDPOINT
491 it.im_buf[i].end=stop
492 if start>it.im_buf[i].start: # CALCULATE NEW STARTPOINT
493 it.im_buf[i].target_start=it.im_buf[i].target_start \
494 +start-it.im_buf[i].start
495 it.im_buf[i].start=start
497 if ns.is_lpo: # TARGET INTERVALS MUST BE LPO, MUST MAP TO REAL SEQUENCES
498 it2=IntervalFileDBIterator(start,stop) # HOLDER FOR SUBSEQUENT MERGE
499 else:
500 ns_lpo=ns.nlmsaLetters.seqlist[ns.nlmsaLetters.lpo_id] # DEFAULT LPO
501 for i from 0 <= i < n:
502 if it.im_buf[i].target_id != ns_lpo.id: # SWITCHING TO A DIFFERENT LPO?
503 ns_lpo=ns.nlmsaLetters.seqlist[it.im_buf[i].target_id]
504 if not ns_lpo.is_lpo:
505 raise ValueError('sequence mapped to non-LPO target??')
506 if it2 is None: # NEED TO ALLOCATE NEW ITERATOR
507 it2=IntervalFileDBIterator(it.im_buf[i].target_start,
508 it.im_buf[i].target_end,ns=ns_lpo)
509 else: # JUST REUSE THIS ITERATOR WITHOUT REALLOCING MEMORY
510 it2.restart(it.im_buf[i].target_start,
511 it.im_buf[i].target_end,None,ns_lpo)
512 it2.loadAll() # GET ALL OVERLAPPING INTERVALS
513 if it2.nhit<=0: # NO HITS, SO TRY THE NEXT INTERVAL???
514 continue
515 im2=it2.im_buf # ARRAY FROM THIS ITERATOR
516 for j from 0 <= j < it2.nhit: # MAP EACH INTERVAL BACK TO ns
517 if it.im_buf[i].target_start>im2[j].start: # GET INTERSECTION INTERVAL
518 start_max=it.im_buf[i].target_start
519 else:
520 start_max=im2[j].start
521 if it.im_buf[i].target_end<im2[j].end:
522 end_min=it.im_buf[i].target_end
523 else:
524 end_min=im2[j].end
525 istart=it.im_buf[i].start+start_max-it.im_buf[i].target_start # SRC COORDS
526 istop=it.im_buf[i].start+end_min-it.im_buf[i].target_start
527 start2=im2[j].target_start+start_max-im2[j].start # COORDS IN TARGET
528 stop2=im2[j].target_start+end_min-im2[j].start
529 if im2[j].target_id!=id or istart!=start2 \
530 or ns.nlmsaLetters.pairwiseMode==1: # DISCARD SELF-MATCH
531 it.saveInterval(istart,istop,im2[j].target_id,start2,stop2) # SAVE IT!
532 assert ns_lpo.id!=im2[j].target_id
534 if it.nhit<=0:
535 raise nlmsa_utils.EmptySliceError('this interval is not aligned!')
536 it2.copy(it) # COPY FULL SET OF SAVED INTERVALS
537 self.nseqBounds=it2.mergeSeq() # MERGE TO ONE INTERVAL PER SEQUENCE ORIENTATION
538 self.seqBounds=it2.getIntervalMap() # SAVE SORTED ARRAY & DETACH FROM ITERATOR
540 self.im=it.getIntervalMap() # RELEASE THIS ARRAY FROM THE ITERATOR
541 self.n=it.nhit # TOTAL #INTERVALS SAVED FROM JOIN
542 qsort(self.im,self.n,sizeof(IntervalMap),imstart_qsort_cmp) # ORDER BY start
545 for i from 0 <= i < self.nseqBounds: # COUNT NON-LPO SEQUENCES
546 if not ns.nlmsaLetters.seqlist.is_lpo(self.seqBounds[i].target_id):
547 n=n+1
548 self.nrealseq=n # SAVE THE COUNT
550 try: # _cache_max=0 TURNS OFF CACHING...
551 cacheMax=ns.nlmsaLetters.seqDict._cache_max
552 except AttributeError:
553 cacheMax=1 # ALLOW CACHING...
554 try: # SAVE OUR COVERING INTERVALS AS CACHE HINTS IF POSSIBLE...
555 saveCache=ns.nlmsaLetters.seqDict.cacheHint
556 except AttributeError:
557 cacheMax=0 # TURN OFF CACHING
558 if cacheMax>0: # CONSTRUCT & SAVE DICT OF CACHE HINTS: COVERING INTERVALS
559 cacheDict={}
560 if seq is not None:
561 try: # ADD A CACHE HINT FOR QUERY SEQ IVAL
562 seqID=ns.nlmsaLetters.seqs.getSeqID(seq) # GET FULL-LENGTH ID
563 cacheDict[seqID]=(self.start,self.stop)
564 except KeyError:
565 pass
566 for i from 0 <= i < self.nseqBounds: # ONLY SAVE NON-LPO SEQUENCES
567 if not ns.nlmsaLetters.seqlist.is_lpo(self.seqBounds[i].target_id):
568 cacheDict[ns.nlmsaLetters.seqlist.getSeqID(self.seqBounds[i].target_id)]=(self.seqBounds[i].target_start,self.seqBounds[i].target_end)
570 if cacheDict:
571 self.weakestLink = nlmsa_utils.SeqCacheOwner()
572 saveCache(cacheDict, self.weakestLink) # SAVE COVERING IVALS AS CACHE HINT
574 def __hash__(self):
575 return id(self)
577 def __repr__(self):
578 return "<NLMSASlice object at 0x%x (seq=%s)>" % (id(self), self.seq.id,)
580 def __dealloc__(self):
581 'remember: dealloc cannot call other methods!'
582 if self.im:
583 free(self.im)
584 self.im=NULL
585 if self.seqBounds:
586 free(self.seqBounds)
587 self.seqBounds=NULL
589 cdef object get_seq_interval(self, NLMSA nl, int targetID,
590 int start, int stop):
591 'get seq interval and ensure cache owner keeps it in the cache'
592 if start<stop:
593 ival = nl.seqInterval(targetID, start, stop)
594 else: # GET THE SEQUENCE OBJECT
595 ival = nl.seqlist.getSeq(targetID)
596 try: # tell owner to keep it in the cache
597 self.weakestLink.cache_reference(ival.pathForward)
598 except AttributeError:
599 pass
600 return ival
602 ########################################### ITERATOR METHODS
603 def edges(self,mergeAll=False,**kwargs):
604 'get list of tuples (srcIval,destIval,edge) aligned in this slice'
605 seqIntervals=self.groupByIntervals(mergeAll=mergeAll,**kwargs)
606 ivals=self.groupBySequences(seqIntervals,**kwargs)
607 l=[]
608 for ival1,ival2,mergeIntervals in ivals:
609 l.append((ival1,ival2,sequence.Seq2SeqEdge(self,ival2,ival1,mergeIntervals)))
610 return l
611 def items(self,**kwargs):
612 'get list of tuples (ival2,edge) aligned to this slice'
613 l=[]
614 for ival1,ival2,edge in self.edges(**kwargs):
615 l.append((ival2,edge))
616 return l
617 def iteritems(self,**kwargs):
618 return iter(self.items(**kwargs))
619 def keys(self,mergeAll=False,**kwargs):
620 'get list of intervals aligned to this slice according to groupBy options'
621 seqIntervals=self.groupByIntervals(mergeAll=mergeAll,**kwargs)
622 ivals=self.groupBySequences(seqIntervals,**kwargs)
623 l=[]
624 for ival1,ival2,mergeIntervals in ivals:
625 l.append(ival2)
626 return l
627 def __iter__(self): # PYREX DOESNT ALLOW ARGS TO __iter__ !
628 return iter(self.keys())
629 def __getitem__(self,k):
630 return sequence.Seq2SeqEdge(self,k)
631 def __setitem__(self,k,v):
632 raise ValueError('''this NLMSA is read-only! Currently, you cannot add new
633 alignment intervals to an NLMSA after calling its build() method.''')
634 def __len__(self):
635 return self.nrealseq # NUMBER OF NON-LPO SEQUENCE/ORIS ALIGNED HERE
638 ##################################### 1:1 INTERVAL METHODS
639 def matchIntervals(self,seq=None):
640 '''get all 1:1 match intervals in this region of alignment
641 as list of tuples. if seq argument not None, only match intervals
642 for that sequence will be included. No clipping is performed.'''
643 cdef int i,target_id
644 cdef NLMSA nl
645 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
646 if seq is not None:
647 target_id=nl.seqs.getID(seq) # CHECK IF IN OUR ALIGNMENT
648 else:
649 target_id= -1
650 l=[]
651 for i from 0 <= i <self.n: # GET ALL STORED INTERVALS
652 if not nl.seqlist.is_lpo(self.im[i].target_id) and \
653 (target_id<0 or self.im[i].target_id==target_id):
654 ival2=nl.seqInterval(self.im[i].target_id,self.im[i].target_start,
655 self.im[i].target_end)
656 if seq is None or ival2.orientation==seq.orientation:
657 ival1=sequence.absoluteSlice(self.seq,
658 self.im[i].start,self.im[i].end)
659 l.append((ival1,ival2)) # SAVE THE INTERVAL MATCH
660 return l
662 ############################## MAXIMUM INTERVAL METHODS
663 cdef int findSeqBounds(self,int id,int ori):
664 'find the specified sequence / orientation using binary search'
665 cdef int left,right,mid
666 left=0
667 right=self.nseqBounds
668 while left<right:
669 mid=(left+right)/2
670 if self.seqBounds[mid].target_id<id:
671 left=mid+1
672 elif self.seqBounds[mid].target_id>id:
673 right=mid
674 elif ori>0 and seqBounds[mid].target_start<0:
675 left=mid+1
676 elif ori<0 and seqBounds[mid].target_start>=0:
677 right=mid
678 else: # MATCHES BOTH id AND ori
679 return mid
680 return -1 # FAILED TO FIND id,ori MATCH
682 def findSeqEnds(self,seq):
683 'get maximum interval of seq aligned in this interval'
684 cdef int i,id
685 cdef NLMSA nl
686 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
687 id=nl.seqs.getID(seq) # CHECK IF IN OUR ALIGNMENT
688 i=self.findSeqBounds(id,seq.orientation) # FIND THIS id,ORIENTATION
689 if i<0: # NOT FOUND!
690 raise KeyError('seq not aligned in this interval')
691 return self.get_seq_interval(nl, self.seqBounds[i].target_id,
692 self.seqBounds[i].target_start,
693 self.seqBounds[i].target_end)
695 def generateSeqEnds(self):
696 'get list of tuples (ival1,ival2,edge)'
697 cdef int i
698 cdef NLMSA nl
699 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
700 l=[]
701 for i from 0 <= i <self.nseqBounds:
702 if nl.seqlist.is_lpo(self.seqBounds[i].target_id):
703 continue # DON'T RETURN EDGES TO LPO
704 #ival1=sequence.absoluteSlice(self.seq,self.seqBounds[i].start,
705 # self.seqBounds[i].end)
706 ival2 = self.get_seq_interval(nl, self.seqBounds[i].target_id,
707 self.seqBounds[i].target_start,
708 self.seqBounds[i].target_end)
709 #l.append((ival1,ival2,sequence.Seq2SeqEdge(self,ival2,ival1)))
710 edge = self[ival2] # LET edge FIGURE OUT sourcePath FOR US
711 l.append((edge.sourcePath,ival2,edge))
712 return l
714 ############################################## GROUP-BY METHODS
715 def groupByIntervals(self,int maxgap=0,int maxinsert=0,
716 int mininsert= 0,filterSeqs=None,filterList=None,
717 mergeMost=False,maxsize=500000000,
718 mergeAll=True,ivalMethod=None,**kwargs):
719 '''merge alignment intervals using "horizontal" group-by rules:
720 - maxgap (=0): longest gap allowed within a region
721 - maxinsert (=0): longest insert allowed within a region
722 - mininsert (=0): should be 0, to prevent cycles within a region
723 use negative values to allow some overlap / cycles.
724 - maxsize: upper bound on maximum size for interval merging
725 - mergeMost: merge, but with limits (10000,10000,-10,50000)
726 - mergeAll: merge everything without any limits
727 - filterSeqs (=None): dict of sequences to apply these rules to;
728 other sequences alignment will be ignored if
729 filterSeqs not None. Slower than the filterList option.
730 - filterList (=None): list of sequence intervals to mask the
731 result set by. Note: a single sequence should not be
732 represented by more than one interval in filterList, as only
733 one interval for each sequence will be used as the clipping region.
734 Significantly faster than filterSeqs.
735 - ivalMethod: a function to process the list of intervals
736 for each sequence (it can merge or split them in any way
737 it wants)
738 - pAlignedMin: a fractional minimum alignment threshold e.g. (0.9)
739 - pIdentityMin: a fractional minimum identity threshold e.g. (0.9)
741 cdef int i,j,n,gap,insert,targetStart,targetEnd,start,end,maskStart,maskEnd
742 cdef NLMSA nl
743 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
744 if mergeMost: # BE REASONABLE: DON'T MERGE A WHOLE CHROMOSOME
745 maxgap=10000
746 maxinsert=10000
747 mininsert=-10 # ALLOW SOME OVERLAP IN INTERVAL ALIGNMENTS
748 maxsize=50000
749 if filterList is not None:
750 targetDict = {}
751 for seq in filterList: # CREATE AN INDEX OF SEQUENCE IDs TO KEEP
752 t = nl.seqs.getIDcoords(seq)
753 targetDict[t[0]] = t[1:] # SAVE START,STOP
754 seqIntervals={}
755 for i from 0 <= i < self.n: # LIST INTERVALS FOR EACH TARGET
756 if nl.seqlist.is_lpo(self.im[i].target_id):
757 continue # IT IS AN LPO, SO SKIP IT
758 start=self.im[i].start
759 end=self.im[i].end
760 targetStart=self.im[i].target_start
761 targetEnd=self.im[i].target_end
762 if filterList is not None:
763 try: # CHECK IF SEQUENCE IS IN MASKING DICTIONARY
764 maskStart,maskEnd = targetDict[self.im[i].target_id]
765 except KeyError:
766 continue # FILTER THIS SEQUENCE OUT OF THE RESULT SET
767 if start>=maskEnd or end<=maskStart: # NO OVERLAP
768 continue
769 if start<maskStart: # CLIP START TO MASKED REGION
770 targetStart = targetStart+maskStart-start
771 start = maskStart
772 if end>maskEnd:# CLIP END TO MASKED REGION
773 targetEnd = targetEnd+maskEnd-end
774 end = maskEnd
775 elif filterSeqs is not None: # CLIP TARGET SEQ INTERVAL
776 target = self.get_seq_interval(nl, self.im[i].target_id,
777 targetStart, targetEnd)
778 try:
779 target=filterSeqs[target] # PERFORM CLIPPING
780 except KeyError: # NO OVERLAP IN filterSeqs, SO SKIP
781 continue
782 start=start+target.start-targetStart # CLIP SOURCE SEQUENCE
783 end=end+target.stop-targetEnd
784 targetStart=target.start # GET COORDS OF CLIPPED TARGET
785 targetEnd=target.stop
786 try: # ADD INTERVAL TO EXISTING LIST
787 seqIntervals[self.im[i].target_id] \
788 .append([start,end,targetStart,targetEnd,None])
789 except KeyError: # CREATE A NEW LIST FOR THIS TARGET
790 seqIntervals[self.im[i].target_id]= \
791 [[start,end,targetStart,targetEnd,None]]
793 for i,l in seqIntervals.iteritems(): # MERGE INTERVALS FOR EACH SEQ
794 if ivalMethod is not None: # USER-SUPPLIED GROUPING FUNCTION
795 ivalMethod(l,nl.seqlist.getSeq(i),msaSlice=self,maxgap=maxgap,
796 maxinsert=maxinsert,mininsert=mininsert,
797 filterSeqs=filterSeqs,mergeAll=mergeAll,**kwargs)
798 continue # NO NEED TO APPLY GENERIC MERGING OPERATION BELOW
800 for j from 1 <= j < len(l): # MERGE BY INDEL LENGTH RULES
801 gap=l[j][0]-l[n][1] # current.start - last.end
802 insert=l[j][2]-l[n][3] # current.target_start - last.target_end
803 if not mergeAll and \
804 (gap>maxgap or insert>maxinsert or insert<mininsert
805 or l[j][1]-l[n][0]>maxsize
806 or l[j][3]-l[n][2]>maxsize):
807 n=n+1 # SPLIT, SO START A NEW INTERVAL
808 if n<j: # COPY START COORDS TO NEW SLOT
809 l[n][0]=l[j][0]
810 l[n][2]=l[j][2]
811 else: # INTERVALS MERGED: SAVE ORIGINAL 1:1 INTERVAL LIST
812 try:
813 lastIval = l[n][4][-1] # GET LAST 1:1 INTERVAL
814 except TypeError: # EMPTY LIST: CREATE ONE
815 if l[n][1]==l[j][0] and l[n][3]==l[j][2]: # NO GAP, SO MERGE
816 l[n][4] = [(l[n][0],l[j][1],l[n][2],l[j][3])]
817 else: # TWO SEPARATE 1:1 INTERVALS
818 l[n][4] = [tuple(l[n][:4]),tuple(l[j][:4])]
819 else: # SEE IF WE CAN FUSE TO LAST 1:1 INTERVAL
820 if lastIval[1]==l[j][0] and lastIval[3]==l[j][2]:
821 l[n][4][-1] = (lastIval[0],l[j][1],lastIval[2],l[j][3])
822 else: # GAP, SO JUST APPEND THIS 1:1 INTERVAL
823 l[n][4].append(tuple(l[j][:4]))
824 if n<j: # COPY END COORDS TO CURRENT SLOT
825 l[n][1]=l[j][1]
826 l[n][3]=l[j][3]
827 del l[n+1:] # DELETE REMAINING UNMERGED INTERVALS
828 for m in l: # CULL SINGLETON 1:1 INTERVAL LISTS (DUE TO FUSION)
829 try:
830 if len(m[4])==1: # TWO INTERVALS MUST HAVE BEEN FUSED
831 m[4] = None # NO NEED TO KEEP SINGLETON!
832 except TypeError:
833 pass
834 # SEQUENCE MASKING BY CONSERVATION OR %ALIGNED CONSTRAINT
835 if 'pAlignedMin' in kwargs or 'pIdentityMin' in kwargs or \
836 'minAlignSize' in kwargs or 'maxAlignSize' in kwargs:
837 self.filterIvalConservation(seqIntervals,**kwargs)
838 return seqIntervals
840 def conservationFilter(self,seq,m,pIdentityMin=None,
841 minAlignSize=None,maxAlignSize=None,**kwargs):
842 if minAlignSize is not None and m[1]-m[0]<minAlignSize:
843 return None
844 if maxAlignSize is not None and m[1]-m[0]>maxAlignSize:
845 return None
846 if pIdentityMin is not None:
847 seqEdge=sequence.Seq2SeqEdge(self,sequence.relativeSlice(seq,m[2],m[3]),
848 sequence.absoluteSlice(self.seq,m[0],m[1]),m[4])
849 t = seqEdge.conservedSegment(pIdentityMin=pIdentityMin, # GET CLIPPED INTERVAL
850 minAlignSize=minAlignSize,**kwargs)
851 if t is None:
852 return None
853 mergeIntervals = self.clip_interval_list(t[0],t[1],m[4]) # CLIP mergeIntervals
854 return list(t)+[mergeIntervals] # RECOMBINE
855 else:
856 return m
857 ## if pAlignedMin is not None and seqEdge.pAligned()<pAlignedMin:
858 ## return False # INTERVAL FAILED ALIGNMENT THRESHOLD, SO REMOVE IT
859 ## if pIdentityMin is not None and seqEdge.pIdentity()<pIdentityMin:
860 ## return False # INTERVAL FAILED CONSERVATION THRESHOLD, SO REMOVE IT
861 ## return True
863 def filterIvalConservation(self,seqIntervals,pIdentityMin=None,
864 filterFun=None,**kwargs):
865 cdef int i,j
866 cdef NLMSA nl
867 import types
868 if filterFun is None:
869 filterFun=self.conservationFilter
870 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
871 pIdentityMin0=pIdentityMin
872 for targetID,l in seqIntervals.items(): # MERGE INTERVALS FOR EACH SEQ
873 seq = self.get_seq_interval(nl, targetID, 0, 0) # GET THE SEQUENCE OBJECT
874 if pIdentityMin0 is not None and not isinstance(pIdentityMin0,types.FloatType):
875 try:
876 pIdentityMin=pIdentityMin0[seq] # LOOK UP DESIRED IDENTITY FOR THIS SEQ
877 except KeyError:
878 del seqIntervals[targetID] # SO REMOVE TARGET ENTIRELY
879 continue # NO NEED TO PROCESS THIS TARGET ANY FURTHER
881 for i from 0 <= i <len(l): # CHECK EACH INTERVAL FOR CONSERVATION THRESHOLD
882 newIval=filterFun(seq,l[i],pIdentityMin=pIdentityMin,**kwargs)
883 if newIval is None:
884 continue # l[i] FAILED FILTER CRITERIA, SO SKIP IT
885 l[j]=newIval # COMPACT THE ARRAY: KEEP newIval IN LOCATION j
886 j=j+1 # KEEP THIS ARRAY ENTRY, SO INCREMENT COUNT OF ENTRIES
887 if j==0: # NO INTERVALS FOR THIS SEQUENCE SURVIVED MASKING
888 del seqIntervals[targetID] # SO REMOVE TARGET ENTIRELY
889 elif j<i: # SOME INTERVALS REMOVED, SO SHRINK ITS LIST
890 del l[j:] # JUST TRUNCATE THE LIST TO ENTRIES THAT PASSED
892 def groupBySequences(self,seqIntervals,sourceOnly=False,
893 indelCut=False,seqGroups=None,minAligned=1,
894 pMinAligned=0.,seqMethod=None,**kwargs):
895 '''merge groups of sequences using "vertical" group-by rules:
896 - seqGroups: a list of one or more lists of sequences to group.
897 Each group will be analyzed separately, as follows:
898 - sourceOnly: output intervals will be reported giving only
899 the corresponding interval on the source sequence; redundant
900 output intervals (mapping to the same source interval) are
901 culled. Has the effect of giving a single interval traversal
902 of each group.
903 - indelCut: for sourceOnly mode, do not merge separate
904 intervals as reported by seqIntervals (in other words,
905 that the groupByIntervals analysis separated due to an indel).
906 - minAligned: the minimum #sequences that must be aligned to
907 the source sequence for masking the output. Regions below
908 this threshold are masked out; no intervals will be reported
909 in these regions.
910 - pMinAligned: the minimum fraction of sequences (out of the
911 total in the group) that must be aligned to the source
912 sequence for masking the output.
913 - seqMethod: you may supply your own function for grouping.
914 Called as seqMethod(bounds,seqs,**kwargs), where
915 bounds is a sorted list of
916 (ipos,isStart,i,ns,isIndel,(start,end,targetStart,targetEnd))
917 seqs is a list of sequences in the group.
918 Must return a list of (sourceIval,targetIval). See the docs.
920 cdef int i,j,start,end,targetStart,targetEnd,ipos,id
921 cdef float f
922 cdef NLMSA nl
923 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
924 if seqGroups is None:
925 seqGroups=[seqIntervals] # JUST USE THE WHOLE SET
926 result=[]
927 import mapping # GET ACCESS TO DictQueue CLASS
928 for seqs in seqGroups: # PROCESS EACH SEQ GROUP
929 bounds=[]
931 for seq in seqs: # CONSTRUCT INTERVAL BOUNDS LIST
932 if isinstance(seq,int): # seqIntervals USES INT INDEX VALUES
933 id=seq # SAVE THE ID
934 seq = self.get_seq_interval(nl, id, 0, 0) # GET THE SEQUENCE OBJECT
935 else: # EXPECT USER TO SUPPLY ACTUAL SEQUENCE OBJECTS
936 id=nl.seqs.getID(seq)
937 seq=seq.pathForward # ENSURE WE HAVE TOP-LEVEL SEQ OBJECT
938 try:
939 ivals=seqIntervals[id]
940 except KeyError: # SEQUENCE NOT IN THIS ALIGNMENT REGION, SO SKIP
941 continue
942 isIndel=False
943 for ival in ivals:
944 bounds.append((ival[1],False,j,seq,isIndel,ival))
945 bounds.append((ival[0],True,j,seq,isIndel,ival))
946 isIndel=True
947 j=j+1 # SEQUENCE COUNTER ENSURES ORDER OF SEQS IN SORTED LIST
948 bounds.sort() # ASCENDING ORDER OF source_pos, SORT stop B4 start
949 if seqMethod is not None:
950 result=result+seqMethod(bounds,seqs,sourceOnly=sourceOnly,
951 msaSlice=self,minAligned=minAligned,
952 pMinAligned=pMinAligned,
953 indelCut=indelCut,**kwargs)
954 continue # DON'T USE GENERIC GROUPING METHOD BELOW
955 seqStart=mapping.DictQueue() # setitem PUSHES, delitem POPS
956 maskStart=None
957 for bound in bounds: # GENERIC GROUPING: APPLY MASKING, sourceOnly
958 ipos,isStart,j,seq,isIndel=bound[0:5]
959 if isStart: # INTERVAL START
960 seqStart[seq]=bound[5] # JUST RECORD START OF INTERVAL
961 else: # INTERVAL STOP
962 start,end,targetStart,targetEnd,mergeIntervals=bound[5]
963 if maskStart is not None and not sourceOnly: # SAVE TARGET IVAL
964 if maskStart>start: # TRUNCATE TARGET IVAL START
965 targetStart=targetStart+maskStart-start
966 start=maskStart
967 mergeIntervals = self.clip_interval_list(maskStart,None,mergeIntervals)
968 result.append((sequence.absoluteSlice(self.seq,start,end),
969 sequence.relativeSlice(seq,targetStart,
970 targetEnd),mergeIntervals))
971 del seqStart[seq] # POP THIS SEQ FROM START DICT
973 f=len(seqStart) # #ALIGNED SEQS IN THIS REGION
974 if f<minAligned or f/len(seqs)<pMinAligned: # APPLY MASKING
975 if maskStart is not None:
976 if sourceOnly: # JUST SAVE MERGED SOURCE INTERVAL
977 result.append(sequence.absoluteSlice(self.seq,maskStart,end))
978 else: # REPORT TARGET IVALS WITHIN (maskStart,end) REGION
979 for seq in seqStart: # CANNOT USE items() BECAUSE THIS IS A QUEUE!
980 (start,i,targetStart,targetEnd,mergeIntervals)=seqStart[seq]
981 pleaseClip = False
982 if maskStart>start: # TRUNCATE TARGET IVAL START
983 targetStart=targetStart+maskStart-start
984 start=maskStart
985 pleaseClip = True
986 if end<i: # TRUNCATE TARGET IVAL END
987 targetEnd=targetEnd+end-i
988 pleaseClip = True
989 if pleaseClip:
990 mergeIntervals = self.clip_interval_list(maskStart,end,mergeIntervals)
991 result.append((sequence.absoluteSlice(self.seq,start,end),
992 sequence.relativeSlice(seq,targetStart,
993 targetEnd),mergeIntervals))
994 maskStart=None # REGION NOW BELOW THRESHOLD
995 elif maskStart is None:
996 maskStart=ipos # START OF REGION ABOVE THRESHOLD
997 if maskStart is not None and sourceOnly and indelCut \
998 and isIndel and maskStart<ipos:
999 result.append(sequence.absoluteSlice(self.seq,maskStart,ipos))
1000 maskStart=ipos
1001 return result
1002 def clip_interval_list(self,start,end,l):
1003 'truncate list of 1:1 intervals using start,end'
1004 if l is None:
1005 return None
1006 result = []
1007 for srcStart,srcEnd,destStart,destEnd in l:
1008 if (start is not None and start>=srcEnd) or (end is not None and end<=srcStart):
1009 continue
1010 if start is not None and start>srcStart:
1011 destStart = destStart+start-srcStart
1012 srcStart = start
1013 if end is not None and end<srcEnd:
1014 destEnd = destEnd+end-srcEnd
1015 srcEnd = end
1016 result.append((srcStart,srcEnd,destStart,destEnd))
1017 if len(result)<2:
1018 return None
1019 else:
1020 return result
1022 ############################################## LPO REGION METHODS
1023 def split(self,minAligned=0,**kwargs):
1024 '''Use groupByIntervals() and groupBySequences() methods to
1025 divide this slice into subslices using indel rules etc.'''
1026 seqIntervals=self.groupByIntervals(**kwargs)
1027 kwargs['sourceOnly']=True
1028 kwargs['indelCut']=True
1029 ivals=self.groupBySequences(seqIntervals,minAligned=minAligned,
1030 **kwargs)
1031 l=[]
1032 for ival in ivals:
1033 if ival.start==self.start and ival.stop==self.stop:
1034 l.append(self) # SAME INTERVAL, SO JUST RETURN self
1035 else:
1036 subslice=NLMSASlice(self.nlmsaSequence,ival.start,ival.stop,
1037 self.id,self.offset,self.seq)
1038 l.append(subslice)
1039 return l
1042 def regions(self,dummyArg=None,**kwargs):
1043 '''get LPO region(s) corresponding to this interval
1044 Same group-by rules apply here as for the split() method.'''
1045 cdef int i
1046 cdef NLMSA nl
1047 cdef NLMSASequence ns_lpo
1048 if self.nlmsaSequence.is_lpo: # ALREADY AN LPO REGION!
1049 return self.split(**kwargs) # JUST APPLY GROUP-BY RULES TO self
1050 nl=self.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1051 l=[]
1052 for i from 0 <= i <self.nseqBounds:
1053 ns_lpo=nl.seqlist[self.seqBounds[i].target_id]
1054 if ns_lpo.is_lpo: # ADD ALL LPO INTERVALS ALIGNED HERE
1055 subslice=NLMSASlice(ns_lpo,self.seqBounds[i].target_start,
1056 self.seqBounds[i].target_end)
1057 l=l+subslice.split(**kwargs) # APPLY GROUP-BY RULES
1058 if len(l)>0:
1059 return l
1060 raise ValueError('no LPO in nlmsaSlice.seqBounds? Debug!')
1063 ########################################### LETTER METHODS
1064 property letters:
1065 'interface to individual LPO letters in this interval'
1066 def __get__(self):
1067 return NLMSASliceLetters(self)
1069 def __cmp__(self,other):
1070 if isinstance(other,NLMSASlice):
1071 return cmp(self.nlmsaSequence,other.nlmsaSequence)
1072 else:
1073 return -1
1075 def rawIvals(self):
1076 'return list of raw numeric intervals in this slice'
1077 cdef int i
1078 l=[]
1079 for i from 0 <= i < self.n:
1080 l.append((self.im[i].start,self.im[i].end,self.im[i].target_id,
1081 self.im[i].target_start,self.im[i].target_end))
1082 return l
1088 def advanceStartStop(int ipos,NLMSASlice nlmsaSlice not None,
1089 int istart,int istop):
1090 cdef int i
1091 if istop>=nlmsaSlice.n:
1092 raise IndexError('out of bounds')
1093 for i from istop <= i < nlmsaSlice.n:
1094 if ipos>=nlmsaSlice.im[i].start: # ENTERS THIS INTERVAL
1095 istop = i + 1 # ADVANCE THE END MARKER
1096 else:
1097 break # BEYOND ipos, SEARCH NO FURTHR
1098 for i from istart <= i < istop: # FIND 1ST OVERLAP
1099 if ipos<nlmsaSlice.im[i].end:
1100 break
1101 return i,istop
1108 cdef class NLMSASliceIterator:
1109 'generate letters (nodes) in this LPO slice'
1110 def __new__(self,NLMSASlice nlmsaSlice not None):
1111 self.nlmsaSlice=nlmsaSlice
1112 self.ipos= nlmsaSlice.start - 1
1114 def __iter__(self):
1115 return self
1117 def __next__(self):
1118 self.ipos = self.ipos + 1
1119 # ADJUST istart,istop TO OVERLAP ipos
1120 self.istart,self.istop= \
1121 advanceStartStop(self.ipos,self.nlmsaSlice,self.istart,self.istop)
1122 if self.istart>=self.istop: # HMM, NO OVERLAPS TO ipos
1123 if self.istop<self.nlmsaSlice.n: # ANY MORE INTERVALS?
1124 self.ipos=self.nlmsaSlice.im[self.istop].start # START OF NEXT INTERVAL
1125 # ADJUST istart,istop TO OVERLAP ipos
1126 self.istart,self.istop= \
1127 advanceStartStop(self.ipos,self.nlmsaSlice,self.istart,self.istop)
1128 else:
1129 raise StopIteration # NO MORE POSITIONS IN THIS SLICE
1130 return NLMSANode(self.ipos,self.nlmsaSlice,self.istart,self.istop)
1138 cdef class NLMSANode:
1139 'interface to a node in NLMSA storage of LPO alignment'
1140 def __new__(self,int ipos,NLMSASlice nlmsaSlice not None,
1141 int istart=0,int istop= -1):
1142 cdef int i,n
1143 cdef NLMSA nl
1144 self.nlmsaSlice=nlmsaSlice
1145 nl=nlmsaSlice.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1146 self.ipos=ipos
1147 if istop<0:
1148 istart,istop= \
1149 advanceStartStop(ipos,nlmsaSlice,istart,istart) # COMPUTE PROPER BOUNDS
1150 self.istart=istart # SAVE BOUNDS
1151 self.istop=istop
1152 self.id=ipos # DEFAULT: ASSUME SLICE IS IN LPO...
1153 for i from istart <= i < istop:
1154 if nlmsaSlice.im[i].start<=ipos and ipos<nlmsaSlice.im[i].end:
1155 if nl.seqlist.is_lpo(nlmsaSlice.im[i].target_id):
1156 self.id=nlmsaSlice.im[i].target_start+ipos-nlmsaSlice.im[i].start #LPO ipos
1157 else: # DON'T COUNT THE LPO SEQUENCE
1158 self.n = self.n + 1
1160 ############################################# ALIGNED LETTER METHODS
1161 def __len__(self):
1162 return self.n
1163 def __iter__(self):
1164 cdef int i,j
1165 cdef NLMSA nl
1166 nl=self.nlmsaSlice.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1167 l=[]
1168 for i from self.istart <= i < self.istop:
1169 if self.nlmsaSlice.im[i].start<=self.ipos \
1170 and self.ipos<self.nlmsaSlice.im[i].end \
1171 and not nl.seqlist.is_lpo(self.nlmsaSlice.im[i].target_id):
1172 j=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1173 l.append(nl.seqInterval(self.nlmsaSlice.im[i].target_id,j,j+1))
1174 return iter(l)
1176 def getSeqPos(self,seq):
1177 'return seqpos for this seq at this node'
1178 cdef int i,j,id
1179 try:
1180 id=self.nlmsaSlice.nlmsaSequence.nlmsaLetters.seqs.getID(seq)
1181 except KeyError:
1182 raise KeyError('seq not in this alignment')
1183 for i from self.istart <= i < self.istop:
1184 if self.nlmsaSlice.im[i].target_id==id: # RETURN THE SEQUENCE INTERVAL
1185 j=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1186 return sequence.absoluteSlice(seq,j,j+1)
1187 raise KeyError('seq not in node')
1189 property edges:
1190 "get this node's edges as list of tuples (ival1,ival2,edge)"
1191 def __get__(self):
1192 l=[]
1193 ival1=sequence.absoluteSlice(self.nlmsaSlice.seq,
1194 self.ipos,self.ipos+1) # OUR SEQ INTERVAL
1195 for ival2 in self:
1196 l.append((ival1,ival2,None)) # GET EDGE INFO!!!
1197 return l
1199 def __getitem__(self,seq):
1200 raise NotImplementedError('hey! write some code here!')
1201 ## from lpo import POMSANodeRef # PROBABLY WONT NEED THIS AFTER RENAMING!!!
1202 ## try:
1203 ## s=self.getSeqPos(seq)
1204 ## return POMSANodeRef(self,seq.path)
1205 ## else:
1206 ## raise KeyError('seq not in node')
1208 def __cmp__(self,other):
1209 if isinstance(other,NLMSANode):
1210 return cmp((self.nlmsaSlice,ipos),(other.nlmsaSlice,ipos))
1211 else:
1212 return -1
1214 ########################################## NODE-TO-NODE EDGE METHODS
1215 cdef int check_edge(self,int iseq,int ipos):
1216 cdef int i
1217 for i from self.istart <= i < self.istop:
1218 if self.nlmsaSlice.im[i].start<=self.ipos \
1219 and self.ipos<self.nlmsaSlice.im[i].end \
1220 and self.nlmsaSlice.im[i].target_id==iseq \
1221 and self.nlmsaSlice.im[i].target_start==ipos:
1222 return 1 # MATCH!
1223 return 0 # NO MATCH!
1225 def getEdgeSeqs(self,NLMSANode other):
1226 "return dict of sequences that traverse edge from self -> other"
1227 cdef int i
1228 cdef NLMSA nl
1229 nl=self.nlmsaSlice.nlmsaSequence.nlmsaLetters # GET TOPLEVEL LETTERS OBJECT
1230 d={}
1231 if self.id+1==other.id: #other ADJACENT IN LPO
1232 for i from self.istart <= i < self.istop:
1233 if self.nlmsaSlice.im[i].start<=self.ipos+1 \
1234 and self.ipos+1<self.nlmsaSlice.im[i].end: # INTERVAL CONTAINS other
1235 seq=nl.seqlist.getSeq(self.nlmsaSlice.im[i].target_id)
1236 d[seq]=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1237 elif self.ipos+1==self.im[i].end \
1238 and other.check_edge(self.nlmsaSlice.im[i].target_id,
1239 self.nlmsaSlice.im[i].target_end):
1240 seq=nl.seqlist.getSeq(self.nlmsaSlice.im[i].target_id) # BRIDGE TO NEXT INTERVAL
1241 d[seq]=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1242 else: # other NOT ADJACENT, SO INTERVALS THAT END HERE MIGHT JUMP TO other
1243 for i from self.istart <= i < self.istop:
1244 if self.ipos+1==self.nlmsaSlice.im[i].end \
1245 and other.check_edge(self.nlmsaSlice.im[i].target_id,
1246 self.nlmsaSlice.im[i].target_end):
1247 seq=nl.seqlist.getSeq(self.nlmsaSlice.im[i].target_id) # BRIDGE TO NEXT INTERVAL
1248 d[seq]=self.nlmsaSlice.im[i].target_start+self.ipos-self.nlmsaSlice.im[i].start
1249 return d
1251 def nodeEdges(self):
1252 'get all outgoing edges from this node'
1253 cdef int i,has_continuation
1254 has_continuation= -1
1255 d={}
1256 nodes={} # LIST OF TARGET NODES WE HAVE AN EDGE TO
1257 for i from self.istart <= i < self.nlmsaSlice.n:
1258 if i>=self.istop and len(d)==0: # NO FURTHER CHECKS TO DO
1259 break
1260 if self.nlmsaSlice.im[i].start<=self.ipos and \
1261 self.ipos<self.nlmsaSlice.im[i].end-1:
1262 has_continuation=i # ipos INSIDE THIS INTERVAL, SO HAS EDGE TO ipos+1
1263 elif self.ipos==self.nlmsaSlice.im[i].end-1: # END OF THIS INTERVAL
1264 d[self.nlmsaSlice.im[i].target_id]=self.nlmsaSlice.im[i].target_end
1265 else:
1266 try: # CHECK FOR START OF AN "ADJACENT" INTERVAL
1267 if d[self.nlmsaSlice.im[i].target_id]==self.nlmsaSlice.im[i].target_start:
1268 nodes[self.nlmsaSlice.im[i].start]=i
1269 del d[self.nlmsaSlice.im[i].target_id] # REMOVE FROM ADJACENCY LIST
1270 except KeyError:
1271 pass
1272 if has_continuation>=0:
1273 nodes[self.ipos+1]=has_continuation
1274 result={}
1275 for i in nodes:
1276 node=NLMSANode(i,self.nlmsaSlice,self.istart)
1277 result[node]=sequence.LetterEdge(self,node) # EDGE OBJECT
1278 return result
1285 cdef class NLMSASequence:
1286 'sequence interface to NLMSA storage of an LPO alignment'
1287 def __init__(self,NLMSA nl not None,filestem,seq,mode='r',is_union=0,
1288 length=None):
1289 self.nlmsaLetters=nl
1290 self.filestem=filestem
1291 self.is_union=is_union
1292 self.is_lpo=0 # DEFAULT: NOT AN LPO
1293 self.seq=seq
1294 if length is not None: # ALLOW USER TO SUPPLY A LENGTH FOR THIS COORD SYSTEM
1295 self.length=length
1296 import types
1297 if isinstance(seq,types.StringType):
1298 self.name=seq # ALLOW USER TO BUILD INDEXES WITH A STRING NAME
1299 elif seq is not None: # REGULAR SEQUENCE
1300 seq= seq.pathForward # GET THE WHOLE SEQUENCE, IN FORWARD ORIENTATION
1301 try: # MAKE SURE seq HAS A UNIQUE NAME FOR INDEXING IT...
1302 self.name=str(seq.path.name)
1303 except AttributeError:
1304 try:
1305 self.name=str(seq.path.id)
1306 except AttributeError:
1307 raise AttributeError('NLMSASequence: seq must have name or id attribute')
1308 else:
1309 self.length=0 # LPO AND UNION SEQUENCES EXPAND AUTOMATICALLY
1310 if not is_union:
1311 self.is_lpo=1
1312 if len(nl.lpoList)>0: # CALCULATE OFFSET OF NEW LPO, BASED ON LAST LPO
1313 lastLPO=nl.lpoList[-1]
1314 self.offset=lastLPO.offset+lastLPO.length
1315 else:
1316 self.offset=0
1317 nl.lpoList.append(self) # ADD TO THE LPO LIST
1318 self.idb=None # DEFAULT: NOT USING IN-MEMORY DATABASE.
1319 self.db=None # DEFAULT: WAIT TO OPEN DB UNTIL ACTUALLY NEEDED
1320 if mode=='r': # IMMEDIATELY OPEN DATABASE, UNLIKE onDemand MODE
1321 self.db=IntervalFileDB(filestem,mode)
1322 elif mode=='memory': # OPEN IN-MEMORY DATABASE
1323 self.idb=IntervalDB()
1324 elif mode=='w': # WRITE .build FILE
1325 filename=filestem+'.build'
1326 self.build_ifile=fopen(filename,'wb') # binary file
1327 if self.build_ifile==NULL:
1328 errmsg='unable to open in write mode: '+filename
1329 raise IOError(errmsg)
1330 self.nbuild=0
1332 def __dealloc__(self):
1333 'remember: dealloc cannot call other methods!'
1334 if self.build_ifile:
1335 fclose(self.build_ifile)
1337 def forceLoad(self):
1338 'force database to be initialized, if not already open'
1339 self.db=IntervalFileDB(self.filestem,'r')
1341 def close(self):
1342 'free memory and close files associated with this sequence index'
1343 if self.db is not None:
1344 self.db.close() # CLOSE THE DATABASE, RELEASE MEMORY
1345 self.db=None # DISCONNECT FROM DATABASE
1346 if self.idb is not None:
1347 self.idb.close() # CLOSE THE DATABASE, RELEASE MEMORY
1348 self.idb=None # DISCONNECT FROM DATABASE
1349 if self.build_ifile:
1350 fclose(self.build_ifile)
1351 self.build_ifile=NULL
1353 def buildFiles(self,**kwargs):
1354 'build nested list from saved unsorted alignment data'
1355 cdef IntervalDB db
1356 if self.build_ifile==NULL:
1357 raise IOError('not opened in write mode')
1358 fclose(self.build_ifile)
1359 self.build_ifile=NULL
1360 filename=self.filestem+'.build'
1361 db=IntervalDB() # CREATE EMPTY NL IN MEMORY
1362 if self.nbuild>0:
1363 db.buildFromUnsortedFile(filename,self.nbuild,**kwargs) # BUILD FROM .build
1364 db.write_binaries(self.filestem) # SAVE AS IntervalDBFile
1365 db.close() # DUMP NESTEDLIST FROM MEMORY
1366 import os
1367 os.remove(filename) # REMOVE OUR .build FILE, NO LONGER NEEDED
1368 self.db=IntervalFileDB(self.filestem) # NOW OPEN THE IntervalFileDB
1369 return self.nbuild # return count of intervals
1371 def buildInMemory(self, **kwargs):
1372 try:
1373 n = len(self.buildList)
1374 except TypeError:
1375 return 0
1376 else:
1377 self.idb.save_tuples(self.buildList,**kwargs)
1378 self.buildList = None
1379 return n
1381 cdef int saveInterval(self,IntervalMap im[],int n,int expand_self,FILE *ifile):
1382 cdef int i
1383 if ifile==NULL:
1384 raise IOError('not opened in write mode')
1385 if expand_self: # AN LPO THAT EXPANDS AS WE ADD TO IT...
1386 for i from 0 <= i < n:
1387 if im[i].start>=0:
1388 if im[i].end>self.length: # EXPAND IT...
1389 self.length=im[i].end
1390 elif -(im[i].start)>self.length:
1391 self.length= -(im[i].start) # THIS HANDLES NEGATIVE ORI CASE
1392 i=write_padded_binary(im,n,1,ifile)
1393 if i!=n:
1394 raise IOError('write_padded_binary failed???')
1395 return i
1397 def __setitem__(self,k,t): # SAVE TO .build FILE
1398 'save mapping [k.start:k.stop] --> (id,start,stop)'
1399 cdef int i
1400 cdef IntervalMap im_tmp
1401 if self.build_ifile: # SAVE TO BUILD FILE
1402 im_tmp.start,im_tmp.end=(k.start,k.stop)
1403 im_tmp.target_id,im_tmp.target_start,im_tmp.target_end=t
1404 im_tmp.sublist= -1
1405 i=self.saveInterval(&im_tmp,1,self.is_lpo,self.build_ifile)
1406 #logger.debug('saveInterval: %s %s %s %s %s %s' % (self.id, im_tmp.start, im_tmp.end,
1407 # im_tmp.target_id, im_tmp.target_start, im_tmp.target_end))
1408 self.nbuild=self.nbuild+i # INCREMENT COUNTER OF INTERVALS SAVED
1409 elif self.nlmsaLetters.in_memory_mode:
1410 t=(k.start,k.stop)+t
1411 try:
1412 self.buildList.append(t)
1413 except AttributeError:
1414 self.buildList=[t]
1415 else:
1416 raise ValueError('not opened in write mode')
1418 def __getitem__(self,k):
1419 try:
1420 if k.pathForward is self.seq:
1421 return NLMSASlice(self,k.start,k.stop)
1422 except AttributeError: pass
1423 raise KeyError('key must be a sequence interval of this sequence')
1425 def __len__(self):
1426 'call len(self.seq) if we have a seq. Otherwise self.length'
1427 if self.seq is None:
1428 return self.length
1429 else:
1430 return len(self.seq)
1432 def __iadd__(self,seq):
1433 'add sequence to our union'
1434 try: # CHECK WHETHER THIS IS ALREADY IN THE INDEX
1435 x=self.nlmsaLetters.seqs[seq]
1436 return self # ALREADY IN THE INDEX, NO NEED TO ANYTHING
1437 except KeyError: # OK, WE REALLY DO NEED TO ADD IT...
1438 pass
1439 # CHECK FOR OVERFLOW... CREATE A NEW UNION IF NEEDED
1440 seq=seq.pathForward # GET THE ENTIRE SEQUENCE
1441 if self.length+len(seq)>self.nlmsaLetters.maxlen: # TOO BIG!
1442 if self.nlmsaLetters.pairwiseMode: # NEED TO CREATE CORRESPONDING LPO
1443 ns=self.nlmsaLetters.newSequence(None) # CREATE NEW LPO
1444 ns=self.nlmsaLetters.newSequence(None,is_union=1) # NEW UNION
1445 ns.__iadd__(seq) # ADD seq TO BRAND-NEW UNION
1446 return ns # RETURN THE NEW UNION COORDINATE SYSTEM
1447 # USE OUR EXISTING UNION
1448 self.nlmsaLetters.seqs.saveSeq(seq,self.id,self.length)
1449 self.length=self.length+len(seq) # EXPAND COORDINATE SYSTEM
1450 return self # iadd MUST ALWAYS RETURN self!!!
1457 cdef class NLMSA:
1458 'toplevel interface to NLMSA storage of an LPO alignment'
1459 def __init__(self,pathstem='',mode='r',seqDict=None,mafFiles=None,axtFiles=None,
1460 maxOpenFiles=1024,maxlen=None,nPad=1000000,maxint=41666666,
1461 trypath=None,bidirectional=True,pairwiseMode= -1,
1462 bidirectionalRule=nlmsa_utils.prune_self_mappings,
1463 use_virtual_lpo=None,maxLPOcoord=None,
1464 inverseDB=None, alignedIvals=None, **kwargs):
1465 try:
1466 import resource # WE MAY NEED TO OPEN A LOT OF FILES...
1467 resource.setrlimit(resource.RLIMIT_NOFILE,(maxOpenFiles,-1))
1468 except: # BUT THIS IS OPTIONAL...
1469 pass
1470 self.lpoList=[] # EMPTY LIST OF LPO
1471 self.seqs=nlmsa_utils.NLMSASeqDict(self,pathstem,mode,**kwargs)
1472 self.seqlist=self.seqs.seqlist
1473 self.pathstem=pathstem
1474 self.inverseDB = inverseDB
1475 if maxlen is None:
1476 maxlen=C_int_max-65536 # C_int_max MAXIMUM VALUE REPRESENTABLE BY int
1477 if axtFiles is not None:
1478 maxlen = maxlen/2
1479 self.maxlen=maxlen
1480 self.inlmsa=nPad
1481 self._ignoreShadowAttr={'sourceDB':None,'targetDB':None} # SCHEMA INFO
1482 self.seqDict=seqDict # SAVE FOR USER TO ACCESS...
1483 self.in_memory_mode=0
1484 if bidirectional:
1485 self.is_bidirectional=1
1486 else:
1487 self.is_bidirectional=0
1488 if use_virtual_lpo is not None: # DEPRECATED: MUST HAVE COME FROM PICKLE...
1489 pairwiseMode = use_virtual_lpo # USE SETTING FROM PICKLE...
1490 if pairwiseMode is True:
1491 self.pairwiseMode=1
1492 elif pairwiseMode is False:
1493 self.pairwiseMode=0
1494 else:
1495 self.pairwiseMode=pairwiseMode
1496 if maxLPOcoord is not None: # SIZE OF THE ENTIRE LPO COORD SYSTEM
1497 self.maxLPOcoord=maxLPOcoord
1498 if self.pairwiseMode==1:
1499 raise ValueError('maxLPOcoord and pairwiseMode options incompatible!')
1500 self.pairwiseMode=0
1501 else: # DEFAULT: RESTRICT USER TO A SINGLE LPO
1502 self.maxLPOcoord=self.maxlen
1503 if mode=='r': # OPEN FROM DISK FILES
1504 if self.seqDict is None:
1505 self.seqDict = nlmsa_utils.read_seq_dict(pathstem,trypath)
1506 self.read_indexes(self.seqDict)
1507 self.read_attrs()
1508 elif mode=='w': # WRITE TO DISK FILES
1509 self.do_build=1
1510 self.lpo_id=0
1511 if mafFiles is not None:
1512 self.newSequence() # CREATE INITIAL LPO
1513 self.readMAFfiles(mafFiles,maxint)
1514 elif axtFiles is not None:
1515 self.newSequence() # CREATE INITIAL LPO
1516 self.readAxtNet(axtFiles,bidirectionalRule)
1517 else: # USER WILL ADD INTERVAL MAPPINGS HIMSELF...
1518 if self.seqDict is None:
1519 import seqdb
1520 self.seqDict=seqdb.SeqPrefixUnionDict(addAll=True)
1521 self.initLPO() # CREATE AS MANY LPOs AS WE NEED
1522 self.newSequence(is_union=1) # SO HE NEEDS AN INITIAL UNION
1523 if alignedIvals is not None:
1524 self.add_aligned_intervals(alignedIvals)
1525 self.build()
1526 elif mode=='memory': # CONSTRUCT IN-MEMORY
1527 if self.seqDict is None:
1528 import seqdb
1529 self.seqDict=seqdb.SeqPrefixUnionDict(addAll=True)
1530 self.in_memory_mode=1
1531 self.do_build=1
1532 self.initLPO() # CREATE AS MANY LPOs AS WE NEED
1533 self.newSequence(is_union=1) # CREATE INITIAL UNION
1534 self.lpo_id=0
1535 if alignedIvals is not None:
1536 self.add_aligned_intervals(alignedIvals)
1537 self.build()
1538 elif mode!='xmlrpc':
1539 raise ValueError('unknown mode %s' % mode)
1541 def close(self):
1542 'close our shelve index files'
1543 cdef NLMSASequence ns
1544 for ns in self.seqlist: # tell each seq to close its index files
1545 ns.close()
1546 self.seqs.close()
1548 def __reduce__(self): ############################# SUPPORT FOR PICKLING
1549 import classutil
1550 return (classutil.ClassicUnpickler, (self.__class__,self.__getstate__()))
1551 def __getstate__(self):
1552 if self.in_memory_mode:
1553 raise ValueError("can't pickle NLMSA.in_memory_mode")
1554 return dict(pathstem=self.pathstem,seqDict=self.seqDict,
1555 inverseDB=self.inverseDB)
1556 def __setstate__(self,state):
1557 self.__init__(**state) #JUST PASS KWARGS TO CONSTRUCTOR
1559 def read_indexes(self,seqDict):
1560 'open all nestedlist indexes in this LPO database for immediate use'
1561 cdef NLMSASequence ns
1562 try:
1563 ifile=file(self.pathstem+'.NLMSAindex', 'rU') # text file
1564 except IOError:
1565 ifile=file(self.pathstem+'NLMSAindex', 'rU') # FOR BACKWARDS COMPATIBILITY
1566 try:
1567 for line in ifile:
1568 id,name,is_union,length=line.strip().split('\t')
1569 id=int(id)
1570 is_union=int(is_union)
1571 if id!=len(self.seqlist):
1572 raise IOError('corrupted NLMSAIndex???')
1573 filestem=self.pathstem+str(id)
1574 seq=None # DEFAULT: NO ACTUAL SEQUENCE ASSOCIATED WITH LPO OR UNION
1575 if name=='NLMSA_LPO_Internal': # AN LPO REFERENCE
1576 self.lpo_id=id
1577 elif not is_union: # REGULAR SEQUENCE
1578 try:
1579 seq=seqDict[name]
1580 except KeyError:
1581 raise KeyError('unable to find sequence %s in seqDict!' % name)
1582 # CREATE THE SEQ INTERFACE, BUT DELAY OPENING THE IntervalDBFile
1583 ns=NLMSASequence(self,filestem,seq,'onDemand',is_union) # UNTIL NEEDED
1584 ns.length=int(length) # SAVE STORED LENGTH
1585 self.addToSeqlist(ns,seq)
1586 finally:
1587 ifile.close()
1588 def read_attrs(self):
1589 'read pickled attribute dictionary from file and apply to self'
1590 import pickle
1591 try:
1592 ifile = file(self.pathstem+'.attrDict', 'rb') # pickle is binary file!
1593 except IOError: # BACKWARDS COMPATIBILITY: OLD NLMSA HAS NOT ATTRDICT
1594 return
1595 try:
1596 d = pickle.load(ifile)
1597 for k,v in d.items():
1598 if k=='is_bidirectional':
1599 self.is_bidirectional = v
1600 elif k=='pairwiseMode':
1601 self.pairwiseMode = v
1602 else:
1603 setattr(self,k,v)
1604 finally:
1605 ifile.close()
1606 def addToSeqlist(self,NLMSASequence ns,seq=None):
1607 'add an NLMSASequence to our seqlist, and set its id'
1608 ns.id=self.seqlist.nextID()
1609 self.seqs[seq]=ns # SAVE TO OUR INDEX
1611 def newSequence(self,seq=None,is_union=0):
1612 'create a new nestedlist index for sequence seq'
1613 cdef NLMSASequence ns
1614 id=self.seqlist.nextID() # USE ITS INDEX AS UNIQUE ID FOR THIS SEQUENCE
1615 if self.pairwiseMode==1 and is_union and id>1: # NEED A VIRTUAL_LPO FOR THIS UNION
1616 self.newSequence() # CREATE AN LPO
1617 id=self.seqlist.nextID() # NOW GET ID FOR THE UNION
1618 filestem=self.pathstem+str(id)
1619 if self.in_memory_mode:
1620 mode='memory'
1621 else:
1622 mode='w'
1623 ns=NLMSASequence(self,filestem,seq,mode,is_union=is_union) #OPEN FOR WRITING
1624 if is_union: # RECORD THIS AS OUR CURRENT UNION OBJECT
1625 self.currentUnion=ns
1626 self.addToSeqlist(ns,seq) # SAVE TO OUR INDEX
1627 #logger.debug('Opened build file for ns_id %s, is_union %s' % (ns.id,
1628 # ns.is_union))
1629 return ns
1631 def nextID(self):
1632 'get a unique nlmsaID and advance counter'
1633 nlmsaID=self.inlmsa # USE THE NEXT FREE ID
1634 self.inlmsa=self.inlmsa+1 # ADVANCE THE COUNTER
1635 return nlmsaID
1637 def initLPO(self):
1638 'create LPOs for this alignment'
1639 cdef NLMSASequence ns
1640 offset=0
1641 while offset<self.maxLPOcoord:
1642 ns=self.newSequence() # CREATE AN LPO
1643 ns.offset=offset # FORCE OUR DESIRED OFFSET... EVEN THOUGH ALL LPOs EMPTY
1644 offset=offset+self.maxlen
1645 def init_pairwise_mode(self):
1646 'turn on use of virtual LPO mapping (i.e. no actual LPO is present!)'
1647 if self.pairwiseMode==0:
1648 raise ValueError('this alignment is already using an LPO!')
1649 elif self.pairwiseMode!=1: # NOT ALREADY SET TO PAIRWISE MODE
1650 logger.info('''
1651 Because you are aligning a pair of sequence intervals,
1652 the pairwiseMode=True option is automatically being applied.
1653 To avoid this message in the future, pass the pairwiseMode=True
1654 option to the NLMSA constructor.
1655 See the NLMSA documentation for more details.\n''')
1656 self.pairwiseMode=1 # TURN ON THE VIRTUAL LPO FEATURE
1659 def __setitem__(self,k,v): # THIS METHOD EXISTS ONLY FOR nlmsa[s1]+=s2
1660 if isinstance(v,nlmsa_utils.BuildMSASlice):
1661 if v.seq is not None and v.seq==k: # CASE WHERE k IS A SEQ INTERVAL
1662 return # MATCHES += USAGE: NO NEED TO DO ANYTHING!
1663 elif v.is_lpo and isinstance(k,slice) \
1664 and k.start==v.start and k.stop==v.stop: # CASE: k IS A slice
1665 return # MATCHES += USAGE: NO NEED TO DO ANYTHING!
1666 raise KeyError('Usage only nlmsa[s1]+=s2 allowed. nlmsa[s1]=s2 forbidden!')
1667 def __getitem__(self,k):
1668 'return a slice of the LPO'
1669 if isinstance(k,sequence.SeqPath): # TREAT k AS A SEQUENCE INTERVAL
1670 id,ns,offset=self.seqs[k] # GET UNION INFO FOR THIS SEQ
1671 if self.do_build:
1672 return nlmsa_utils.BuildMSASlice(ns,k.start,k.stop,id,offset,0,k)
1673 else: # QUERY THE ALIGNMENT
1674 try:
1675 return NLMSASlice(ns,k.start,k.stop,id,offset,k)
1676 except nlmsa_utils.EmptySliceError:
1677 return nlmsa_utils.EmptySlice(k)
1678 try: # TREAT k AS A PYTHON SLICE OBJECT
1679 i=k.start
1680 except AttributeError:
1681 raise KeyError('key must be a sequence interval or python slice object')
1682 if self.do_build:
1683 return nlmsa_utils.BuildMSASlice(self.lpoList[0],k.start,k.stop,-1,0,1,None)
1684 else: # QUERY THE ALIGNMENT
1685 l=nlmsa_utils.splitLPOintervals(self.lpoList,k) # MAP TO LPO(S)
1686 if len(l)>1:
1687 raise ValueError('Sorry! Query interval spans multiple LPOs!')
1688 for ns,myslice in l: # ONLY RETURN ONE SLICE OBJECT
1689 return NLMSASlice(ns,myslice.start,myslice.stop)
1691 def __iter__(self):
1692 raise NotImplementedError, 'you cannot iterate over NLMSAs'
1694 def edges(self, *args, **kwargs):
1695 return nlmsa_utils.generate_nlmsa_edges(self, *args, **kwargs)
1697 def __iadd__(self,seq):
1698 'add seq to our union'
1699 self.seqs.saveSeq(seq)
1700 return self # iadd MUST ALWAYS RETURN self!
1701 def addAnnotation(self,a):
1702 'save alignment of sequence interval --> an annotation object'
1703 ival = a.sequence # GET PURE SEQUENCE INTERVAL
1704 self.__iadd__(ival) # ADD SEQ AS A NODE IN OUR ALIGNMENT
1705 self[ival].__iadd__(a) # ADD ALIGNMENT BETWEEN ival AND ANNOTATION
1706 def add_aligned_intervals(self, alignedIvals):
1707 'add alignedIvals to this alignment'
1708 nlmsa_utils.add_aligned_intervals(self, alignedIvals)
1710 cdef void free_seqidmap(self,int nseq0,SeqIDMap *seqidmap):
1711 cdef int i
1712 for i from 0 <= i <nseq0: # DUMP STRING STORAGE FOR SEQUENCE IDENTIFIERS
1713 free(seqidmap[i].id)
1714 free(seqidmap) # WE CAN NOW FREE THE SEQUENCE LOOKUP ARRAY
1716 cdef void save_nbuild(self,int nbuild[]):
1717 cdef NLMSASequence ns
1718 for ns in self.seqlist: # SAVE INTERVAL COUNTS BACK TO EACH SEQUENCE
1719 if not ns.is_lpo or self.pairwiseMode==1:
1720 ns.nbuild=nbuild[ns.id] # SAVE INTERVAL COUNTS BACK TO REGULAR SEQUENCES
1721 #logger.debug('nbuild[%d] = %s' % (i, ns.nbuild))
1724 def readMAFfiles(self,mafFiles,maxint):
1725 'read alignment from a set of MAF files'
1726 cdef int i,j,nseq0,nseq1,n,nseq,block_len
1727 cdef SeqIDMap *seqidmap
1728 cdef char tmp[32768],*p,a_header[4]
1729 cdef FILE *ifile
1730 cdef IntervalMap im[4096],im_tmp
1731 cdef NLMSASequence ns_lpo,ns # ns IS OUR CURRENT UNION
1732 cdef FILE *build_ifile[4096]
1733 cdef int nbuild[4096],has_continuation
1734 cdef long long linecode_count[256]
1736 ns_lpo=self.seqlist[self.lpo_id] # OUR INITIAL LPO
1737 self.pairwiseMode=0 # WE ARE USING A REAL LPO!
1738 memset(<void *>linecode_count,0,sizeof(linecode_count))
1739 has_continuation=0
1741 nseq0=len(self.seqDict) # GET TOTAL #SEQUENCES IN ALL DATABASES
1742 seqidmap=<SeqIDMap *>calloc(nseq0,sizeof(SeqIDMap)) # ALLOCATE ARRAY
1744 for pythonStr,seqInfo in self.seqDict.seqInfoDict.iteritems():
1745 seqidmap[i].id=strdup(pythonStr)
1746 try:
1747 seqidmap[i].length = seqInfo.length
1748 except OverflowError:
1749 raise OverflowError('''Sequence too long for 32 bit int: %s, %d
1750 Something is probably wrong with creation / reading of this sequence.
1751 Check the input!''' % (pythonStr, seqInfo.length))
1752 i=i+1
1753 qsort(seqidmap,nseq0,sizeof(SeqIDMap),seqidmap_qsort_cmp) # SORT BY id
1754 ns=None
1756 im_tmp.sublist= -1 # DEFAULT
1757 strcpy(a_header,"a ") # MAKE C STRING
1758 for filename in mafFiles:
1759 logger.info('Processing MAF file: ' + filename)
1760 ifile=fopen(filename,'r') # text file
1761 if ifile==NULL:
1762 self.free_seqidmap(nseq0,seqidmap)
1763 self.save_nbuild(nbuild)
1764 raise IOError('unable to open file %s' % filename)
1765 if fgets(tmp,32767,ifile)==NULL or strncmp(tmp,"##maf",4): # HEADER LINE
1766 self.free_seqidmap(nseq0,seqidmap)
1767 self.save_nbuild(nbuild)
1768 raise IOError('%s: not a MAF file? Bad format.' % filename)
1769 p=fgets(tmp,32767,ifile) # READ 1ST DATA LINE OF THE MAF FILE
1770 while p: # GOT ANOTHER LINE TO PROCESS
1771 if has_continuation or 0==strncmp(tmp,a_header,2): # ALIGNMENT HEADER: READ ALIGNMENT
1772 n=readMAFrecord(im,0,seqidmap,nseq0,ns_lpo.length, # READ ONE MAF BLOCK
1773 &block_len,ifile,4096,linecode_count,&has_continuation)
1774 if n<0: # UNRECOVERABLE ERROR OCCURRED...
1775 self.free_seqidmap(nseq0,seqidmap)
1776 self.save_nbuild(nbuild)
1777 raise ValueError('MAF block too long! Increase max size')
1778 elif n==0:
1779 continue
1781 if self.maxlen-ns_lpo.length<=block_len \
1782 or ns_lpo.nbuild>maxint: # TOO BIG! MUST CREATE A NEW LPO
1783 j=ns_lpo.length # RECORD THE OLD OFFSET
1784 ns_lpo=self.newSequence() # CREATE A NEW LPO SEQUENCE
1785 for i from 0<= i < n: # TRANSLATE THESE INTERVALS BACK TO ZERO OFFSET
1786 if im[i].start>=0: # FORWARD INTERVAL
1787 im[i].start = im[i].start - j
1788 im[i].end = im[i].end - j
1789 else: # REVERSE INTERVAL
1790 im[i].start = im[i].start + j
1791 im[i].end = im[i].end + j
1793 for i from 0 <= i < n: # SAVE EACH INTERVAL IN UNION -> LPO MAP
1794 j=im[i].target_id
1795 if seqidmap[j].nlmsa_id<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1796 if ns is None or self.maxlen-ns.length<=seqidmap[j].length:
1797 ns=self.newSequence(None,is_union=1) # CREATE NEW UNION TO HOLD IT
1798 build_ifile[ns.id]=ns.build_ifile # KEEP PTR SO WE CAN WRITE DIRECTLY!
1799 nbuild[ns.id]=0
1800 seqidmap[j].ns_id=ns.id # SET IDs TO ADD THIS SEQ TO THE UNION
1801 seqidmap[j].nlmsa_id=self.inlmsa
1802 seqidmap[j].offset=ns.length
1803 self.inlmsa=self.inlmsa+1 # ADVANCE SEQUENCE ID COUNTER
1804 ns.length=ns.length+seqidmap[j].length # EXPAND UNION SIZE
1806 im[i].target_id=seqidmap[j].nlmsa_id # USE THE CORRECT ID
1807 if im[i].target_start<0: # OFFSET REVERSE ORI
1808 im_tmp.start= -seqidmap[j].offset+im[i].target_start
1809 im_tmp.end= -seqidmap[j].offset+im[i].target_end
1810 else: # OFFSET FORWARD ORI
1811 im_tmp.start= seqidmap[j].offset+im[i].target_start
1812 im_tmp.end= seqidmap[j].offset+im[i].target_end
1813 im_tmp.target_id=ns_lpo.id
1814 im_tmp.target_start=im[i].start
1815 im_tmp.target_end=im[i].end
1816 j=seqidmap[j].ns_id # USE NLMSA ID OF THE UNION
1817 ns_lpo.saveInterval(&im_tmp,1,0,build_ifile[j]) # SAVE SEQ -> LPO
1818 nbuild[j]=nbuild[j]+1
1820 ns_lpo.saveInterval(im,n,1,ns_lpo.build_ifile) # SAVE LPO -> SEQ
1821 ns_lpo.nbuild=ns_lpo.nbuild+n # INCREMENT COUNT OF SAVED INTERVALS
1822 if not has_continuation:
1823 p=fgets(tmp,32767,ifile) # TRY TO READ ANOTHER LINE...
1824 fclose(ifile) # CLOSE THIS MAF FILE
1825 #logger.debug('nbuild[0] = ' + ns_lpo.nbuild)
1826 for i from 0 <= i <256: # PRINT WARNINGS ABOUT NON-ALIGNMENT LINES
1827 if linecode_count[i]>0:
1828 logger.warn("Non-alignment text lines ignored: prefix %s, count %d" %
1829 (chr(i), linecode_count[i]))
1830 for i from 0 <= i <nseq0: # INDEX SEQUENCES THAT WERE ALIGNED
1831 if seqidmap[i].nlmsa_id>0: # ALIGNED, SO RECORD IT
1832 self.seqs.saveSeq(seqidmap[i].id,seqidmap[i].ns_id,seqidmap[i].offset,
1833 seqidmap[i].nlmsa_id)
1834 self.free_seqidmap(nseq0,seqidmap)
1835 self.save_nbuild(nbuild)
1836 self.build() # WILL TAKE CARE OF CLOSING ALL build_ifile STREAMS
1838 cdef NLMSASequence add_seqidmap_to_union(self,int j,SeqIDMap seqidmap[],
1839 NLMSASequence ns,FILE *build_ifile[],
1840 int nbuild[]):
1841 cdef NLMSASequence ns_lpo
1842 if ns is None or self.maxlen-ns.length<=seqidmap[j].length:
1843 ns = self.newSequence(None,is_union=1) # CREATE NEW UNION TO HOLD IT
1844 build_ifile[ns.id] = ns.build_ifile # KEEP PTR SO WE CAN WRITE DIRECTLY!
1845 nbuild[ns.id] = 0
1846 if self.pairwiseMode==1: # ALSO BIND INFO FOR VIRTUAL LPO FOR THIS UNION
1847 ns_lpo = self.seqs.seqlist[ns.id - 1]
1848 build_ifile[ns_lpo.id] = ns_lpo.build_ifile
1849 nbuild[ns_lpo.id] = 0
1850 seqidmap[j].ns_id = ns.id # SET IDs TO ADD THIS SEQ TO THE UNION
1851 seqidmap[j].nlmsa_id = self.inlmsa
1852 seqidmap[j].offset = ns.length
1853 self.inlmsa = self.inlmsa+1 # ADVANCE SEQUENCE ID COUNTER
1854 ns.length = ns.length+seqidmap[j].length # EXPAND UNION SIZE
1855 return ns
1857 def readAxtNet(self,axtFiles,bidirectionalRule):
1858 'read alignment from a set of axtnet files'
1859 cdef int i,j,nseq0,n,isrc,is_bidirectional
1860 cdef SeqIDMap *seqidmap
1861 cdef char tmp[32768],*p,comment[4],src_prefix[64],dest_prefix[64]
1862 cdef FILE *ifile
1863 cdef IntervalMap im[4096],im_tmp
1864 cdef NLMSASequence ns_src # SOURCE UNION VS DEST UNION
1865 cdef FILE *build_ifile[4096]
1866 cdef int nbuild[4096],has_continuation
1868 self.pairwiseMode = 1 # WE ARE USING pairwiseMode
1870 nseq0=len(self.seqDict) # GET TOTAL #SEQUENCES IN ALL DATABASES
1871 seqidmap=<SeqIDMap *>calloc(nseq0,sizeof(SeqIDMap)) # ALLOCATE ARRAY
1873 for pythonStr,seqInfo in self.seqDict.seqInfoDict.iteritems():
1874 seqidmap[i].id=strdup(pythonStr)
1875 seqidmap[i].length = seqInfo.length
1876 i=i+1
1877 qsort(seqidmap,nseq0,sizeof(SeqIDMap),seqidmap_qsort_cmp) # SORT BY id
1878 ns_src = None
1880 im_tmp.sublist= -1 # DEFAULT
1881 strcpy(comment,"#") # MAKE C STRING
1882 import string
1883 import os.path
1884 for filename in axtFiles:
1885 logger.info('Processing axtnet file: ' + filename)
1886 try:
1887 if filename[-8:] == '.net.axt':
1888 t = string.split(os.path.basename(filename)[:-8], '.')[-2:]
1889 elif filename[-4:] == '.axt':
1890 t = string.split(os.path.basename(filename)[:-4], '.')[-2:]
1891 except:
1892 raise IOError('%s is not correct axtNet file name. Correct name is (chrid.)source.target.net.axt.' % filename)
1893 #t = prefix_fun(filename) # CALL PYTHON FUNCTION TO OBTAIN PREFIXES
1894 if bidirectionalRule is None: # DETERMINE IF UNI- VS. BI-DIRECTIONAL
1895 is_bidirectional = self.is_bidirectional # JUST USE GLOBAL SETTING
1896 else: # GET SETTING FROM USER-SUPPLIED FUNCTION
1897 is_bidirectional = bidirectionalRule(t[0],t[1],self.is_bidirectional)
1898 strcpy(src_prefix,t[0]) # KEEP THEM IN STATIC C STRINGS FOR SPEED
1899 strcpy(dest_prefix,t[1])
1900 ifile=fopen(filename,'r') # text file
1901 if ifile==NULL:
1902 self.free_seqidmap(nseq0,seqidmap)
1903 self.save_nbuild(nbuild)
1904 raise IOError('unable to open file %s' % filename)
1905 while True:
1906 n = read_axtnet(im,seqidmap,nseq0,ifile,4096,&isrc,src_prefix,dest_prefix)
1907 if n<0: # UNRECOVERABLE ERROR OCCURRED...
1908 self.free_seqidmap(nseq0,seqidmap)
1909 self.save_nbuild(nbuild)
1910 raise ValueError('axtNet block too long! Increase max size')
1911 elif n==0: # NO MORE DATA TO READ
1912 break
1914 if seqidmap[isrc].nlmsa_id<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1915 ns_src = self.add_seqidmap_to_union(isrc,seqidmap,ns_src,build_ifile,nbuild)
1917 for i from 0 <= i < n: # SAVE EACH INTERVAL IN SRC -> DEST MAP
1918 j=im[i].target_id
1919 #logger.debug('A: %s %s %s %s %s %s' % (im[i].start, im[i].end,
1920 # im[i].target_id,
1921 # im[i].target_start,
1922 # im[i].target_end,
1923 # im_tmp.sublist))
1924 #logger.debug('B: %s %s %s %s %s %s %s' % (seqidmap[isrc].nlmsa_id, i,
1925 # j, seqidmap[j].id,
1926 # seqidmap[j].ns_id,
1927 # seqidmap[j].offset,
1928 # seqidmap[j].nlmsa_id))
1929 if seqidmap[j].nlmsa_id<=0: # NEW SEQUENCE, NEED TO ADD TO UNION
1930 ns_src = self.add_seqidmap_to_union(j,seqidmap,ns_src,build_ifile,nbuild)
1931 im[i].target_id = seqidmap[j].nlmsa_id # USE THE CORRECT ID
1932 if is_bidirectional: # SAVE DEST -> SRC ALIGNMENT MAPPING
1933 if im[i].target_start<0: # OFFSET REVERSE ORI
1934 im_tmp.start = -seqidmap[j].offset+im[i].target_start
1935 im_tmp.end = -seqidmap[j].offset+im[i].target_end
1936 else: # OFFSET FORWARD ORI
1937 im_tmp.start = seqidmap[j].offset+im[i].target_start
1938 im_tmp.end = seqidmap[j].offset+im[i].target_end
1939 im_tmp.target_id = seqidmap[isrc].nlmsa_id
1940 im_tmp.target_start = im[i].start
1941 im_tmp.target_end = im[i].end
1942 #logger.debug('C: %s %s %s %s %s' % (im_tmp.target_id,
1943 # im_tmp.target_start,
1944 # im_tmp.target_end,
1945 # seqidmap[j].ns_id, j))
1946 j=seqidmap[j].ns_id-1 # SAVE ALL ALIGNMENTS TO THE VIRTUAL LPO
1947 ns_src.saveInterval(&im_tmp,1,0,build_ifile[j]) # SAVE DEST -> SRC
1948 nbuild[j]=nbuild[j]+1
1949 if im[i].start < 0: # OFFSET FORWARD ORI
1950 im[i].start = -seqidmap[isrc].offset + im[i].start
1951 im[i].end = -seqidmap[isrc].offset + im[i].end
1952 else: # OFFSET FORWARD ORI
1953 im[i].start = seqidmap[isrc].offset + im[i].start
1954 im[i].end = seqidmap[isrc].offset + im[i].end
1955 #logger.debug('D: %s %s %s %s %s %s' % (im_tmp.start, im_tmp.end,
1956 # im_tmp.target_id,
1957 # im_tmp.target_start,
1958 # im_tmp.target_end,
1959 # im_tmp.sublist))
1961 # SAVE THE RECORD. read_axtnet FUNCTION READS SRC/DEST AT THE SAME TIME
1962 j = seqidmap[isrc].ns_id-1 # SAVE ALL ALIGNMENTS TO THE VIRTUAL LPO
1963 ns_src.saveInterval(im,n,0,build_ifile[j]) # SAVE SRC -> DEST
1964 nbuild[j]=nbuild[j]+n # INCREMENT COUNT OF SAVED INTERVALS
1966 fclose(ifile) # CLOSE THIS AXTNET FILE
1968 for i from 0 <= i <nseq0: # INDEX SEQUENCES THAT WERE ALIGNED
1969 if seqidmap[i].nlmsa_id>0: # ALIGNED, SO RECORD IT
1970 self.seqs.saveSeq(seqidmap[i].id,seqidmap[i].ns_id,seqidmap[i].offset,
1971 seqidmap[i].nlmsa_id)
1972 self.free_seqidmap(nseq0,seqidmap)
1973 self.save_nbuild(nbuild)
1974 self.build() # WILL TAKE CARE OF CLOSING ALL build_ifile STREAMS
1977 def buildFiles(self, saveSeqDict=False, **kwargs):
1978 'build nestedlist databases on-disk, and .seqDict index if desired'
1979 cdef NLMSASequence ns
1980 self.seqs.reopenReadOnly() # SAVE INDEXES AND OPEN READ-ONLY
1981 ntotal = 0
1982 ifile=file(self.pathstem+'.NLMSAindex','w') # text file
1983 try:
1984 for ns in self.seqlist: # BUILD EACH IntervalFileDB ONE BY ONE
1985 ntotal = ntotal + ns.buildFiles(**kwargs)
1986 if ns.is_lpo:
1987 ifile.write('%d\t%s\t%d\t%d\n' %(ns.id,'NLMSA_LPO_Internal',0,ns.length))
1988 elif ns.is_union:
1989 ifile.write('%d\t%s\t%d\t%d\n' %(ns.id,'NLMSA_UNION_Internal',1,ns.length))
1990 else:
1991 ifile.write('%d\t%s\t%d\t%d\n' %(ns.id,ns.name,0,ns.length))
1992 finally:
1993 ifile.close()
1994 if ntotal==0:
1995 raise nlmsa_utils.EmptyAlignmentError('empty alignment!')
1996 import sys,pickle
1997 ifile = file(self.pathstem+'.attrDict','wb') # pickle is binary file!
1998 try:
1999 pickle.dump(dict(is_bidirectional=self.is_bidirectional,
2000 pairwiseMode=self.pairwiseMode),ifile)
2001 finally:
2002 ifile.close()
2003 logger.info('Index files saved.')
2004 if saveSeqDict:
2005 self.save_seq_dict()
2006 else:
2007 logger.info('''Note: the NLMSA.seqDict was not saved to a file.
2008 This is not necessary if you intend to save the NLMSA to worldbase.
2009 But if you wish to open this NLMSA independently of worldbase,
2010 you should call NLMSA.save_seq_dict() to save the seqDict info to a file,
2011 or in the future pass the saveSeqDict=True option to NLMSA.build().''')
2013 def save_seq_dict(self):
2014 'save seqDict to a worldbase-aware pickle file'
2015 nlmsa_utils.save_seq_dict(self.pathstem,self.seqDict)
2017 def build(self,**kwargs):
2018 'build nestedlist databases from saved mappings and initialize for use'
2019 if self.do_build==0:
2020 raise ValueError('not opened in write mode')
2021 try: # TURN OFF AUTOMATIC ADDING OF SEQUENCES TO OUR SEQDICT...
2022 self.seqDict.addAll=False
2023 except AttributeError: # THAT WAS PURELY OPTIONAL...
2024 pass
2025 if self.in_memory_mode:
2026 ntotal = 0
2027 for ns in self.seqlist: # BUILD EACH IntervalDB ONE BY ONE
2028 ntotal = ntotal + ns.buildInMemory(**kwargs)
2029 if ntotal==0:
2030 raise nlmsa_utils.EmptyAlignmentError('empty alignment!')
2031 else:
2032 self.buildFiles(**kwargs)
2033 self.do_build=0
2035 def seqInterval(self,int iseq,int istart,int istop):
2036 'get specified interval in the target sequence'
2037 seq=self.seqlist.getSeq(iseq) # JUST THE SEQ OBJECT
2038 return sequence.relativeSlice(seq,istart,istop)
2039 def __invert__(self):
2040 if self.inverseDB is not None: # use the specified inverseDB
2041 return self.inverseDB
2042 elif self.is_bidirectional: # provides mapping both directions
2043 return self
2044 else:
2045 raise ValueError('this mapping is not invertible')
2050 def dump_textfile(pathstem, outfilename=None):
2051 'dump NLMSA binary files to a text file'
2052 cdef int n,nlmsaID,nsID,offset,is_bidirectional,pairwiseMode,nprefix
2053 cdef FILE *outfile
2054 cdef char err_msg[2048],tmp[2048],seqDictID[256]
2055 err_msg[0] = 0 # ENSURE STRING IS EMPTY
2056 if outfilename is None:
2057 outfilename = pathstem+'.txt' # DEFAULT TEXTFILE NAME
2058 import classutil,pickle,sys # NEED TO COPY THE WHOLE seqIDdict
2059 seqIDdict = classutil.open_shelve(pathstem+'.seqIDdict','r')
2060 n = len(seqIDdict)
2061 seqDict = nlmsa_utils.read_seq_dict(pathstem)
2062 try: # OBTAIN PREFIX INFO FOR SEQDICT
2063 prefixDict = seqDict.prefixDict
2064 nprefix = len(prefixDict)
2065 strcpy(seqDictID,"None")
2066 except AttributeError: # NO PREFIXUNION. TRY TO GET ID OF seqDict
2067 nprefix = 0
2068 prefixDict = {}
2069 try:
2070 strcpy(seqDictID,seqDict._persistent_id)
2071 except AttributeError:
2072 strcpy(seqDictID,"unknown")
2073 logger.info('''Warning: Because your seqDict has no worldbase ID, there
2074 is no host-independent way to save it to a textfile for transfer
2075 to another machine. Therefore, when loading this textfile
2076 on the destination machine, you will have to provide the
2077 seqDict argument to textfile_to_binaries() on the destination machine.''')
2078 try:
2079 ifile = file(pathstem+'.attrDict', 'rb') # pickle is binary file!
2080 d = pickle.load(ifile)
2081 ifile.close()
2082 except IOError:
2083 d = {}
2084 is_bidirectional = d.get('is_bidirectional',-1)
2085 pairwiseMode = d.get('pairwiseMode',-1)
2086 outfile=fopen(outfilename,"w") # text file
2087 import os.path
2088 basestem=os.path.basename(pathstem) # GET RID OF PATH INFO
2089 strcpy(tmp,basestem) # COPY TO C STRING SO WE CAN fprintf
2090 if outfile==NULL:
2091 raise IOError('unable to open file %s' %outfilename)
2092 try:
2093 if fprintf(outfile,"PATHSTEM\t%s\t%d\t%d\t%d\t%d\t%s\n",tmp,n,
2094 is_bidirectional,pairwiseMode,nprefix,seqDictID)<0:
2095 raise IOError('error writing to file %s' %outfilename)
2096 pleaseWarn = True
2097 for id,d in prefixDict.items(): # SAVE seqDict PREFIX ENTRIES
2098 strcpy(tmp,id) # CONVERT TO C DATA TYPES FOR fprintf
2099 try:
2100 strcpy(seqDictID,d._persistent_id) # try to get worldbase ID
2101 except AttributeError:
2102 strcpy(seqDictID,"None")
2103 if pleaseWarn:
2104 pleaseWarn = False
2105 logger.info('''Warning: Because one or more of the sequence
2106 databases in the seqDict have no worldbase ID, there is no
2107 host-independent way to save it to a textfile for transfer
2108 to another machine. Therefore, when loading this textfile
2109 on the destination machine, you will have to provide a dictionary
2110 for these sequence database(s) as the prefixDict argument
2111 to textfile_to_binaries() on the destination machine.''')
2112 if fprintf(outfile,"PREFIXUNION\t%s\t%s\n",tmp,seqDictID)<0:
2113 raise IOError('error writing to file %s' %outfilename)
2114 for id,t in seqIDdict.iteritems(): # SAVE seqIDdict
2115 strcpy(tmp,id) # CONVERT TO C DATA TYPES FOR fprintf
2116 nlmsaID=t[0]
2117 nsID=t[1]
2118 offset=t[2]
2119 if fprintf(outfile,"SEQID\t%s\t%d\t%d\t%d\n",tmp,
2120 nlmsaID,nsID,offset)<0:
2121 raise IOError('error writing to file %s' %outfilename)
2122 try:
2123 ifile = file(pathstem+'.NLMSAindex', 'rU') # text file
2124 except IOError:
2125 ifile = file(pathstem+'NLMSAindex', 'rU')
2126 except:
2127 fclose(outfile)
2128 raise
2129 try:
2130 for line in ifile: # NOW SAVE THE NLMSA DATA
2131 id,name,is_union,length=line.strip().split('\t')
2132 strcpy(tmp,line) # COPY TO C STRING SO WE CAN fprintf
2133 if fprintf(outfile,"NLMSASequence\t%s",tmp)<0:
2134 raise IOError('error writing file %s'%outfilename)
2135 mypath=pathstem+id
2136 mybase=basestem+id
2137 if save_text_file(mypath,mybase,err_msg,outfile)!=0:
2138 raise IOError(err_msg)
2139 finally:
2140 fclose(outfile)
2141 ifile.close()
2144 def textfile_to_binaries(filename,seqDict=None,prefixDict=None,buildpath=''):
2145 'convert pathstem.txt textfile to NLMSA binary files'
2146 cdef int i,n,nlmsaID,nsID,offset,is_bidirectional,pairwiseMode,nprefix
2147 cdef FILE *infile
2148 cdef char err_msg[2048],line[32768],tmp[2048],basestem[2048],seqDictID[2048]
2149 if seqDict is not None:
2150 ignorePrefix = True
2151 else:
2152 ignorePrefix = False
2153 err_msg[0]=0 # ENSURE STRING IS EMPTY
2154 infile=fopen(filename,"r") # text file
2155 if infile==NULL:
2156 raise IOError('unable to open file %s' %filename)
2157 try:
2158 if fgets(line,32767,infile)==NULL:
2159 raise IOError('error or EOF reading %s'%filename)
2160 is_bidirectional = -1 # INVALID INITIAL SETTING
2161 pairwiseMode = -1
2162 nprefix = 0
2163 strcpy(tmp,"None")
2164 if 2>sscanf(line,"PATHSTEM\t%s\t%d\t%d\t%d\t%d\t%s",basestem,&n,
2165 &is_bidirectional,&pairwiseMode,&nprefix,tmp):
2166 raise IOError('bad format in %s'%filename)
2167 if buildpath!='': # USER-SPECIFIED PATH FOR BINARIES
2168 import os
2169 buildpath1 = os.path.join(buildpath,basestem) # CONSTRUCT FILE PATH
2170 strcpy(basestem,buildpath1) # COPY BACK TO C STRING USABLE IN C FUNCTIONS
2171 else: # JUST USE PATH IN CURRENT DIRECTORY
2172 buildpath1 = basestem
2173 if 0==strcmp(tmp,"unknown"):
2174 if seqDict is None:
2175 raise ValueError('You must provide a seqDict for this NLMSA!')
2176 elif 0!=strcmp(tmp,"None"): # try obtaining as worldbase ID
2177 from pygr import worldbase
2178 seqDict = worldbase(tmp)
2179 import classutil,pickle # CREATE THE seqIDdict
2180 seqIDdict = classutil.open_shelve(basestem+'.seqIDdict','n')
2181 IDdict = classutil.open_shelve(basestem+'.idDict','n')
2182 d = {}
2183 if is_bidirectional != -1:
2184 d['is_bidirectional'] = is_bidirectional
2185 if pairwiseMode != -1:
2186 d['pairwiseMode'] = pairwiseMode
2187 ifile = file(basestem+'.attrDict',"wb") # pickle is binary file!
2188 try:
2189 pickle.dump(d,ifile)
2190 finally:
2191 ifile.close()
2192 if prefixDict is None or ignorePrefix:
2193 prefixDict = {}
2194 missing = []
2195 for i from 0 <= i < nprefix: # READ seqDICT PREFIX ENTRIES
2196 if fgets(line,32767,infile)==NULL:
2197 raise IOError('error or EOF reading %s'%filename)
2198 if 2!=sscanf(line,"PREFIXUNION\t%s\t%s",tmp,seqDictID):
2199 raise IOError('bad format in %s'%filename)
2200 if ignorePrefix: # JUST IGNORE THE PREFIX INFO WE READ
2201 continue
2202 if 0==strcmp(seqDictID,"None"):
2203 if tmp not in prefixDict:
2204 missing.append(tmp) # MISSING A SEQDICT DICTIONARY ENTRY!
2205 else: # load it from worldbase
2206 from pygr import worldbase
2207 prefixDict[tmp] = worldbase(seqDictID)
2208 if len(missing)>0:
2209 raise KeyError('''You must supply sequence database(s) for the
2210 following prefixes, by passing them in the prefixDict optional
2211 dictionary argument: %s''' % missing)
2212 if len(prefixDict)>0: # CREATE A PREFIX UNION
2213 import seqdb
2214 seqDict = seqdb.PrefixUnionDict(prefixDict)
2215 nlmsa_utils.save_seq_dict(basestem,seqDict) # SAVE SEQDICT
2216 for i from 0 <= i <n: # seqIDDict READING
2217 if fgets(line,32767,infile)==NULL:
2218 raise IOError('error or EOF reading %s'%filename)
2219 if 4!=sscanf(line,"SEQID\t%s\t%d %d %d",tmp,
2220 &nlmsaID,&nsID,&offset):
2221 raise IOError('bad format in %s'%filename)
2222 seqIDdict[tmp]=(nlmsaID,nsID,offset) # SAVE THIS ENTRY
2223 IDdict[str(nlmsaID)]=(tmp,nsID)
2224 seqIDdict.close() # DONE WRITING THE seqIDdict
2225 IDdict.close() # DONE WRITING THE seqIDdict
2227 NLMSAindexText = ''
2228 if buildpath!='': # USER-SPECIFIED PATH FOR BINARIES
2229 import os
2230 buildpath2 = os.path.join(buildpath,'') # ENSURE THIS ENDS IN DIRECTORY SEPARATOR
2231 strcpy(basestem,buildpath2) # COPY BACK TO C STRING USABLE IN C FUNCTIONS
2232 else:
2233 strcpy(basestem,'') # JUST USE BLANK STRING TO SAVE IN CURRENT DIRECTORY
2234 import sys
2235 while fgets(line,32767,infile)!=NULL:
2236 s=line # CONVERT STRING TO PYTHON OBJECT
2237 if not s.startswith('NLMSASequence'):
2238 raise IOError('bad format in file %s'%filename)
2239 NLMSAindexText = NLMSAindexText + s[14:] # JUST SAVE THE DATA FIELDS
2240 logger.info('Saving NLMSA binary index: ' + s[14:] + '...')
2241 if text_file_to_binaries(infile,basestem,err_msg)<0:
2242 raise IOError(err_msg)
2243 ifile = file(buildpath1+'.NLMSAindex',"w") # text file
2244 ifile.write(NLMSAindexText) # LAST, WRITE TOP INDEX FILE
2245 ifile.close()
2246 finally:
2247 fclose(infile)
2248 return buildpath1 # ACTUAL PATH TO NLMSA INDEX FILESET