added SQLTable pickle test
[pygr.git] / pygr / annotation.py
blobab3b2e4891901d131874ee7c132eb5eaa7235d64
1 from __future__ import generators
2 from sequence import *
3 import classutil
4 import UserDict
5 import weakref
def getAnnotationAttr(self, attr):
    """Forward an attribute request to this annotation's sliceDB record.

    The record is looked up as ``self.db.sliceDB[self.id]`` and the actual
    attribute resolution is delegated to ``self.db.getSliceAttr``.
    """
    annot_db = self.db
    slice_info = annot_db.sliceDB[self.id]
    return annot_db.getSliceAttr(slice_info, attr)
def annotation_repr(self):
    """Return the repr string used by annotation objects and their slices.

    The string is ``<title><id>[start:stop]`` where title is
    ``self.annotationType`` or 'annot' when that is None.  When
    ``self.orientation`` is not positive the string is prefixed with '-'
    and shows the negated coordinates in swapped order.
    """
    title = self.annotationType
    if title is None:
        title = 'annot'
    if self.orientation > 0:
        return '%s%s[%d:%d]' % (title, self.id, self.start, self.stop)
    return '-%s%s[%d:%d]' % (title, self.id, -self.stop, -self.start)
class AnnotationSeqDescr(object):
    """Descriptor returning the parent-sequence interval of an annotation."""

    def __get__(self, obj, objtype):
        # The end coordinate is the stored start offset plus obj.stop.
        begin = obj._anno_start
        end = begin + obj.stop
        return absoluteSlice(obj._anno_seq, begin, end)
class AnnotationSliceDescr(object):
    """Descriptor returning the sequence interval matching an annotation slice.

    The interval is taken relative to ``obj.pathForward.sequence`` using the
    slice's own start and stop.
    """

    def __get__(self, obj, objtype):
        whole = obj.pathForward.sequence
        return relativeSlice(whole, obj.start, obj.stop)
class AnnotationSeqtypeDescr(object):
    """Descriptor reporting the seqtype of the annotated parent sequence."""

    def __get__(self, obj, objtype):
        parent = obj._anno_seq
        return parent.seqtype()
class AnnotationSeq(SeqPath):
    """Base class representing an annotation on a parent sequence."""

    # Class-level defaults; instances only override ``stop`` in __init__.
    start = 0
    step = 1
    orientation = 1

    def __init__(self, id, db, parent, start, stop):
        """Store the annotation ID, its owning db and its parent interval.

        ``stop`` is stored as the interval length (stop - start); the
        original start offset is kept in ``_anno_start``.
        """
        self.id = id
        self.db = db
        self.stop = stop - start
        self._anno_seq = parent
        self._anno_start = start
        self.path = self

    # Attributes not found normally are forwarded to the sliceDB record.
    __getattr__ = getAnnotationAttr
    sequence = AnnotationSeqDescr()
    annotationType = classutil.DBAttributeDescr('annotationType')
    _seqtype = AnnotationSeqtypeDescr()
    __repr__ = annotation_repr

    def __cmp__(self, other):
        """Compare with another annotation.

        Returns -1 for anything that is not an AnnotationSeq, the
        (start, stop) comparison when both objects have equal sequences,
        equal IDs and the very same db, and NOT_ON_SAME_PATH otherwise.
        """
        if not isinstance(other, AnnotationSeq):
            return -1
        same_seq = cmp(self.sequence, other.sequence) == 0
        if same_seq and self.id == other.id and self.db is other.db:
            return cmp((self.start, self.stop), (other.start, other.stop))
        return NOT_ON_SAME_PATH

    def strslice(self, start, stop):
        """Always raise ValueError: an annotation has no sequence string."""
        raise ValueError('''this is an annotation, and you cannot get a sequence string from it.
Use its sequence attribute to get a sequence object representing this interval.''')
class AnnotationSlice(SeqDBSlice):
    """Represents a subslice of an annotation."""

    # Attributes not found normally are forwarded to the sliceDB record.
    __getattr__ = getAnnotationAttr
    # Sequence interval corresponding to this subslice.
    sequence = AnnotationSliceDescr()
    annotationType = classutil.DBAttributeDescr('annotationType')
    __repr__ = annotation_repr
class TranslationAnnotSeqDescr(object):
    """Descriptor returning the parent-sequence interval of a translation.

    Uses the stored ``_anno_start`` / ``_anno_stop`` coordinates directly.
    """

    def __get__(self, obj, objtype):
        begin, end = obj._anno_start, obj._anno_stop
        return absoluteSlice(obj._anno_seq, begin, end)
class TranslationAnnotFrameDescr(object):
    """Get the frame of this protein translation, relative to original DNA.

    Positive orientation gives 1..3 computed from ``start % 3``; otherwise
    the result is -1..-3 computed from ``(start + 1) % 3``.
    """

    def __get__(self, obj, objtype):
        dna = obj.pathForward.sequence
        if dna.orientation > 0:
            return (dna.start % 3) + 1
        return -((dna.start + 1) % 3 + 1)
class TranslationAnnot(AnnotationSeq):
    """Annotation representing aa translation of a given nucleotide interval."""

    def __init__(self, id, db, parent, start, stop):
        """Initialise as an AnnotationSeq, then rescale length to residues.

        The nucleotide length set by the base class is divided by 3, and the
        original nucleotide ``stop`` is kept in ``_anno_stop``.
        """
        AnnotationSeq.__init__(self, id, db, parent, start, stop)
        # Explicit floor division: the length must stay an integer even when
        # true division is in effect (Python 3 / __future__ division).
        self.stop //= 3
        self._anno_stop = stop

    sequence = TranslationAnnotSeqDescr()
    frame = TranslationAnnotFrameDescr()
    _seqtype = PROTEIN_SEQTYPE

    def strslice(self, start, stop):
        """Get the aa translation of our associated ORF, sliced [start:stop].

        The full translation is computed once with translate_orf() and
        cached on the instance as ``_translation``.
        """
        try:
            aa = self._translation
        except AttributeError:
            aa = self._translation = translate_orf(str(self.sequence))
        return aa[start:stop]
class TranslationAnnotSliceDescr(object):
    """Descriptor returning the nucleotide interval for a translation slice.

    The slice's start and stop are multiplied by 3 before slicing
    ``obj.pathForward.sequence``.
    """

    def __get__(self, obj, objtype):
        dna = obj.pathForward.sequence
        nt_start = 3 * obj.start
        nt_stop = 3 * obj.stop
        return relativeSlice(dna, nt_start, nt_stop)
class TranslationAnnotSlice(AnnotationSlice):
    """Subslice of a translation annotation."""

    # Nucleotide interval for this slice (coordinates scaled by 3).
    sequence = TranslationAnnotSliceDescr()
    # Reading frame relative to the original DNA.
    frame = TranslationAnnotFrameDescr()
class AnnotationDB(object, UserDict.DictMixin):
    """Container of annotations as specific slices of db sequences.

    Maps annotation IDs to annotation objects that are built on demand from
    the records in ``sliceDB`` and the sequences in ``seqDB``.  Built objects
    are kept in the ``_weakValueDict`` cache.  Direct item assignment and the
    dict methods that copy, merge or delete entries raise; use
    new_annotation() or add_homology() to add data.
    """

    def __init__(self, sliceDB, seqDB, annotationType=None,
                 itemClass=AnnotationSeq,
                 itemSliceClass=AnnotationSlice,
                 itemAttrDict=None,  # GET RID OF THIS BACKWARDS-COMPATIBILITY KLUGE!!
                 sliceAttrDict=None, maxCache=None, autoGC=True,
                 checkFirstID=True, **kwargs):
        '''sliceDB must map identifier to a sliceInfo object;
        sliceInfo must have attributes: id, start, stop, orientation;
        seqDB must map sequence ID to a sliceable sequence object;
        sliceAttrDict gives optional dict of item attributes that
        should be mapped to sliceDB item attributes.
        maxCache specifies the maximum number of annotation objects to keep
        in the cache.
        If sliceDB is None, one is obtained from
        classutil.get_shelve_or_dict(**kwargs).
        itemAttrDict is accepted but not used by this constructor.
        If checkFirstID is true, the first annotation (if any) is built as a
        sanity check; a KeyError is raised if that fails with KeyError.'''
        if autoGC:  # automatically garbage collect unused objects
            self._weakValueDict = classutil.RecentValueDictionary(autoGC)
        else:
            self._weakValueDict = {}  # object cache
        self.autoGC = autoGC
        if sliceAttrDict is None:
            sliceAttrDict = {}
        if sliceDB is not None:
            self.sliceDB = sliceDB
        else:  # NEED TO CREATE / OPEN A DATABASE FOR THE USER
            self.sliceDB = classutil.get_shelve_or_dict(**kwargs)
        self.seqDB = seqDB
        self.annotationType = annotationType
        self.itemClass = itemClass
        self.itemSliceClass = itemSliceClass
        self.sliceAttrDict = sliceAttrDict  # USER-PROVIDED ALIASES
        if maxCache is not None:
            # Left unset otherwise; sliceAnnotation() relies on the resulting
            # AttributeError to mean "no cache limit".
            self.maxCache = maxCache
        if checkFirstID:
            try:  # don't cache anything now; schema may change itemClass!
                k = iter(self).next()  # get the first ID if any
                self.get_annot_obj(k, self.sliceDB[k])  # valid annotation?
            except KeyError:  # a convenient warning to the user...
                raise KeyError('''\
cannot create annotation object %s; sequence database %s may not be correct'''
                               % (k, repr(seqDB),))
            except StopIteration:
                pass  # dataset is empty so there is nothing we can check...

    ############### PICKLING METHODS
    __getstate__ = classutil.standard_getstate
    __setstate__ = classutil.standard_setstate
    _pickleAttrs = dict(sliceDB=0, seqDB=0, annotationType=0, autoGC=0,
                        itemClass=0, itemSliceClass=0, sliceAttrDict=0,
                        maxCache=0)

    def __hash__(self):  # @CTB unnecessary??
        'ALLOW THIS OBJECT TO BE USED AS A KEY IN DICTS...'
        return id(self)

    def __getitem__(self, k):
        'get annotation object by its ID, from the cache when possible'
        try:  # GET FROM OUR CACHE
            return self._weakValueDict[k]
        except KeyError:
            pass
        return self.sliceAnnotation(k, self.sliceDB[k])

    def __setitem__(self, k, v):
        'always raises KeyError: annotations must be added via new_annotation'
        raise KeyError('''you cannot save annotations directly using annoDB[k] = v
Instead, use annoDB.new_annotation(k,sliceInfo) where sliceInfo provides
a sequence ID, start, stop (and any additional info desired), and will be
saved directly to the sliceDB.''')

    def getSliceAttr(self, sliceInfo, attr):
        '''get attr from sliceInfo, honoring aliases in sliceAttrDict.
        An alias may be another attribute name, or an int index into a
        tuple-like sliceInfo.'''
        try:
            k = self.sliceAttrDict[attr]  # USE ALIAS IF PROVIDED
        except KeyError:
            return getattr(sliceInfo, attr)  # GET ATTRIBUTE AS USUAL
        try:  # REMAP TO ANOTHER ATTRIBUTE NAME
            return getattr(sliceInfo, k)
        except TypeError:  # TREAT AS int INDEX INTO A TUPLE
            return sliceInfo[k]

    def get_annot_obj(self, k, sliceInfo):
        '''create an annotation object based on the input sliceInfo.
        Raises IndexError if the resulting interval has start >= stop.'''
        start = int(self.getSliceAttr(sliceInfo, 'start'))
        stop = int(self.getSliceAttr(sliceInfo, 'stop'))

        try:
            orientation = self.getSliceAttr(sliceInfo, 'orientation')
            orientation = int(orientation)
            if orientation < 0 and start >= 0:
                start, stop = (-stop, -start)  # NEGATIVE ORIENTATION COORDINATES
        except (AttributeError, IndexError):
            pass  # ok if no orientation is specified.

        if start >= stop:
            raise IndexError('annotation %s has zero or negative length [%s:%s]!'
                             % (k, start, stop))
        seq_id = self.getSliceAttr(sliceInfo, 'id')
        seq = self.seqDB[seq_id]
        return self.itemClass(k, self, seq, start, stop)

    def sliceAnnotation(self, k, sliceInfo, limitCache=True):
        '''create annotation and cache it.
        When limitCache is true and the cache holds more than maxCache
        entries, the whole cache is cleared first.'''
        a = self.get_annot_obj(k, sliceInfo)
        try:  # APPLY CACHE SIZE LIMIT IF ANY
            if limitCache and self.maxCache < len(self._weakValueDict):
                self._weakValueDict.clear()
        except AttributeError:  # no maxCache attribute: no limit
            pass
        self._weakValueDict[k] = a  # CACHE THIS IN OUR DICT
        return a

    def new_annotation(self, k, sliceInfo):
        'save sliceInfo to the annotation database and return annotation object'
        a = self.sliceAnnotation(k, sliceInfo)  # 1st CHECK IT GIVES A VALID ANNOTATION
        try:
            self.sliceDB[k] = sliceInfo  # NOW SAVE IT TO THE SLICE DATABASE
        except:  # undo the caching, then re-raise whatever went wrong
            try:
                del self._weakValueDict[k]  # DELETE FROM CACHE
            except Exception:
                pass  # best effort; the original error is what matters
            raise
        self._wroteSliceDB = True
        return a

    def foreignKey(self, attr, k):
        'iterate over items matching specified foreign key'
        for t in self.sliceDB.foreignKey(attr, k):
            try:  # get from cache if exists
                yield self._weakValueDict[t.id]
            except KeyError:
                yield self.sliceAnnotation(t.id, t)

    def __contains__(self, k):
        return k in self.sliceDB

    def __len__(self):
        return len(self.sliceDB)

    ########## ITERATORS
    def __iter__(self):
        return iter(self.sliceDB)

    def keys(self):
        return self.sliceDB.keys()

    def iteritems(self):
        'uses maxCache to manage caching of annotation objects'
        for k, sliceInfo in self.sliceDB.iteritems():
            yield k, self.sliceAnnotation(k, sliceInfo)

    def itervalues(self):
        'uses maxCache to manage caching of annotation objects'
        for k, v in self.iteritems():
            yield v

    def items(self):
        'forces load of all annotation objects into cache'
        return [(k, self.sliceAnnotation(k, sliceInfo, limitCache=False))
                for (k, sliceInfo) in self.sliceDB.items()]

    def values(self):
        'forces load of all annotation objects into cache'
        return [self.sliceAnnotation(k, sliceInfo, limitCache=False)
                for (k, sliceInfo) in self.sliceDB.items()]

    def add_homology(self, seq, search, id=None, idFormat='%s_%d',
                     autoIncrement=False, maxAnnot=999999,
                     maxLoss=None, sliceInfo=None, **kwargs):
        '''find homology in our seq db and add as annotations.
        search may be a callable or the name of a seqDB method; seq may be a
        sequence object or a string.  Each hit is saved as a tuple
        (id, start, stop) plus the optional sliceInfo tuple.
        Returns the list of new annotation objects; raises ValueError if
        there are more than maxAnnot hits.'''
        try:  # ENSURE THAT sliceAttrDict COMPATIBLE WITH OUR TUPLE FORMAT
            if self.sliceAttrDict['id'] != 0:
                raise KeyError
        except KeyError:
            # Must update the instance's alias dict; a bare name here would
            # be an undefined local and raise NameError.
            self.sliceAttrDict['id'] = 0  # USE TUPLE AS OUR INTERNAL STANDARD FORMAT
            self.sliceAttrDict['start'] = 1
            self.sliceAttrDict['stop'] = 2
        if autoIncrement:
            id = len(self.sliceDB)
        elif id is None:
            id = seq.id
        if isinstance(search, str):  # GET SEARCH METHOD
            search = getattr(self.seqDB, search)
        if isinstance(seq, str):  # CREATE A SEQ OBJECT
            seq = Sequence(seq, str(id))
        al = search(seq, **kwargs)  # RUN THE HOMOLOGY SEARCH
        if maxLoss is not None:  # REQUIRE HIT BE AT LEAST A CERTAIN LENGTH
            kwargs['minAlignSize'] = len(seq) - maxLoss
        hits = al[seq].keys(**kwargs)  # OBTAIN LIST OF HIT INTERVALS
        if len(hits) > maxAnnot:
            raise ValueError('too many hits for %s: %d' % (id, len(hits)))
        out = []
        i = 0
        k = id
        for ival in hits:  # CREATE ANNOTATION FOR EACH HIT
            if len(hits) > 1:  # NEED TO CREATE AN ID FOR EACH HIT
                if autoIncrement:
                    k = len(self.sliceDB)
                else:
                    k = idFormat % (id, i)
                i += 1
            if sliceInfo is not None:  # SAVE SLICE AS TUPLE WITH INFO
                a = self.new_annotation(k, (ival.id, ival.start, ival.stop) + sliceInfo)
            else:
                a = self.new_annotation(k, (ival.id, ival.start, ival.stop))
            out.append(a)  # RETURN THE ANNOTATION
        return out

    def close(self):
        'if sliceDB needs to be closed, do it and return True, otherwise False'
        try:
            if self._wroteSliceDB:
                self.sliceDB.close()
                self._wroteSliceDB = False  # DISK FILE IS UP TO DATE
                return True
        except AttributeError:  # never written to, or sliceDB has no close()
            pass
        return False

    def __del__(self):
        'close sliceDB if the user forgot to, and warn on stderr'
        if self.close():
            import sys
            # sys.stderr.write works on both Python 2 and 3, unlike the
            # "print >>" statement form.
            sys.stderr.write('''
WARNING: you forgot to call AnnotationDB.close() after writing
new annotation data to it. This could result in failure to properly
store the data in the associated disk file. To avoid this, we
have automatically called AnnotationDB.sliceDB.close() to write the data
for you, when the AnnotationDB was deleted.''' + '\n')

    def clear_cache(self):
        'empty the cache'
        self._weakValueDict.clear()

    # not clear what this should do for AnnotationDB
    def copy(self):
        raise NotImplementedError("nonsensical in AnnotationDB")

    def setdefault(self, k, d=None):
        raise NotImplementedError("nonsensical in AnnotationDB")

    def update(self, other):
        raise NotImplementedError("nonsensical in AnnotationDB")

    # these methods should not be implemented for read-only database.
    def clear(self):
        raise NotImplementedError("no deletions allowed")

    def pop(self, *args):
        # Accept the usual dict.pop(key[, default]) arguments so callers get
        # NotImplementedError rather than a TypeError about the signature.
        raise NotImplementedError("no deletions allowed")

    def popitem(self):
        raise NotImplementedError("no deletions allowed")
class AnnotationServer(AnnotationDB):
    """XMLRPC-ready server for AnnotationDB.

    Failures are reported to the client as the empty string rather than by
    raising, since '' is used here as the XMLRPC-acceptable failure code.
    """
    xmlrpc_methods = {'get_slice_tuple': 0, 'get_slice_items': 0,
                      'get_annotation_attr': 0, 'keys': 0,
                      '__len__': 0, '__contains__': 0}

    def get_slice_tuple(self, k):
        '''get (seqID,start,stop) for a given key, or '' if k is unknown.
        For negative orientation with non-negative start, the coordinates
        are returned negated and swapped.'''
        try:
            sliceInfo = self.sliceDB[k]
        except KeyError:
            return ''  # XMLRPC-acceptable failure code
        start = int(self.getSliceAttr(sliceInfo, 'start'))
        stop = int(self.getSliceAttr(sliceInfo, 'stop'))
        try:
            if int(self.getSliceAttr(sliceInfo, 'orientation')) < 0 and start >= 0:
                start, stop = (-stop, -start)  # NEGATIVE ORIENTATION COORDINATES
        except (AttributeError, IndexError):
            # Same tolerance as AnnotationDB.get_annot_obj: a tuple-style
            # sliceInfo without an orientation slot raises IndexError.
            pass  # ok if no orientation is specified.
        return (self.getSliceAttr(sliceInfo, 'id'), start, stop)

    def get_slice_items(self):
        'get all (key,tuple) pairs in one query'
        return [(k, self.get_slice_tuple(k)) for k in self.sliceDB]

    def get_annotation_attr(self, k, attr):
        '''get the requested attribute of the requested key;
        returns '' if the key or the attribute does not exist.'''
        try:
            sliceInfo = self.sliceDB[k]
        except KeyError:
            return ''
        try:
            return self.getSliceAttr(sliceInfo, attr)
        except AttributeError:
            return ''
class AnnotationClientSliceDB(dict):
    """Read-only sliceDB proxy that queries the server behind ``db``.

    Slice tuples fetched one at a time through __getitem__ are remembered in
    the underlying dict; every other query goes to ``db.server`` each time.
    """

    def __init__(self, db):
        self.db = db
        dict.__init__(self)

    def __getitem__(self, k):
        """Return the slice tuple for k, asking the server on a local miss."""
        try:
            return dict.__getitem__(self, k)
        except KeyError:
            fetched = self.db.server.get_slice_tuple(k)
            if fetched == '':  # server's failure code
                raise KeyError('no such annotation: ' + str(k))
            dict.__setitem__(self, k, fetched)
            return fetched

    def __setitem__(self, k, v):
        raise ValueError('XMLRPC client is read-only')

    def keys(self):
        return self.db.server.keys()

    def __iter__(self):
        return iter(self.keys())

    def items(self):
        return self.db.server.get_slice_items()

    def iteritems(self):
        return iter(self.items())

    def __len__(self):
        return self.db.server.__len__()

    def __contains__(self, k):
        return self.db.server.__contains__(k)
class AnnotationClient(AnnotationDB):
    """XMLRPC AnnotationDB client."""

    def __init__(self, url, name, seqDB, itemClass=AnnotationSeq,
                 itemSliceClass=AnnotationSlice, autoGC=True, **kwargs):
        """Connect to the server at (url, name) and set up a proxy sliceDB.

        The annotation cache is a classutil.RecentValueDictionary when
        autoGC is true, otherwise a plain dict.
        """
        if autoGC:  # automatically garbage collect unused objects
            cache = classutil.RecentValueDictionary(autoGC)
        else:
            cache = {}  # object cache
        self._weakValueDict = cache
        self.autoGC = autoGC
        import coordinator
        self.server = coordinator.get_connection(url, name)
        self.url = url
        self.name = name
        self.seqDB = seqDB
        self.sliceDB = AnnotationClientSliceDB(self)
        self.itemClass = itemClass
        self.itemSliceClass = itemSliceClass

    def __getstate__(self):
        """Pickle only what is needed to reconnect: url, name, seqDB, autoGC."""
        state = dict(url=self.url, name=self.name)
        state['seqDB'] = self.seqDB
        state['autoGC'] = self.autoGC
        return state

    def getSliceAttr(self, sliceInfo, attr):
        """Get attr from a (seqID, start, stop) slice tuple or the server.

        'id', 'start' and 'stop' are read from the tuple; 'orientation'
        always raises AttributeError; anything else is requested from the
        server, and an empty-string reply raises AttributeError.
        """
        for position, field in enumerate(('id', 'start', 'stop')):
            if attr == field:
                return sliceInfo[position]
        if attr == 'orientation':
            raise AttributeError('ori not saved')
        # NOTE(review): sliceInfo[0] is what getSliceAttr returns for 'id';
        # the server method looks its first argument up in sliceDB as a key
        # -- confirm these are meant to be the same identifier.
        v = self.server.get_annotation_attr(sliceInfo[0], attr)
        if v == '':
            raise AttributeError('this annotation has no attr: ' + attr)
        return v