added test of len() method for SQLTable
[pygr.git] / tests / annotation_dm2_megatest.py
blob455a95ab2ffdfc70c02420ee27eb5faea5c7e269
1 import unittest
2 from testlib import testutil, PygrTestProgram
4 import ConfigParser, sys, os, string
5 from pygr.mapping import Collection
6 import pygr.Data
8 try:
9 import hashlib
10 except ImportError:
11 import md5 as hashlib
13 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
14 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
15 msaDir = config.get('megatests_dm2', 'msaDir')
16 seqDir = config.get('megatests_dm2', 'seqDir')
17 smallSampleKey = config.get('megatests_dm2', 'smallSampleKey')
18 testInputDB = config.get('megatests', 'testInputDB')
19 testInputDir = config.get('megatests', 'testInputDir')
20 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
22 if smallSampleKey:
23 smallSamplePostfix = '_' + smallSampleKey
24 else:
25 smallSamplePostfix = ''
27 ## msaDir CONTAINS PRE-BUILT NLMSA
28 ## seqDir CONTAINS GENOME ASSEMBLIES AND THEIR SEQDB FILES
29 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
30 ## exonAnnotFileName = 'Annotation_ConservedElement_Exons_dm2.txt'
31 ## intronAnnotFileName = 'Annotation_ConservedElement_Introns_dm2.txt'
32 ## stopAnnotFileName = 'Annotation_ConservedElement_Stop_dm2.txt'
33 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
35 # DIRECTIONARY FOR DOC STRING OF SEQDB
36 docStringDict = {
37 'anoGam1':'A. gambiae Genome (February 2003)',
38 'apiMel2':'A. mellifera Genome (January 2005)',
39 'dm2':'D. melanogaster Genome (April 2004)',
40 'dp4':'D. pseudoobscura Genome (February 2006)',
41 'droAna3':'D. ananassae Genome (February 2006)',
42 'droEre2':'D. erecta Genome (February 2006)',
43 'droGri2':'D. grimshawi Genome (February 2006)',
44 'droMoj3':'D. mojavensis Genome (February 2006)',
45 'droPer1':'D. persimilis Genome (October 2005)',
46 'droSec1':'D. sechellia Genome (October 2005)',
47 'droSim1':'D. simulans Genome (April 2005)',
48 'droVir3':'D. virilis Genome (February 2006)',
49 'droWil1':'D. willistoni Genome (February 2006)',
50 'droYak2':'D. yakuba Genome (November 2005)',
51 'triCas2':'T. castaneum Genome (September 2005)',
54 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
55 msaSpeciesList = ['anoGam1', 'apiMel2', 'dm2', 'dp4', 'droAna3', 'droEre2', 'droGri2', 'droMoj3', \
56 'droPer1', 'droSec1', 'droSim1', 'droVir3', 'droWil1', 'droYak2', 'triCas2']
58 class PygrBuildNLMSAMegabase(unittest.TestCase):
59 'restrict megatest to an initially empty directory, need large space to perform'
60 def setUp(self, testDir = None):
61 import random
62 tmpList = [c for c in 'PygrBuildNLMSAMegabase']
63 random.shuffle(tmpList)
64 testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
65 if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
66 try:
67 os.mkdir(testDir)
68 testDir = os.path.realpath(testDir)
69 except:
70 raise IOError
71 self.path = testDir
72 try:
73 tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
74 open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
75 except:
76 raise IOError
77 pygr.Data.update(self.path)
78 from pygr import seqdb
79 for orgstr in msaSpeciesList:
80 genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
81 genome.__doc__ = docStringDict[orgstr]
82 pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
83 pygr.Data.save()
84 def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
85 newname = os.path.join(self.path, os.path.basename(filename))
86 open(newname, 'w').write(open(filename, 'r').read())
87 return newname
88 def tearDown(self):
89 'delete the temporary directory and files, restore pygr.Data path'
90 for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
91 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
92 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
93 for filename in files:
94 os.remove(os.path.join(dirpath, filename))
95 os.rmdir(dirpath)
96 # Restore original pygr.Data path to remedy lack of isolation
97 # between tests from the same run
98 pygr.Data.update(None)
101 class Build_Test(PygrBuildNLMSAMegabase):
102 def test_seqdb(self):
103 'Check pygr.Data contents'
104 l = pygr.Data.dir('TEST')
105 preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
106 assert l == preList
107 def test_collectionannot(self):
108 'Test building an AnnotationDB from file'
109 from pygr import seqdb, cnestedlist, sqlgraph
110 dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
111 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
112 exon_slices = Collection(filename = os.path.join(self.path, 'refGene_exonAnnot_dm2.cdb'), \
113 intKeys = True, mode = 'cr', writeback = False)
114 exon_db = seqdb.AnnotationDB(exon_slices, dm2,
115 sliceAttrDict = dict(id = 0, exon_id = 1, orientation = 2,
116 gene_id = 3, start = 4, stop = 5))
117 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_dm2'), 'w', \
118 pairwiseMode = True, bidirectional = False)
119 for lines in open(os.path.join(testInputDir, 'refGene_exonAnnot%s_dm2.txt' % smallSamplePostfix), 'r').xreadlines():
120 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
121 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
122 exon_slices[row[1]] = row
123 exon = exon_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
124 msa.addAnnotation(exon) # SAVE IT TO GENOME MAPPING
125 exon_db.clear_cache() # not really necessary; cache should autoGC
126 exon_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
127 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
128 exon_db.__doc__ = 'Exon Annotation Database for dm2'
129 pygr.Data.addResource('TEST.Annotation.dm2.exons', exon_db)
130 msa.__doc__ = 'NLMSA Exon for dm2'
131 pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.exons', msa)
132 exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db, bindAttrs = ('exon1',))
133 exon_schema.__doc__ = 'Exon Schema for dm2'
134 pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.exons', exon_schema)
135 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
136 splice_slices = Collection(filename = os.path.join(self.path, 'refGene_spliceAnnot_dm2.cdb'), \
137 intKeys = True, mode = 'cr', writeback = False)
138 splice_db = seqdb.AnnotationDB(splice_slices, dm2,
139 sliceAttrDict = dict(id = 0, splice_id = 1, orientation = 2,
140 gene_id = 3, start = 4, stop = 5))
141 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_dm2'), 'w', \
142 pairwiseMode = True, bidirectional = False)
143 for lines in open(os.path.join(testInputDir, 'refGene_spliceAnnot%s_dm2.txt' % smallSamplePostfix), 'r').xreadlines():
144 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
145 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
146 splice_slices[row[1]] = row
147 splice = splice_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
148 msa.addAnnotation(splice) # SAVE IT TO GENOME MAPPING
149 splice_db.clear_cache() # not really necessary; cache should autoGC
150 splice_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
151 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
152 splice_db.__doc__ = 'Splice Annotation Database for dm2'
153 pygr.Data.addResource('TEST.Annotation.dm2.splices', splice_db)
154 msa.__doc__ = 'NLMSA Splice for dm2'
155 pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.splices', msa)
156 splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db, bindAttrs = ('splice1',))
157 splice_schema.__doc__ = 'Splice Schema for dm2'
158 pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.splices', splice_schema)
159 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
160 ucsc_slices = Collection(filename = os.path.join(self.path, 'phastConsElements15way_dm2.cdb'), \
161 intKeys = True, mode = 'cr', writeback = False)
162 ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
163 sliceAttrDict = dict(id = 0, ucsc_id = 1, orientation = 2,
164 gene_id = 3, start = 4, stop = 5))
165 msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements15way_dm2'), 'w', \
166 pairwiseMode = True, bidirectional = False)
167 for lines in open(os.path.join(testInputDir, 'phastConsElements15way%s_dm2.txt' % smallSamplePostfix), 'r').xreadlines():
168 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
169 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
170 ucsc_slices[row[1]] = row
171 ucsc = ucsc_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
172 msa.addAnnotation(ucsc) # SAVE IT TO GENOME MAPPING
173 ucsc_db.clear_cache() # not really necessary; cache should autoGC
174 ucsc_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
175 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
176 ucsc_db.__doc__ = 'Most Conserved Elements for dm2'
177 pygr.Data.addResource('TEST.Annotation.UCSC.dm2.mostconserved', ucsc_db)
178 msa.__doc__ = 'NLMSA for Most Conserved Elements for dm2'
179 pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved', msa)
180 ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db, bindAttrs = ('element1',))
181 ucsc_schema.__doc__ = 'Schema for UCSC Most Conserved Elements for dm2'
182 pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved', ucsc_schema)
183 pygr.Data.save()
184 pygr.Data.clear_cache() # force resources to reload when requested
186 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
187 dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
188 exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.exons')
189 splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.splices')
190 conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved')
191 exons = pygr.Data.getResource('TEST.Annotation.dm2.exons')
192 splices = pygr.Data.getResource('TEST.Annotation.dm2.splices')
193 mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.dm2.mostconserved')
195 # OPEN DM2_MULTIZ15WAY NLMSA
196 msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r', trypath = [seqDir])
198 exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
199 intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix)
200 newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
201 newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
202 tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
203 tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
205 if smallSampleKey:
206 chrList = [ smallSampleKey ]
207 else:
208 chrList = dm2.seqLenDict.keys()
209 chrList.sort()
211 outfile = open(newexonAnnotFileName, 'w')
212 for chrid in chrList:
213 slice = dm2[chrid]
214 try:
215 ex1 = exonmsa[slice]
216 except KeyError:
217 continue
218 else:
219 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
220 exlist1.sort()
221 for ixx, exon in exlist1:
222 saveList = []
223 tmp = exon.sequence
224 tmpexon = exons[exon.exon_id]
225 tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
226 wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
227 try:
228 out1 = conservedmsa[tmp]
229 except KeyError:
230 pass
231 else:
232 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
233 elementlist.sort()
234 for iyy, element in elementlist:
235 if element.stop - element.start < 100: continue
236 score = int(string.split(element.gene_id, '=')[1])
237 if score < 100: continue
238 tmp2 = element.sequence
239 tmpelement = mostconserved[element.ucsc_id]
240 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
241 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
242 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
243 tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
244 edges = msa[tmp1].edges()
245 for src, dest, e in edges:
246 if src.stop - src.start < 100: continue
247 palign, pident = e.pAligned(), e.pIdentity()
248 if palign < 0.8 or pident < 0.8: continue
249 palign, pident = '%.2f' % palign, '%.2f' % pident
250 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
251 (~msa.seqDict)[dest], \
252 str(dest), dest.start, dest.stop, palign, pident)
253 saveList.append('\t'.join(map(str, wlist3)) + '\n')
254 saveList.sort()
255 for saveline in saveList:
256 outfile.write(saveline)
257 outfile.close()
258 md5old = hashlib.md5()
259 md5old.update(open(tmpexonAnnotFileName, 'r').read())
260 md5new = hashlib.md5()
261 md5new.update(open(newexonAnnotFileName, 'r').read())
262 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
264 outfile = open(newintronAnnotFileName, 'w')
265 for chrid in chrList:
266 slice = dm2[chrid]
267 try:
268 sp1 = splicemsa[slice]
269 except:
270 continue
271 else:
272 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
273 splist1.sort()
274 for ixx, splice in splist1:
275 saveList = []
276 tmp = splice.sequence
277 tmpsplice = splices[splice.splice_id]
278 tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
279 wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
280 try:
281 out1 = conservedmsa[tmp]
282 except KeyError:
283 pass
284 else:
285 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
286 elementlist.sort()
287 for iyy, element in elementlist:
288 if element.stop - element.start < 100: continue
289 score = int(string.split(element.gene_id, '=')[1])
290 if score < 100: continue
291 tmp2 = element.sequence
292 tmpelement = mostconserved[element.ucsc_id]
293 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
294 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
295 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
296 tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
297 edges = msa[tmp1].edges()
298 for src, dest, e in edges:
299 if src.stop - src.start < 100: continue
300 palign, pident = e.pAligned(), e.pIdentity()
301 if palign < 0.8 or pident < 0.8: continue
302 palign, pident = '%.2f' % palign, '%.2f' % pident
303 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
304 (~msa.seqDict)[dest], \
305 str(dest), dest.start, dest.stop, palign, pident)
306 saveList.append('\t'.join(map(str, wlist3)) + '\n')
307 saveList.sort()
308 for saveline in saveList:
309 outfile.write(saveline)
310 outfile.close()
311 md5old = hashlib.md5()
312 md5old.update(open(tmpintronAnnotFileName, 'r').read())
313 md5new = hashlib.md5()
314 md5new.update(open(newintronAnnotFileName, 'r').read())
315 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
317 def test_mysqlannot(self):
318 'Test building an AnnotationDB from MySQL'
319 from pygr import seqdb, cnestedlist, sqlgraph
320 dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
321 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
322 exon_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_exonAnnot%s_dm2' % ( testInputDB, smallSamplePostfix ),
323 clusterKey = 'chromosome', maxCache = 0)
324 exon_db = seqdb.AnnotationDB(exon_slices, dm2, sliceAttrDict = dict(id = 'chromosome', \
325 gene_id = 'name', exon_id = 'exon_id'))
326 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_SQL_dm2'), 'w', \
327 pairwiseMode = True, bidirectional = False)
328 for id in exon_db:
329 msa.addAnnotation(exon_db[id])
330 exon_db.clear_cache() # not really necessary; cache should autoGC
331 exon_slices.clear_cache()
332 msa.build()
333 exon_db.__doc__ = 'SQL Exon Annotation Database for dm2'
334 pygr.Data.addResource('TEST.Annotation.SQL.dm2.exons', exon_db)
335 msa.__doc__ = 'SQL NLMSA Exon for dm2'
336 pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.exons', msa)
337 exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db, bindAttrs = ('exon2',))
338 exon_schema.__doc__ = 'SQL Exon Schema for dm2'
339 pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.exons', exon_schema)
340 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
341 splice_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_spliceAnnot%s_dm2' % ( testInputDB, smallSamplePostfix ),
342 clusterKey = 'chromosome', maxCache = 0)
343 splice_db = seqdb.AnnotationDB(splice_slices, dm2, sliceAttrDict = dict(id = 'chromosome', \
344 gene_id = 'name', splice_id = 'splice_id'))
345 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_SQL_dm2'), 'w', \
346 pairwiseMode = True, bidirectional = False)
347 for id in splice_db:
348 msa.addAnnotation(splice_db[id])
349 splice_db.clear_cache() # not really necessary; cache should autoGC
350 splice_slices.clear_cache()
351 msa.build()
352 splice_db.__doc__ = 'SQL Splice Annotation Database for dm2'
353 pygr.Data.addResource('TEST.Annotation.SQL.dm2.splices', splice_db)
354 msa.__doc__ = 'SQL NLMSA Splice for dm2'
355 pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.splices', msa)
356 splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db, bindAttrs = ('splice2',))
357 splice_schema.__doc__ = 'SQL Splice Schema for dm2'
358 pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.splices', splice_schema)
359 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC: MYSQL VERSION
360 ucsc_slices = sqlgraph.SQLTableClustered('%s.pygr_phastConsElements15way%s_dm2' % ( testInputDB, smallSamplePostfix ),
361 clusterKey = 'chromosome', maxCache = 0)
362 ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2, sliceAttrDict = dict(id = 'chromosome', \
363 gene_id = 'name', ucsc_id = 'ucsc_id'))
364 msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements15way_SQL_dm2'), 'w', \
365 pairwiseMode = True, bidirectional = False)
366 for id in ucsc_db:
367 msa.addAnnotation(ucsc_db[id])
368 ucsc_db.clear_cache() # not really necessary; cache should autoGC
369 ucsc_slices.clear_cache()
370 msa.build()
371 ucsc_db.__doc__ = 'SQL Most Conserved Elements for dm2'
372 pygr.Data.addResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved', ucsc_db)
373 msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for dm2'
374 pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', msa)
375 ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db, bindAttrs = ('element2',))
376 ucsc_schema.__doc__ = 'SQL Schema for UCSC Most Conserved Elements for dm2'
377 pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', ucsc_schema)
378 pygr.Data.save()
379 pygr.Data.clear_cache()
381 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
382 dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
383 exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.exons')
384 splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.splices')
385 conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved')
386 exons = pygr.Data.getResource('TEST.Annotation.SQL.dm2.exons')
387 splices = pygr.Data.getResource('TEST.Annotation.SQL.dm2.splices')
388 mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved')
390 # OPEN DM2_MULTIZ15WAY NLMSA
391 msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r', trypath = [seqDir])
393 exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
394 intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix)
395 newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
396 newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
397 tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
398 tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
400 if smallSampleKey:
401 chrList = [ smallSampleKey ]
402 else:
403 chrList = dm2.seqLenDict.keys()
404 chrList.sort()
406 outfile = open(newexonAnnotFileName, 'w')
407 for chrid in chrList:
408 slice = dm2[chrid]
409 try:
410 ex1 = exonmsa[slice]
411 except KeyError:
412 continue
413 else:
414 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
415 exlist1.sort()
416 for ixx, exon in exlist1:
417 saveList = []
418 tmp = exon.sequence
419 tmpexon = exons[exon.exon_id]
420 tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
421 wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
422 try:
423 out1 = conservedmsa[tmp]
424 except KeyError:
425 pass
426 else:
427 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
428 elementlist.sort()
429 for iyy, element in elementlist:
430 if element.stop - element.start < 100: continue
431 score = int(string.split(element.gene_id, '=')[1])
432 if score < 100: continue
433 tmp2 = element.sequence
434 tmpelement = mostconserved[element.ucsc_id]
435 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
436 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
437 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
438 tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
439 edges = msa[tmp1].edges()
440 for src, dest, e in edges:
441 if src.stop - src.start < 100: continue
442 palign, pident = e.pAligned(), e.pIdentity()
443 if palign < 0.8 or pident < 0.8: continue
444 palign, pident = '%.2f' % palign, '%.2f' % pident
445 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
446 (~msa.seqDict)[dest], \
447 str(dest), dest.start, dest.stop, palign, pident)
448 saveList.append('\t'.join(map(str, wlist3)) + '\n')
449 saveList.sort()
450 for saveline in saveList:
451 outfile.write(saveline)
452 outfile.close()
453 md5old = hashlib.md5()
454 md5old.update(open(tmpexonAnnotFileName, 'r').read())
455 md5new = hashlib.md5()
456 md5new.update(open(newexonAnnotFileName, 'r').read())
457 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
459 outfile = open(newintronAnnotFileName, 'w')
460 for chrid in chrList:
461 slice = dm2[chrid]
462 try:
463 sp1 = splicemsa[slice]
464 except:
465 continue
466 else:
467 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
468 splist1.sort()
469 for ixx, splice in splist1:
470 saveList = []
471 tmp = splice.sequence
472 tmpsplice = splices[splice.splice_id]
473 tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
474 wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
475 try:
476 out1 = conservedmsa[tmp]
477 except KeyError:
478 pass
479 else:
480 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
481 elementlist.sort()
482 for iyy, element in elementlist:
483 if element.stop - element.start < 100: continue
484 score = int(string.split(element.gene_id, '=')[1])
485 if score < 100: continue
486 tmp2 = element.sequence
487 tmpelement = mostconserved[element.ucsc_id]
488 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
489 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
490 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
491 tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
492 edges = msa[tmp1].edges()
493 for src, dest, e in edges:
494 if src.stop - src.start < 100: continue
495 palign, pident = e.pAligned(), e.pIdentity()
496 if palign < 0.8 or pident < 0.8: continue
497 palign, pident = '%.2f' % palign, '%.2f' % pident
498 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
499 (~msa.seqDict)[dest], \
500 str(dest), dest.start, dest.stop, palign, pident)
501 saveList.append('\t'.join(map(str, wlist3)) + '\n')
502 saveList.sort()
503 for saveline in saveList:
504 outfile.write(saveline)
505 outfile.close()
506 md5old = hashlib.md5()
507 md5old.update(open(tmpintronAnnotFileName, 'r').read())
508 md5new = hashlib.md5()
509 md5new.update(open(newintronAnnotFileName, 'r').read())
510 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
513 if __name__ == '__main__':
514 PygrTestProgram(verbosity=2)