Merge commit 'remotes/ctb/xmlrpc_patches' into tryme
[pygr.git] / tests / annotation_hg18_megatest.py
blob4230e4d1300198bf8e3c7d4c5cc62f2318c33b00
1 import unittest
2 from testlib import testutil, PygrTestProgram
4 import ConfigParser, sys, os, string
5 from pygr.mapping import Collection
6 import pygr.Data
8 try:
9 import hashlib
10 except ImportError:
11 import md5 as hashlib
13 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
14 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
15 msaDir = config.get('megatests_hg18', 'msaDir')
16 seqDir = config.get('megatests_hg18', 'seqDir')
17 smallSampleKey = config.get('megatests_hg18', 'smallSampleKey')
18 testInputDB = config.get('megatests', 'testInputDB')
19 testInputDir = config.get('megatests', 'testInputDir')
20 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
22 if smallSampleKey:
23 smallSamplePostfix = '_' + smallSampleKey
24 else:
25 smallSamplePostfix = ''
27 ## msaDir CONTAINS PRE-BUILT NLMSA
28 ## seqDir CONTAINS GENOME ASSEMBLIES AND THEIR SEQDB FILES
29 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
30 ## exonAnnotFileName = 'Annotation_ConservedElement_Exons_hg18.txt'
31 ## intronAnnotFileName = 'Annotation_ConservedElement_Introns_hg18.txt'
32 ## stopAnnotFileName = 'Annotation_ConservedElement_Stop_hg18.txt'
33 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
35 # DIRECTIONARY FOR DOC STRING OF SEQDB
36 docStringDict = {
37 'anoCar1':' Lizard Genome (January 2007)',
38 'bosTau3':'Cow Genome (August 2006)',
39 'canFam2':'Dog Genome (May 2005)',
40 'cavPor2':'Guinea Pig (October 2005)',
41 'danRer4':'Zebrafish Genome (March 2006)',
42 'dasNov1':'Armadillo Genome (May 2005)',
43 'echTel1':'Tenrec Genome (July 2005)',
44 'eriEur1':'European Hedgehog (Junuary 2006)',
45 'equCab1':'Horse Genome (January 2007)',
46 'felCat3':'Cat Genome (March 2006)',
47 'fr2':'Fugu Genome (October 2004)',
48 'galGal3':'Chicken Genome (May 2006)',
49 'gasAcu1':'Stickleback Genome (February 2006)',
50 'hg18':'Human Genome (May 2006)',
51 'loxAfr1':'Elephant Genome (May 2005)',
52 'mm8':'Mouse Genome (March 2006)',
53 'monDom4':'Opossum Genome (January 2006)',
54 'ornAna1':'Platypus Genome (March 2007)',
55 'oryCun1':'Rabbit Genome (May 2005)',
56 'oryLat1':'Medaka Genome (April 2006)',
57 'otoGar1':'Bushbaby Genome (December 2006)',
58 'panTro2':'Chimpanzee Genome (March 2006)',
59 'rheMac2':'Rhesus Genome (January 2006)',
60 'rn4':'Rat Genome (November 2004)',
61 'sorAra1':'Shrew (Junuary 2006)',
62 'tetNig1':'Tetraodon Genome (February 2004)',
63 'tupBel1':'Tree Shrew (December 2006)',
64 'xenTro2':'X. tropicalis Genome (August 2005)'
67 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
68 msaSpeciesList = ['anoCar1', 'bosTau3', 'canFam2', 'cavPor2', 'danRer4', 'dasNov1', 'echTel1', \
69 'equCab1', 'eriEur1', 'felCat3', 'fr2', 'galGal3', 'gasAcu1', 'hg18', 'loxAfr1', \
70 'mm8', 'monDom4', 'ornAna1', 'oryCun1', 'oryLat1', 'otoGar1', 'panTro2', 'rheMac2', \
71 'rn4', 'sorAra1', 'tetNig1', 'tupBel1', 'xenTro2']
73 class PygrBuildNLMSAMegabase(unittest.TestCase):
74 'restrict megatest to an initially empty directory, need large space to perform'
75 def setUp(self, testDir = None):
76 import random
77 tmpList = [c for c in 'PygrBuildNLMSAMegabase']
78 random.shuffle(tmpList)
79 testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
80 if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
81 try:
82 os.mkdir(testDir)
83 testDir = os.path.realpath(testDir)
84 except:
85 raise IOError
86 self.path = testDir
87 try:
88 tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
89 open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
90 except:
91 raise IOError
92 pygr.Data.update(self.path)
93 from pygr import seqdb
94 for orgstr in msaSpeciesList:
95 genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
96 genome.__doc__ = docStringDict[orgstr]
97 pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
98 pygr.Data.save()
99 def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
100 newname = os.path.join(self.path, os.path.basename(filename))
101 open(newname, 'w').write(open(filename, 'r').read())
102 return newname
103 def tearDown(self):
104 'delete the temporary directory and files, restore pygr.Data path'
105 for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
106 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
107 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
108 for filename in files:
109 os.remove(os.path.join(dirpath, filename))
110 os.rmdir(dirpath)
111 # Restore original pygr.Data path to remedy lack of isolation
112 # between tests from the same run
113 pygr.Data.update(None)
116 class Build_Test(PygrBuildNLMSAMegabase):
117 def test_seqdb(self):
118 'Check pygr.Data contents'
119 l = pygr.Data.dir('TEST')
120 preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
121 assert l == preList
122 def test_collectionannot(self):
123 'Test building an AnnotationDB from file'
124 from pygr import seqdb, cnestedlist, sqlgraph
125 hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
126 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
127 exon_slices = Collection(filename = os.path.join(self.path, 'refGene_exonAnnot_hg18.cdb'), \
128 intKeys = True, mode = 'cr', writeback = False)
129 exon_db = seqdb.AnnotationDB(exon_slices, hg18,
130 sliceAttrDict = dict(id = 0, exon_id = 1, orientation = 2,
131 gene_id = 3, start = 4, stop = 5))
132 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_hg18'), 'w', \
133 pairwiseMode = True, bidirectional = False)
134 for lines in open(os.path.join(testInputDir, 'refGene_exonAnnot%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
135 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
136 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
137 exon_slices[row[1]] = row
138 exon = exon_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
139 msa.addAnnotation(exon) # SAVE IT TO GENOME MAPPING
140 exon_db.clear_cache() # not really necessary; cache should autoGC
141 exon_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
142 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
143 exon_db.__doc__ = 'Exon Annotation Database for hg18'
144 pygr.Data.addResource('TEST.Annotation.hg18.exons', exon_db)
145 msa.__doc__ = 'NLMSA Exon for hg18'
146 pygr.Data.addResource('TEST.Annotation.NLMSA.hg18.exons', msa)
147 exon_schema = pygr.Data.ManyToManyRelation(hg18, exon_db, bindAttrs = ('exon1',))
148 exon_schema.__doc__ = 'Exon Schema for hg18'
149 pygr.Data.addSchema('TEST.Annotation.NLMSA.hg18.exons', exon_schema)
150 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
151 splice_slices = Collection(filename = os.path.join(self.path, 'refGene_spliceAnnot_hg18.cdb'), \
152 intKeys = True, mode = 'cr', writeback = False)
153 splice_db = seqdb.AnnotationDB(splice_slices, hg18,
154 sliceAttrDict = dict(id = 0, splice_id = 1, orientation = 2,
155 gene_id = 3, start = 4, stop = 5))
156 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_hg18'), 'w', \
157 pairwiseMode = True, bidirectional = False)
158 for lines in open(os.path.join(testInputDir, 'refGene_spliceAnnot%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
159 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
160 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
161 splice_slices[row[1]] = row
162 splice = splice_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
163 msa.addAnnotation(splice) # SAVE IT TO GENOME MAPPING
164 splice_db.clear_cache() # not really necessary; cache should autoGC
165 splice_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
166 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
167 splice_db.__doc__ = 'Splice Annotation Database for hg18'
168 pygr.Data.addResource('TEST.Annotation.hg18.splices', splice_db)
169 msa.__doc__ = 'NLMSA Splice for hg18'
170 pygr.Data.addResource('TEST.Annotation.NLMSA.hg18.splices', msa)
171 splice_schema = pygr.Data.ManyToManyRelation(hg18, splice_db, bindAttrs = ('splice1',))
172 splice_schema.__doc__ = 'Splice Schema for hg18'
173 pygr.Data.addSchema('TEST.Annotation.NLMSA.hg18.splices', splice_schema)
174 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
175 cds_slices = Collection(filename = os.path.join(self.path, 'refGene_cdsAnnot_hg18.cdb'), \
176 intKeys = True, mode = 'cr', writeback = False)
177 cds_db = seqdb.AnnotationDB(cds_slices, hg18,
178 sliceAttrDict = dict(id = 0, cds_id = 1, orientation = 2,
179 gene_id = 3, start = 4, stop = 5))
180 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_cdsAnnot_hg18'), 'w', \
181 pairwiseMode = True, bidirectional = False)
182 for lines in open(os.path.join(testInputDir, 'refGene_cdsAnnot%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
183 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
184 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
185 cds_slices[row[1]] = row
186 cds = cds_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
187 msa.addAnnotation(cds) # SAVE IT TO GENOME MAPPING
188 cds_db.clear_cache() # not really necessary; cache should autoGC
189 cds_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
190 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
191 cds_db.__doc__ = 'CDS Annotation Database for hg18'
192 pygr.Data.addResource('TEST.Annotation.hg18.cdss', cds_db)
193 msa.__doc__ = 'NLMSA CDS for hg18'
194 pygr.Data.addResource('TEST.Annotation.NLMSA.hg18.cdss', msa)
195 cds_schema = pygr.Data.ManyToManyRelation(hg18, cds_db, bindAttrs = ('cds1',))
196 cds_schema.__doc__ = 'CDS Schema for hg18'
197 pygr.Data.addSchema('TEST.Annotation.NLMSA.hg18.cdss', cds_schema)
198 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
199 ucsc_slices = Collection(filename = os.path.join(self.path, 'phastConsElements28way_hg18.cdb'), \
200 intKeys = True, mode = 'cr', writeback = False)
201 ucsc_db = seqdb.AnnotationDB(ucsc_slices, hg18,
202 sliceAttrDict = dict(id = 0, ucsc_id = 1, orientation = 2,
203 gene_id = 3, start = 4, stop = 5))
204 msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements28way_hg18'), 'w', \
205 pairwiseMode = True, bidirectional = False)
206 for lines in open(os.path.join(testInputDir, 'phastConsElements28way%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
207 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
208 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
209 ucsc_slices[row[1]] = row
210 ucsc = ucsc_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
211 msa.addAnnotation(ucsc) # SAVE IT TO GENOME MAPPING
212 ucsc_db.clear_cache() # not really necessary; cache should autoGC
213 ucsc_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
214 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
215 ucsc_db.__doc__ = 'Most Conserved Elements for hg18'
216 pygr.Data.addResource('TEST.Annotation.UCSC.hg18.mostconserved', ucsc_db)
217 msa.__doc__ = 'NLMSA for Most Conserved Elements for hg18'
218 pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved', msa)
219 ucsc_schema = pygr.Data.ManyToManyRelation(hg18, ucsc_db, bindAttrs = ('element1',))
220 ucsc_schema.__doc__ = 'Schema for UCSC Most Conserved Elements for hg18'
221 pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved', ucsc_schema)
222 # BUILD ANNOTATION DATABASE FOR SNP126 FROM UCSC
223 snp_slices = Collection(filename = os.path.join(self.path, 'snp126_hg18.cdb'), \
224 intKeys = True, protocol = 2, mode = 'cr', writeback = False)
225 snp_db = seqdb.AnnotationDB(snp_slices, hg18,
226 sliceAttrDict = dict(id = 0, snp_id = 1, orientation = 2, gene_id = 3, start = 4,
227 stop = 5, score = 6, ref_NCBI = 7, ref_UCSC = 8, observed = 9,
228 molType = 10, myClass = 11, myValid = 12, avHet = 13, avHetSE = 14,
229 myFunc = 15, locType = 16, myWeight = 17))
230 msa = cnestedlist.NLMSA(os.path.join(self.path, 'snp126_hg18'), 'w', \
231 pairwiseMode = True, bidirectional = False)
232 for lines in open(os.path.join(testInputDir, 'snp126%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
233 row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
234 row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
235 snp_slices[row[1]] = row
236 snp = snp_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
237 msa.addAnnotation(snp) # SAVE IT TO GENOME MAPPING
238 snp_db.clear_cache() # not really necessary; cache should autoGC
239 snp_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
240 msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
241 snp_db.__doc__ = 'SNP126 for hg18'
242 pygr.Data.addResource('TEST.Annotation.UCSC.hg18.snp126', snp_db)
243 msa.__doc__ = 'NLMSA for SNP126 for hg18'
244 pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.hg18.snp126', msa)
245 snp_schema = pygr.Data.ManyToManyRelation(hg18, snp_db, bindAttrs = ('snp1',))
246 snp_schema.__doc__ = 'Schema for UCSC SNP126 for hg18'
247 pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.hg18.snp126', snp_schema)
248 pygr.Data.save()
249 pygr.Data.clear_cache()
251 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
252 hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
253 exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.hg18.exons')
254 splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.hg18.splices')
255 conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved')
256 snpmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.hg18.snp126')
257 cdsmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.hg18.cdss')
258 exons = pygr.Data.getResource('TEST.Annotation.hg18.exons')
259 splices = pygr.Data.getResource('TEST.Annotation.hg18.splices')
260 mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.hg18.mostconserved')
261 snp126 = pygr.Data.getResource('TEST.Annotation.UCSC.hg18.snp126')
262 cdss = pygr.Data.getResource('TEST.Annotation.hg18.cdss')
264 # OPEN hg18_MULTIZ28WAY NLMSA
265 msa = cnestedlist.NLMSA(os.path.join(msaDir, 'hg18_multiz28way'), 'r', trypath = [seqDir])
267 exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_hg18.txt' % smallSamplePostfix)
268 intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_hg18.txt' % smallSamplePostfix)
269 stopAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Stop%s_hg18.txt' % smallSamplePostfix)
270 newexonAnnotFileName = os.path.join(self.path, 'new_Exons_hg18.txt')
271 newintronAnnotFileName = os.path.join(self.path, 'new_Introns_hg18.txt')
272 newstopAnnotFileName = os.path.join(self.path, 'new_stop_hg18.txt')
273 tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
274 tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
275 tmpstopAnnotFileName = self.copyFile(stopAnnotFileName)
277 if smallSampleKey:
278 chrList = [ smallSampleKey ]
279 else:
280 chrList = hg18.seqLenDict.keys()
281 chrList.sort()
283 outfile = open(newexonAnnotFileName, 'w')
284 for chrid in chrList:
285 slice = hg18[chrid]
286 # EXON ANNOTATION DATABASE
287 try:
288 ex1 = exonmsa[slice]
289 except:
290 continue
291 else:
292 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
293 exlist1.sort()
294 for ixx, exon in exlist1:
295 saveList = []
296 tmp = exon.sequence
297 tmpexon = exons[exon.exon_id]
298 tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
299 wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
300 try:
301 out1 = conservedmsa[tmp]
302 except KeyError:
303 pass
304 else:
305 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
306 elementlist.sort()
307 for iyy, element in elementlist:
308 if element.stop - element.start < 100: continue
309 score = int(string.split(element.gene_id, '=')[1])
310 if score < 100: continue
311 tmp2 = element.sequence
312 tmpelement = mostconserved[element.ucsc_id]
313 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
314 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
315 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
316 if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
317 tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
318 edges = msa[tmp1].edges()
319 for src, dest, e in edges:
320 if src.stop - src.start < 100: continue
321 palign, pident = e.pAligned(), e.pIdentity()
322 if palign < 0.8 or pident < 0.8: continue
323 palign, pident = '%.2f' % palign, '%.2f' % pident
324 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
325 (~msa.seqDict)[dest], \
326 str(dest), dest.start, dest.stop, palign, pident)
327 saveList.append('\t'.join(map(str, wlist3)) + '\n')
328 saveList.sort()
329 for saveline in saveList:
330 outfile.write(saveline)
331 outfile.close()
332 md5old = hashlib.md5()
333 md5old.update(open(tmpexonAnnotFileName, 'r').read())
334 md5new = hashlib.md5()
335 md5new.update(open(newexonAnnotFileName, 'r').read())
336 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
338 outfile = open(newintronAnnotFileName, 'w')
339 for chrid in chrList:
340 slice = hg18[chrid]
341 # SPLICE ANNOTATION DATABASE
342 try:
343 sp1 = splicemsa[slice]
344 except:
345 continue
346 else:
347 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
348 splist1.sort()
349 for ixx, splice in splist1:
350 saveList = []
351 tmp = splice.sequence
352 tmpsplice = splices[splice.splice_id]
353 tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
354 wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
355 try:
356 out1 = conservedmsa[tmp]
357 except KeyError:
358 pass
359 else:
360 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
361 elementlist.sort()
362 for iyy, element in elementlist:
363 if element.stop - element.start < 100: continue
364 score = int(string.split(element.gene_id, '=')[1])
365 if score < 100: continue
366 tmp2 = element.sequence
367 tmpelement = mostconserved[element.ucsc_id]
368 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
369 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
370 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
371 if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
372 tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
373 edges = msa[tmp1].edges()
374 for src, dest, e in edges:
375 if src.stop - src.start < 100: continue
376 palign, pident = e.pAligned(), e.pIdentity()
377 if palign < 0.8 or pident < 0.8: continue
378 palign, pident = '%.2f' % palign, '%.2f' % pident
379 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
380 (~msa.seqDict)[dest], \
381 str(dest), dest.start, dest.stop, palign, pident)
382 saveList.append('\t'.join(map(str, wlist3)) + '\n')
383 saveList.sort()
384 for saveline in saveList:
385 outfile.write(saveline)
386 # SNP IN SPLICE SITES
387 saveList = []
388 gt = tmpslice[:2]
389 ag = tmpslice[-2:]
390 try:
391 gtout = snpmsa[gt]
392 agout = snpmsa[ag]
393 except KeyError:
394 pass
395 else:
396 gtlist = gtout.keys()
397 aglist = agout.keys()
398 for snp in gtlist:
399 tmpsnp = snp.sequence
400 annsnp = snp126[snp.snp_id]
401 wlist2 = ('SNP5', chrid, tmpsplice.gene_id, gt.start, gt.stop, str(gt)) \
402 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
403 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
404 annsnp.observed, annsnp.molType, \
405 annsnp.myClass, annsnp.myValid)
406 tmp1 = msa.seqDict['hg18.' + chrid][abs(gt.start):abs(gt.stop)]
407 edges = msa[tmp1].edges()
408 for src, dest, e in edges:
409 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
410 palign, pident = e.pAligned(), e.pIdentity()
411 palign, pident = '%.2f' % palign, '%.2f' % pident
412 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
413 (~msa.seqDict)[dest], \
414 str(dest), dest.start, dest.stop, palign, pident)
415 saveList.append('\t'.join(map(str, wlist3)) + '\n')
416 for snp in aglist:
417 tmpsnp = snp.sequence
418 annsnp = snp126[snp.snp_id]
419 wlist2 = ('SNP3', chrid, tmpsplice.gene_id, ag.start, ag.stop, str(ag)) \
420 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
421 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
422 annsnp.observed, annsnp.molType, \
423 annsnp.myClass, annsnp.myValid)
424 tmp1 = msa.seqDict['hg18.' + chrid][abs(ag.start):abs(ag.stop)]
425 edges = msa[tmp1].edges()
426 for src, dest, e in edges:
427 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
428 palign, pident = e.pAligned(), e.pIdentity()
429 palign, pident = '%.2f' % palign, '%.2f' % pident
430 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
431 (~msa.seqDict)[dest], \
432 str(dest), dest.start, dest.stop, palign, pident)
433 saveList.append('\t'.join(map(str, wlist3)) + '\n')
434 saveList.sort()
435 for saveline in saveList:
436 outfile.write(saveline)
437 outfile.close()
438 md5old = hashlib.md5()
439 md5old.update(open(tmpintronAnnotFileName, 'r').read())
440 md5new = hashlib.md5()
441 md5new.update(open(newintronAnnotFileName, 'r').read())
442 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
444 outfile = open(newstopAnnotFileName, 'w')
445 for chrid in chrList:
446 slice = hg18[chrid]
447 # STOP ANNOTATION DATABASE
448 try:
449 cds1 = cdsmsa[slice]
450 except:
451 continue
452 else:
453 cdslist1 = [(ix.cds_id, ix) for ix in cds1.keys()]
454 cdslist1.sort()
455 for ixx, cds in cdslist1:
456 saveList = []
457 tmp = cds.sequence
458 tmpcds = cdss[cds.cds_id]
459 tmpslice = tmpcds.sequence # FOR REAL EXON COORDINATE
460 wlist1 = 'STOP', chrid, tmpcds.cds_id, tmpcds.gene_id, tmpslice.start, tmpslice.stop
461 if tmpslice.start < 0:
462 stopstart, stopend = -tmpslice.stop, -tmpslice.start
463 stop = -hg18[chrid][stopstart:stopstart+3]
464 else:
465 stopstart, stopend = tmpslice.start, tmpslice.stop
466 stop = hg18[chrid][stopend-3:stopend]
467 if str(stop).upper() not in ('TAA', 'TAG', 'TGA'): continue
468 try:
469 snp1 = snpmsa[stop]
470 except KeyError:
471 pass
472 else:
473 snplist = [(ix.snp_id, ix) for ix in snp1.keys()]
474 snplist.sort()
475 for iyy, snp in snplist:
476 tmpsnp = snp.sequence
477 annsnp = snp126[snp.snp_id]
478 wlist2 = wlist1 + (str(stop), stop.start, stop.stop) \
479 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
480 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
481 annsnp.observed, annsnp.molType, \
482 annsnp.myClass, annsnp.myValid)
483 if tmpslice.start < 0:
484 tmp1 = -msa.seqDict['hg18.' + chrid][stopstart:stopstart+3]
485 else:
486 tmp1 = msa.seqDict['hg18.' + chrid][stopend-3:stopend]
487 edges = msa[tmp1].edges()
488 for src, dest, e in edges:
489 if src.stop - src.start != 3 or dest.stop - dest.start != 3: continue
490 palign, pident = e.pAligned(), e.pIdentity()
491 palign, pident = '%.2f' % palign, '%.2f' % pident
492 if str(dest).upper() not in ('TAA', 'TAG', 'TGA'): nonstr = 'NONSENSE'
493 else: nonstr = 'STOP'
494 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
495 (~msa.seqDict)[dest], \
496 str(dest), dest.start, dest.stop, palign, pident, nonstr)
497 saveList.append('\t'.join(map(str, wlist3)) + '\n')
498 saveList.sort()
499 for saveline in saveList:
500 outfile.write(saveline)
501 outfile.close()
502 md5old = hashlib.md5()
503 md5old.update(open(tmpstopAnnotFileName, 'r').read())
504 md5new = hashlib.md5()
505 md5new.update(open(newstopAnnotFileName, 'r').read())
506 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
508 def test_mysqlannot(self):
509 'Test building an AnnotationDB from MySQL'
510 from pygr import seqdb, cnestedlist, sqlgraph
511 hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
512 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
513 exon_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_exonAnnot%s_hg18' % ( testInputDB, smallSamplePostfix ),
514 clusterKey = 'chromosome', maxCache = 0)
515 exon_db = seqdb.AnnotationDB(exon_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
516 gene_id = 'name', exon_id = 'exon_id'))
517 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_SQL_hg18'), 'w', \
518 pairwiseMode = True, bidirectional = False)
519 for id in exon_db:
520 msa.addAnnotation(exon_db[id])
521 exon_db.clear_cache() # not really necessary; cache should autoGC
522 exon_slices.clear_cache()
523 msa.build()
524 exon_db.__doc__ = 'SQL Exon Annotation Database for hg18'
525 pygr.Data.addResource('TEST.Annotation.SQL.hg18.exons', exon_db)
526 msa.__doc__ = 'SQL NLMSA Exon for hg18'
527 pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.hg18.exons', msa)
528 exon_schema = pygr.Data.ManyToManyRelation(hg18, exon_db, bindAttrs = ('exon2',))
529 exon_schema.__doc__ = 'SQL Exon Schema for hg18'
530 pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.hg18.exons', exon_schema)
531 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
532 splice_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_spliceAnnot%s_hg18' % ( testInputDB, smallSamplePostfix ),
533 clusterKey = 'chromosome', maxCache = 0)
534 splice_db = seqdb.AnnotationDB(splice_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
535 gene_id = 'name', splice_id = 'splice_id'))
536 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_SQL_hg18'), 'w', \
537 pairwiseMode = True, bidirectional = False)
538 for id in splice_db:
539 msa.addAnnotation(splice_db[id])
540 splice_db.clear_cache() # not really necessary; cache should autoGC
541 splice_slices.clear_cache()
542 msa.build()
543 splice_db.__doc__ = 'SQL Splice Annotation Database for hg18'
544 pygr.Data.addResource('TEST.Annotation.SQL.hg18.splices', splice_db)
545 msa.__doc__ = 'SQL NLMSA Splice for hg18'
546 pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.hg18.splices', msa)
547 splice_schema = pygr.Data.ManyToManyRelation(hg18, splice_db, bindAttrs = ('splice2',))
548 splice_schema.__doc__ = 'SQL Splice Schema for hg18'
549 pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.hg18.splices', splice_schema)
550 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
551 cds_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_cdsAnnot%s_hg18' % ( testInputDB, smallSamplePostfix ),
552 clusterKey = 'chromosome', maxCache = 0)
553 cds_db = seqdb.AnnotationDB(cds_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
554 gene_id = 'name', cds_id = 'cds_id'))
555 msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_cdsAnnot_SQL_hg18'), 'w', \
556 pairwiseMode = True, bidirectional = False)
557 for id in cds_db:
558 msa.addAnnotation(cds_db[id])
559 cds_db.clear_cache() # not really necessary; cache should autoGC
560 cds_slices.clear_cache()
561 msa.build()
562 cds_db.__doc__ = 'SQL CDS Annotation Database for hg18'
563 pygr.Data.addResource('TEST.Annotation.SQL.hg18.cdss', cds_db)
564 msa.__doc__ = 'SQL NLMSA CDS for hg18'
565 pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.hg18.cdss', msa)
566 cds_schema = pygr.Data.ManyToManyRelation(hg18, cds_db, bindAttrs = ('cds2',))
567 cds_schema.__doc__ = 'SQL CDS Schema for hg18'
568 pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.hg18.cdss', cds_schema)
569 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC: MYSQL VERSION
570 ucsc_slices = sqlgraph.SQLTableClustered('%s.pygr_phastConsElements28way%s_hg18' % ( testInputDB, smallSamplePostfix ),
571 clusterKey = 'chromosome', maxCache = 0)
572 ucsc_db = seqdb.AnnotationDB(ucsc_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
573 gene_id = 'name', ucsc_id = 'ucsc_id'))
574 msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements28way_SQL_hg18'), 'w', \
575 pairwiseMode = True, bidirectional = False)
576 for id in ucsc_db:
577 msa.addAnnotation(ucsc_db[id])
578 ucsc_db.clear_cache() # not really necessary; cache should autoGC
579 ucsc_slices.clear_cache()
580 msa.build()
581 ucsc_db.__doc__ = 'SQL Most Conserved Elements for hg18'
582 pygr.Data.addResource('TEST.Annotation.UCSC.SQL.hg18.mostconserved', ucsc_db)
583 msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for hg18'
584 pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved', msa)
585 ucsc_schema = pygr.Data.ManyToManyRelation(hg18, ucsc_db, bindAttrs = ('element2',))
586 ucsc_schema.__doc__ = 'SQL Schema for UCSC Most Conserved Elements for hg18'
587 pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved', ucsc_schema)
588 # BUILD ANNOTATION DATABASE FOR SNP126 FROM UCSC: MYSQL VERSION
589 snp_slices = sqlgraph.SQLTableClustered('%s.pygr_snp126%s_hg18' % ( testInputDB, smallSamplePostfix ),
590 clusterKey = 'clusterKey', maxCache = 0)
591 snp_db = seqdb.AnnotationDB(snp_slices, hg18, sliceAttrDict = dict(id = 'chromosome', gene_id = 'name',
592 snp_id = 'snp_id', score = 'score', ref_NCBI = 'ref_NCBI', ref_UCSC = 'ref_UCSC',
593 observed = 'observed', molType = 'molType', myClass = 'myClass', myValid = 'myValid',
594 avHet = 'avHet', avHetSE = 'avHetSE', myFunc = 'myFunc', locType = 'locType',
595 myWeight = 'myWeight'))
596 msa = cnestedlist.NLMSA(os.path.join(self.path, 'snp126_SQL_hg18'), 'w', \
597 pairwiseMode = True, bidirectional = False)
598 for id in snp_db:
599 msa.addAnnotation(snp_db[id])
600 snp_db.clear_cache() # not really necessary; cache should autoGC
601 snp_slices.clear_cache()
602 msa.build()
603 snp_db.__doc__ = 'SQL SNP126 for hg18'
604 pygr.Data.addResource('TEST.Annotation.UCSC.SQL.hg18.snp126', snp_db)
605 msa.__doc__ = 'SQL NLMSA for SNP126 for hg18'
606 pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126', msa)
607 snp_schema = pygr.Data.ManyToManyRelation(hg18, snp_db, bindAttrs = ('snp2',))
608 snp_schema.__doc__ = 'SQL Schema for UCSC SNP126 for hg18'
609 pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126', snp_schema)
610 pygr.Data.save()
611 pygr.Data.clear_cache()
613 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
614 hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
615 exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.hg18.exons')
616 splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.hg18.splices')
617 conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved')
618 snpmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126')
619 cdsmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.hg18.cdss')
620 exons = pygr.Data.getResource('TEST.Annotation.SQL.hg18.exons')
621 splices = pygr.Data.getResource('TEST.Annotation.SQL.hg18.splices')
622 mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.SQL.hg18.mostconserved')
623 snp126 = pygr.Data.getResource('TEST.Annotation.UCSC.SQL.hg18.snp126')
624 cdss = pygr.Data.getResource('TEST.Annotation.SQL.hg18.cdss')
626 # OPEN hg18_MULTIZ28WAY NLMSA
627 msa = cnestedlist.NLMSA(os.path.join(msaDir, 'hg18_multiz28way'), 'r', trypath = [seqDir])
629 exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_hg18.txt' % smallSamplePostfix)
630 intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_hg18.txt' % smallSamplePostfix)
631 stopAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Stop%s_hg18.txt' % smallSamplePostfix)
632 newexonAnnotFileName = os.path.join(self.path, 'new_Exons_hg18.txt')
633 newintronAnnotFileName = os.path.join(self.path, 'new_Introns_hg18.txt')
634 newstopAnnotFileName = os.path.join(self.path, 'new_stop_hg18.txt')
635 tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
636 tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
637 tmpstopAnnotFileName = self.copyFile(stopAnnotFileName)
639 if smallSampleKey:
640 chrList = [ smallSampleKey ]
641 else:
642 chrList = hg18.seqLenDict.keys()
643 chrList.sort()
645 outfile = open(newexonAnnotFileName, 'w')
646 for chrid in chrList:
647 slice = hg18[chrid]
648 # EXON ANNOTATION DATABASE
649 try:
650 ex1 = exonmsa[slice]
651 except:
652 continue
653 else:
654 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
655 exlist1.sort()
656 for ixx, exon in exlist1:
657 saveList = []
658 tmp = exon.sequence
659 tmpexon = exons[exon.exon_id]
660 tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
661 wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
662 try:
663 out1 = conservedmsa[tmp]
664 except KeyError:
665 pass
666 else:
667 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
668 elementlist.sort()
669 for iyy, element in elementlist:
670 if element.stop - element.start < 100: continue
671 score = int(string.split(element.gene_id, '=')[1])
672 if score < 100: continue
673 tmp2 = element.sequence
674 tmpelement = mostconserved[element.ucsc_id]
675 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
676 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
677 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
678 if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
679 tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
680 edges = msa[tmp1].edges()
681 for src, dest, e in edges:
682 if src.stop - src.start < 100: continue
683 palign, pident = e.pAligned(), e.pIdentity()
684 if palign < 0.8 or pident < 0.8: continue
685 palign, pident = '%.2f' % palign, '%.2f' % pident
686 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
687 (~msa.seqDict)[dest], \
688 str(dest), dest.start, dest.stop, palign, pident)
689 saveList.append('\t'.join(map(str, wlist3)) + '\n')
690 saveList.sort()
691 for saveline in saveList:
692 outfile.write(saveline)
693 outfile.close()
694 md5old = hashlib.md5()
695 md5old.update(open(tmpexonAnnotFileName, 'r').read())
696 md5new = hashlib.md5()
697 md5new.update(open(newexonAnnotFileName, 'r').read())
698 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
700 outfile = open(newintronAnnotFileName, 'w')
701 for chrid in chrList:
702 slice = hg18[chrid]
703 # SPLICE ANNOTATION DATABASE
704 try:
705 sp1 = splicemsa[slice]
706 except:
707 continue
708 else:
709 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
710 splist1.sort()
711 for ixx, splice in splist1:
712 saveList = []
713 tmp = splice.sequence
714 tmpsplice = splices[splice.splice_id]
715 tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
716 wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
717 try:
718 out1 = conservedmsa[tmp]
719 except KeyError:
720 pass
721 else:
722 elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
723 elementlist.sort()
724 for iyy, element in elementlist:
725 if element.stop - element.start < 100: continue
726 score = int(string.split(element.gene_id, '=')[1])
727 if score < 100: continue
728 tmp2 = element.sequence
729 tmpelement = mostconserved[element.ucsc_id]
730 tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
731 wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
732 slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
733 if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
734 tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
735 edges = msa[tmp1].edges()
736 for src, dest, e in edges:
737 if src.stop - src.start < 100: continue
738 palign, pident = e.pAligned(), e.pIdentity()
739 if palign < 0.8 or pident < 0.8: continue
740 palign, pident = '%.2f' % palign, '%.2f' % pident
741 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
742 (~msa.seqDict)[dest], \
743 str(dest), dest.start, dest.stop, palign, pident)
744 saveList.append('\t'.join(map(str, wlist3)) + '\n')
745 saveList.sort()
746 for saveline in saveList:
747 outfile.write(saveline)
748 # SNP IN SPLICE SITES
749 saveList = []
750 gt = tmpslice[:2]
751 ag = tmpslice[-2:]
752 try:
753 gtout = snpmsa[gt]
754 agout = snpmsa[ag]
755 except KeyError:
756 pass
757 else:
758 gtlist = gtout.keys()
759 aglist = agout.keys()
760 for snp in gtlist:
761 tmpsnp = snp.sequence
762 annsnp = snp126[snp.snp_id]
763 wlist2 = ('SNP5', chrid, tmpsplice.gene_id, gt.start, gt.stop, str(gt)) \
764 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
765 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
766 annsnp.observed, annsnp.molType, \
767 annsnp.myClass, annsnp.myValid)
768 tmp1 = msa.seqDict['hg18.' + chrid][abs(gt.start):abs(gt.stop)]
769 edges = msa[tmp1].edges()
770 for src, dest, e in edges:
771 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
772 palign, pident = e.pAligned(), e.pIdentity()
773 palign, pident = '%.2f' % palign, '%.2f' % pident
774 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
775 (~msa.seqDict)[dest], \
776 str(dest), dest.start, dest.stop, palign, pident)
777 saveList.append('\t'.join(map(str, wlist3)) + '\n')
778 for snp in aglist:
779 tmpsnp = snp.sequence
780 annsnp = snp126[snp.snp_id]
781 wlist2 = ('SNP3', chrid, tmpsplice.gene_id, ag.start, ag.stop, str(ag)) \
782 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
783 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
784 annsnp.observed, annsnp.molType, \
785 annsnp.myClass, annsnp.myValid)
786 tmp1 = msa.seqDict['hg18.' + chrid][abs(ag.start):abs(ag.stop)]
787 edges = msa[tmp1].edges()
788 for src, dest, e in edges:
789 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
790 palign, pident = e.pAligned(), e.pIdentity()
791 palign, pident = '%.2f' % palign, '%.2f' % pident
792 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
793 (~msa.seqDict)[dest], \
794 str(dest), dest.start, dest.stop, palign, pident)
795 saveList.append('\t'.join(map(str, wlist3)) + '\n')
796 saveList.sort()
797 for saveline in saveList:
798 outfile.write(saveline)
799 outfile.close()
800 md5old = hashlib.md5()
801 md5old.update(open(tmpintronAnnotFileName, 'r').read())
802 md5new = hashlib.md5()
803 md5new.update(open(newintronAnnotFileName, 'r').read())
804 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
806 outfile = open(newstopAnnotFileName, 'w')
807 for chrid in chrList:
808 slice = hg18[chrid]
809 # STOP ANNOTATION DATABASE
810 try:
811 cds1 = cdsmsa[slice]
812 except:
813 continue
814 else:
815 cdslist1 = [(ix.cds_id, ix) for ix in cds1.keys()]
816 cdslist1.sort()
817 for ixx, cds in cdslist1:
818 saveList = []
819 tmp = cds.sequence
820 tmpcds = cdss[cds.cds_id]
821 tmpslice = tmpcds.sequence # FOR REAL EXON COORDINATE
822 wlist1 = 'STOP', chrid, tmpcds.cds_id, tmpcds.gene_id, tmpslice.start, tmpslice.stop
823 if tmpslice.start < 0:
824 stopstart, stopend = -tmpslice.stop, -tmpslice.start
825 stop = -hg18[chrid][stopstart:stopstart+3]
826 else:
827 stopstart, stopend = tmpslice.start, tmpslice.stop
828 stop = hg18[chrid][stopend-3:stopend]
829 if str(stop).upper() not in ('TAA', 'TAG', 'TGA'): continue
830 try:
831 snp1 = snpmsa[stop]
832 except KeyError:
833 pass
834 else:
835 snplist = [(ix.snp_id, ix) for ix in snp1.keys()]
836 snplist.sort()
837 for iyy, snp in snplist:
838 tmpsnp = snp.sequence
839 annsnp = snp126[snp.snp_id]
840 wlist2 = wlist1 + (str(stop), stop.start, stop.stop) \
841 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
842 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
843 annsnp.observed, annsnp.molType, \
844 annsnp.myClass, annsnp.myValid)
845 if tmpslice.start < 0:
846 tmp1 = -msa.seqDict['hg18.' + chrid][stopstart:stopstart+3]
847 else:
848 tmp1 = msa.seqDict['hg18.' + chrid][stopend-3:stopend]
849 edges = msa[tmp1].edges()
850 for src, dest, e in edges:
851 if src.stop - src.start != 3 or dest.stop - dest.start != 3: continue
852 palign, pident = e.pAligned(), e.pIdentity()
853 palign, pident = '%.2f' % palign, '%.2f' % pident
854 if str(dest).upper() not in ('TAA', 'TAG', 'TGA'): nonstr = 'NONSENSE'
855 else: nonstr = 'STOP'
856 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
857 (~msa.seqDict)[dest], \
858 str(dest), dest.start, dest.stop, palign, pident, nonstr)
859 saveList.append('\t'.join(map(str, wlist3)) + '\n')
860 saveList.sort()
861 for saveline in saveList:
862 outfile.write(saveline)
863 outfile.close()
864 md5old = hashlib.md5()
865 md5old.update(open(tmpstopAnnotFileName, 'r').read())
866 md5new = hashlib.md5()
867 md5new.update(open(newstopAnnotFileName, 'r').read())
868 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
871 if __name__ == '__main__':
872 PygrTestProgram(verbosity=2)