added test of len() method for SQLTable
[pygr.git] / tests / nlmsa_dm2_megatest.py
blob0fe91ab6e2d0ba78fa51bf8476e95e6ec37b3a19
1 import unittest
2 from testlib import testutil, PygrTestProgram
4 import ConfigParser, sys, os, string, glob
5 import pygr.Data
7 try:
8 import hashlib
9 except ImportError:
10 import md5 as hashlib
12 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
13 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
14 mafDir = config.get('megatests_dm2', 'mafDir')
15 seqDir = config.get('megatests_dm2', 'seqDir')
16 testInputDir = config.get('megatests', 'testInputDir')
17 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
19 ## mafDir CONTAINS FOLLOWING DM2 MULTIZ15WAY MAF ALIGNMENTS
20 ## seqDir CONTAINS FOLLOWING 15 GENOME ASSEMBLIES AND THEIR SEQDB FILES
21 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
22 ## outfileName = 'splicesite_dm2.txt' # CHR4H TESTING
23 ## outputName = 'splicesite_dm2_multiz15way.txt' # CHR4H TESTING
24 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
26 # DIRECTIONARY FOR DOC STRING OF SEQDB
27 docStringDict = {
28 'anoGam1':'A. gambiae Genome (February 2003)',
29 'apiMel2':'A. mellifera Genome (January 2005)',
30 'dm2':'D. melanogaster Genome (April 2004)',
31 'dp4':'D. pseudoobscura Genome (February 2006)',
32 'droAna3':'D. ananassae Genome (February 2006)',
33 'droEre2':'D. erecta Genome (February 2006)',
34 'droGri2':'D. grimshawi Genome (February 2006)',
35 'droMoj3':'D. mojavensis Genome (February 2006)',
36 'droPer1':'D. persimilis Genome (October 2005)',
37 'droSec1':'D. sechellia Genome (October 2005)',
38 'droSim1':'D. simulans Genome (April 2005)',
39 'droVir3':'D. virilis Genome (February 2006)',
40 'droWil1':'D. willistoni Genome (February 2006)',
41 'droYak2':'D. yakuba Genome (November 2005)',
42 'triCas2':'T. castaneum Genome (September 2005)',
45 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
46 msaSpeciesList = ['anoGam1', 'apiMel2', 'dm2', 'dp4', 'droAna3', 'droEre2', 'droGri2', 'droMoj3', \
47 'droPer1', 'droSec1', 'droSim1', 'droVir3', 'droWil1', 'droYak2', 'triCas2']
49 class PygrBuildNLMSAMegabase(unittest.TestCase):
50 'restrict megatest to an initially empty directory, need large space to perform'
51 def setUp(self, testDir = None):
52 import random
53 tmpList = [c for c in 'PygrBuildNLMSAMegabase']
54 random.shuffle(tmpList)
55 testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
56 if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
57 try:
58 os.mkdir(testDir)
59 testDir = os.path.realpath(testDir)
60 except:
61 raise IOError
62 self.path = testDir
63 try:
64 tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
65 open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
66 except:
67 raise IOError
68 pygr.Data.update(self.path)
69 from pygr import seqdb
70 for orgstr in msaSpeciesList:
71 genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
72 genome.__doc__ = docStringDict[orgstr]
73 pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
74 pygr.Data.save()
75 def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
76 newname = os.path.join(self.path, os.path.basename(filename))
77 open(newname, 'w').write(open(filename, 'r').read())
78 return newname
79 def tearDown(self):
80 'delete the temporary directory and files, restore pygr.Data path'
81 for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
82 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
83 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
84 for filename in files:
85 os.remove(os.path.join(dirpath, filename))
86 os.rmdir(dirpath)
87 # Restore original pygr.Data path to remedy lack of isolation
88 # between tests from the same run
89 pygr.Data.update(None)
92 class Build_Test(PygrBuildNLMSAMegabase):
93 def test_seqdb(self):
94 'Check pygr.Data contents'
95 l = pygr.Data.dir('TEST')
96 preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
97 assert l == preList
98 def test_build(self):
99 'Test building an NLMSA and querying results'
100 from pygr import seqdb, cnestedlist
101 genomedict = {}
102 for orgstr in msaSpeciesList:
103 genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' + orgstr)
104 uniondict = seqdb.PrefixUnionDict(genomedict)
105 import glob
106 maflist = glob.glob(os.path.join(mafDir, 'chr4h.maf')) # CHR4H TESTING
107 maflist.sort()
108 msaname = os.path.join(self.path, 'dm2_multiz15way')
109 msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist, maxlen = 536870912, maxint = 22369620) # 500MB VERSION
110 msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
111 pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
112 pygr.Data.save()
113 msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
114 outfileName = os.path.join(testInputDir, 'splicesite_dm2_chr4h.txt') # CHR4H TESTING
115 outputName = os.path.join(testInputDir, 'splicesite_dm2_chr4h_multiz15way.txt') # CHR4H TESTING
116 newOutputName = os.path.join(self.path, 'splicesite_new1.txt')
117 tmpInputName = self.copyFile(outfileName)
118 tmpOutputName = self.copyFile(outputName)
119 outfile = open(newOutputName, 'w')
120 for lines in open(tmpInputName, 'r').xreadlines():
121 chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
122 intstart, intend, nobs = int(intstart), int(intend), int(nobs)
123 site1 = msa.seqDict['dm2' + '.' + chrid][intstart:intstart+2]
124 site2 = msa.seqDict['dm2' + '.' + chrid][intend-2:intend]
125 edges1 = msa[site1].edges()
126 edges2 = msa[site2].edges()
127 if len(edges1) == 0: # EMPTY EDGES
128 wlist = str(site1), 'dm2', chrid, intstart, intstart+2, '', '', '', '', ''
129 outfile.write('\t'.join(map(str, wlist)) + '\n')
130 if len(edges2) == 0: # EMPTY EDGES
131 wlist = str(site2), 'dm2', chrid, intend-2, intend, '', '', '', '', ''
132 outfile.write('\t'.join(map(str, wlist)) + '\n')
133 saveList = []
134 for src, dest, e in edges1:
135 if len(str(src)) != 2 or len(str(dest)) != 2: continue
136 dotindex = (~msa.seqDict)[src].index('.')
137 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
138 dotindex = (~msa.seqDict)[dest].index('.')
139 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
140 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
141 destspecies, dest1, dest.start, dest.stop
142 saveList.append('\t'.join(map(str, wlist)) + '\n')
143 for src, dest, e in edges2:
144 if len(str(src)) != 2 or len(str(dest)) != 2: continue
145 dotindex = (~msa.seqDict)[src].index('.')
146 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
147 dotindex = (~msa.seqDict)[dest].index('.')
148 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
149 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
150 destspecies, dest1, dest.start, dest.stop
151 saveList.append('\t'.join(map(str, wlist)) + '\n')
152 saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
153 for saveline in saveList:
154 outfile.write(saveline)
155 outfile.close()
156 md5old = hashlib.md5()
157 md5old.update(open(newOutputName, 'r').read())
158 md5new = hashlib.md5()
159 md5new.update(open(tmpOutputName, 'r').read())
160 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
162 # TEXT<->BINARY TEST
163 msafilelist = glob.glob(msaname + '*')
164 msa.save_seq_dict()
165 cnestedlist.dump_textfile(msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
166 for filename in msafilelist: os.remove(filename)
167 runPath = os.path.realpath(os.curdir)
168 os.chdir(self.path)
169 cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
170 os.chdir(runPath)
172 msa1 = cnestedlist.NLMSA(msaname, 'r')
173 msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
174 pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
175 pygr.Data.save()
176 msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
177 newOutputName = os.path.join(self.path, 'splicesite_new2.txt')
178 tmpInputName = self.copyFile(outfileName)
179 tmpOutputName = self.copyFile(outputName)
180 outfile = open(newOutputName, 'w')
181 for lines in open(tmpInputName, 'r').xreadlines():
182 chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
183 intstart, intend, nobs = int(intstart), int(intend), int(nobs)
184 site1 = msa.seqDict['dm2' + '.' + chrid][intstart:intstart+2]
185 site2 = msa.seqDict['dm2' + '.' + chrid][intend-2:intend]
186 edges1 = msa[site1].edges()
187 edges2 = msa[site2].edges()
188 if len(edges1) == 0: # EMPTY EDGES
189 wlist = str(site1), 'dm2', chrid, intstart, intstart+2, '', '', '', '', ''
190 outfile.write('\t'.join(map(str, wlist)) + '\n')
191 if len(edges2) == 0: # EMPTY EDGES
192 wlist = str(site2), 'dm2', chrid, intend-2, intend, '', '', '', '', ''
193 outfile.write('\t'.join(map(str, wlist)) + '\n')
194 saveList = []
195 for src, dest, e in edges1:
196 if len(str(src)) != 2 or len(str(dest)) != 2: continue
197 dotindex = (~msa.seqDict)[src].index('.')
198 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
199 dotindex = (~msa.seqDict)[dest].index('.')
200 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
201 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
202 destspecies, dest1, dest.start, dest.stop
203 saveList.append('\t'.join(map(str, wlist)) + '\n')
204 for src, dest, e in edges2:
205 if len(str(src)) != 2 or len(str(dest)) != 2: continue
206 dotindex = (~msa.seqDict)[src].index('.')
207 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
208 dotindex = (~msa.seqDict)[dest].index('.')
209 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
210 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
211 destspecies, dest1, dest.start, dest.stop
212 saveList.append('\t'.join(map(str, wlist)) + '\n')
213 saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
214 for saveline in saveList:
215 outfile.write(saveline)
216 outfile.close()
217 md5old = hashlib.md5()
218 md5old.update(open(newOutputName, 'r').read())
219 md5new = hashlib.md5()
220 md5new.update(open(tmpOutputName, 'r').read())
221 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
224 if __name__ == '__main__':
225 PygrTestProgram(verbosity=2)