Merge commit 'remotes/ctb/xmlrpc_patches' into tryme
[pygr.git] / tests / nlmsa_hg18_megatest.py
blob331ca92f7d6d869dd7541e2b02b48d6c1db8ecf5
1 import unittest
2 from testlib import testutil, PygrTestProgram
4 import ConfigParser, sys, os, string, glob
5 import pygr.Data
7 try:
8 import hashlib
9 except ImportError:
10 import md5 as hashlib
12 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
13 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
14 mafDir = config.get('megatests_hg18', 'mafDir')
15 seqDir = config.get('megatests_hg18', 'seqDir')
16 smallSampleKey = config.get('megatests_hg18', 'smallSampleKey')
17 testInputDir = config.get('megatests', 'testInputDir')
18 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
20 if smallSampleKey:
21 smallSamplePostfix = '_' + smallSampleKey
22 else:
23 smallSamplePostfix = ''
25 ## mafDir CONTAINS FOLLOWING DM2 MULTIZ15WAY MAF ALIGNMENTS
26 ## seqDir CONTAINS FOLLOWING 15 GENOME ASSEMBLIES AND THEIR SEQDB FILES
27 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
28 ## outfileName = 'splicesite_hg18.txt' # CHR4H TESTING
29 ## outputName = 'splicesite_hg18_multiz28way.txt' # CHR4H TESTING
30 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
32 # DIRECTIONARY FOR DOC STRING OF SEQDB
33 docStringDict = {
34 'anoCar1':' Lizard Genome (January 2007)',
35 'bosTau3':'Cow Genome (August 2006)',
36 'canFam2':'Dog Genome (May 2005)',
37 'cavPor2':'Guinea Pig (October 2005)',
38 'danRer4':'Zebrafish Genome (March 2006)',
39 'dasNov1':'Armadillo Genome (May 2005)',
40 'echTel1':'Tenrec Genome (July 2005)',
41 'eriEur1':'European Hedgehog (Junuary 2006)',
42 'equCab1':'Horse Genome (January 2007)',
43 'felCat3':'Cat Genome (March 2006)',
44 'fr2':'Fugu Genome (October 2004)',
45 'galGal3':'Chicken Genome (May 2006)',
46 'gasAcu1':'Stickleback Genome (February 2006)',
47 'hg18':'Human Genome (May 2006)',
48 'loxAfr1':'Elephant Genome (May 2005)',
49 'mm8':'Mouse Genome (March 2006)',
50 'monDom4':'Opossum Genome (January 2006)',
51 'ornAna1':'Platypus Genome (March 2007)',
52 'oryCun1':'Rabbit Genome (May 2005)',
53 'oryLat1':'Medaka Genome (April 2006)',
54 'otoGar1':'Bushbaby Genome (December 2006)',
55 'panTro2':'Chimpanzee Genome (March 2006)',
56 'rheMac2':'Rhesus Genome (January 2006)',
57 'rn4':'Rat Genome (November 2004)',
58 'sorAra1':'Shrew (Junuary 2006)',
59 'tetNig1':'Tetraodon Genome (February 2004)',
60 'tupBel1':'Tree Shrew (December 2006)',
61 'xenTro2':'X. tropicalis Genome (August 2005)'
64 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
65 msaSpeciesList = ['anoCar1', 'bosTau3', 'canFam2', 'cavPor2', 'danRer4', 'dasNov1', 'echTel1', \
66 'equCab1', 'eriEur1', 'felCat3', 'fr2', 'galGal3', 'gasAcu1', 'hg18', 'loxAfr1', \
67 'mm8', 'monDom4', 'ornAna1', 'oryCun1', 'oryLat1', 'otoGar1', 'panTro2', 'rheMac2', \
68 'rn4', 'sorAra1', 'tetNig1', 'tupBel1', 'xenTro2']
70 class PygrBuildNLMSAMegabase(unittest.TestCase):
71 'restrict megatest to an initially empty directory, need large space to perform'
72 def setUp(self, testDir = None):
73 import random
74 tmpList = [c for c in 'PygrBuildNLMSAMegabase']
75 random.shuffle(tmpList)
76 testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
77 if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
78 try:
79 os.mkdir(testDir)
80 testDir = os.path.realpath(testDir)
81 except:
82 raise IOError
83 self.path = testDir
84 try:
85 tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
86 open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
87 except:
88 raise IOError
89 pygr.Data.update(self.path)
90 from pygr import seqdb
91 for orgstr in msaSpeciesList:
92 genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
93 genome.__doc__ = docStringDict[orgstr]
94 pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
95 pygr.Data.save()
96 def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
97 newname = os.path.join(self.path, os.path.basename(filename))
98 open(newname, 'w').write(open(filename, 'r').read())
99 return newname
100 def tearDown(self):
101 'delete the temporary directory and files, restore pygr.Data path'
102 for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
103 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
104 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
105 for filename in files:
106 os.remove(os.path.join(dirpath, filename))
107 os.rmdir(dirpath)
108 # Restore original pygr.Data path to remedy lack of isolation
109 # between tests from the same run
110 pygr.Data.update(None)
113 class Build_Test(PygrBuildNLMSAMegabase):
114 def test_seqdb(self):
115 'Check pygr.Data contents'
116 l = pygr.Data.dir('TEST')
117 preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
118 assert l == preList
119 def test_build(self):
120 'Test building an NLMSA and querying results'
121 from pygr import seqdb, cnestedlist
122 genomedict = {}
123 for orgstr in msaSpeciesList:
124 genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' + orgstr)
125 uniondict = seqdb.PrefixUnionDict(genomedict)
126 if smallSampleKey:
127 maflist = ( os.path.join(mafDir, smallSampleKey + '.maf'), )
128 else:
129 maflist = glob.glob(os.path.join(mafDir, '*.maf'))
130 maflist.sort()
131 msaname = os.path.join(self.path, 'hg18_multiz28way')
132 msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist, maxlen = 536870912, maxint = 22369620) # 500MB VERSION
133 msa1.__doc__ = 'TEST NLMSA for hg18 multiz28way'
134 pygr.Data.addResource('TEST.MSA.UCSC.hg18_multiz28way', msa1)
135 pygr.Data.save()
136 msa = pygr.Data.getResource('TEST.MSA.UCSC.hg18_multiz28way')
137 outfileName = os.path.join(testInputDir, 'splicesite_hg18%s.txt' % smallSamplePostfix)
138 outputName = os.path.join(testInputDir, 'splicesite_hg18%s_multiz28way.txt' % smallSamplePostfix)
139 newOutputName = os.path.join(self.path, 'splicesite_new1.txt')
140 tmpInputName = self.copyFile(outfileName)
141 tmpOutputName = self.copyFile(outputName)
142 outfile = open(newOutputName, 'w')
143 for lines in open(tmpInputName, 'r').xreadlines():
144 chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
145 intstart, intend, nobs = int(intstart), int(intend), int(nobs)
146 site1 = msa.seqDict['hg18' + '.' + chrid][intstart:intstart+2]
147 site2 = msa.seqDict['hg18' + '.' + chrid][intend-2:intend]
148 edges1 = msa[site1].edges()
149 edges2 = msa[site2].edges()
150 if len(edges1) == 0: # EMPTY EDGES
151 wlist = str(site1), 'hg18', chrid, intstart, intstart+2, '', '', '', '', ''
152 outfile.write('\t'.join(map(str, wlist)) + '\n')
153 if len(edges2) == 0: # EMPTY EDGES
154 wlist = str(site2), 'hg18', chrid, intend-2, intend, '', '', '', '', ''
155 outfile.write('\t'.join(map(str, wlist)) + '\n')
156 saveList = []
157 for src, dest, e in edges1:
158 if len(str(src)) != 2 or len(str(dest)) != 2: continue
159 dotindex = (~msa.seqDict)[src].index('.')
160 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
161 dotindex = (~msa.seqDict)[dest].index('.')
162 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
163 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
164 destspecies, dest1, dest.start, dest.stop
165 saveList.append('\t'.join(map(str, wlist)) + '\n')
166 for src, dest, e in edges2:
167 if len(str(src)) != 2 or len(str(dest)) != 2: continue
168 dotindex = (~msa.seqDict)[src].index('.')
169 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
170 dotindex = (~msa.seqDict)[dest].index('.')
171 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
172 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
173 destspecies, dest1, dest.start, dest.stop
174 saveList.append('\t'.join(map(str, wlist)) + '\n')
175 saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
176 for saveline in saveList:
177 outfile.write(saveline)
178 outfile.close()
179 md5old = hashlib.md5()
180 md5old.update(open(newOutputName, 'r').read())
181 md5new = hashlib.md5()
182 md5new.update(open(tmpOutputName, 'r').read())
183 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
185 # TEXT<->BINARY TEST
186 msafilelist = glob.glob(msaname + '*')
187 msa.save_seq_dict()
188 cnestedlist.dump_textfile(msaname, os.path.join(self.path, 'hg18_multiz28way.txt'))
189 for filename in msafilelist: os.remove(filename)
190 runPath = os.path.realpath(os.curdir)
191 os.chdir(self.path)
192 cnestedlist.textfile_to_binaries('hg18_multiz28way.txt')
193 os.chdir(runPath)
195 msa1 = cnestedlist.NLMSA(msaname, 'r')
196 msa1.__doc__ = 'TEST NLMSA for hg18 multiz28way'
197 pygr.Data.addResource('TEST.MSA.UCSC.hg18_multiz28way', msa1)
198 pygr.Data.save()
199 msa = pygr.Data.getResource('TEST.MSA.UCSC.hg18_multiz28way')
200 newOutputName = os.path.join(self.path, 'splicesite_new2.txt')
201 tmpInputName = self.copyFile(outfileName)
202 tmpOutputName = self.copyFile(outputName)
203 outfile = open(newOutputName, 'w')
204 for lines in open(tmpInputName, 'r').xreadlines():
205 chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
206 intstart, intend, nobs = int(intstart), int(intend), int(nobs)
207 site1 = msa.seqDict['hg18' + '.' + chrid][intstart:intstart+2]
208 site2 = msa.seqDict['hg18' + '.' + chrid][intend-2:intend]
209 edges1 = msa[site1].edges()
210 edges2 = msa[site2].edges()
211 if len(edges1) == 0: # EMPTY EDGES
212 wlist = str(site1), 'hg18', chrid, intstart, intstart+2, '', '', '', '', ''
213 outfile.write('\t'.join(map(str, wlist)) + '\n')
214 if len(edges2) == 0: # EMPTY EDGES
215 wlist = str(site2), 'hg18', chrid, intend-2, intend, '', '', '', '', ''
216 outfile.write('\t'.join(map(str, wlist)) + '\n')
217 saveList = []
218 for src, dest, e in edges1:
219 if len(str(src)) != 2 or len(str(dest)) != 2: continue
220 dotindex = (~msa.seqDict)[src].index('.')
221 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
222 dotindex = (~msa.seqDict)[dest].index('.')
223 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
224 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
225 destspecies, dest1, dest.start, dest.stop
226 saveList.append('\t'.join(map(str, wlist)) + '\n')
227 for src, dest, e in edges2:
228 if len(str(src)) != 2 or len(str(dest)) != 2: continue
229 dotindex = (~msa.seqDict)[src].index('.')
230 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
231 dotindex = (~msa.seqDict)[dest].index('.')
232 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
233 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
234 destspecies, dest1, dest.start, dest.stop
235 saveList.append('\t'.join(map(str, wlist)) + '\n')
236 saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
237 for saveline in saveList:
238 outfile.write(saveline)
239 outfile.close()
240 md5old = hashlib.md5()
241 md5old.update(open(newOutputName, 'r').read())
242 md5new = hashlib.md5()
243 md5new.update(open(tmpOutputName, 'r').read())
244 assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
247 if __name__ == '__main__':
248 PygrTestProgram(verbosity=2)