2 from testlib
import testutil
, PygrTestProgram
4 import ConfigParser
, sys
, os
, string
, glob
12 config
= ConfigParser
.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
13 config
.read([ os
.path
.join(os
.path
.expanduser('~'), '.pygrrc'), os
.path
.join(os
.path
.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
14 mafDir
= config
.get('megatests_dm2', 'mafDir')
15 seqDir
= config
.get('megatests_dm2', 'seqDir')
16 testInputDir
= config
.get('megatests', 'testInputDir')
17 testOutputBaseDir
= config
.get('megatests', 'testOutputBaseDir')
19 ## mafDir CONTAINS FOLLOWING DM2 MULTIZ15WAY MAF ALIGNMENTS
20 ## seqDir CONTAINS FOLLOWING 15 GENOME ASSEMBLIES AND THEIR SEQDB FILES
21 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
22 ## outfileName = 'splicesite_dm2.txt' # CHR4H TESTING
23 ## outputName = 'splicesite_dm2_multiz15way.txt' # CHR4H TESTING
24 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
26 # DIRECTIONARY FOR DOC STRING OF SEQDB
28 'anoGam1':'A. gambiae Genome (February 2003)',
29 'apiMel2':'A. mellifera Genome (January 2005)',
30 'dm2':'D. melanogaster Genome (April 2004)',
31 'dp4':'D. pseudoobscura Genome (February 2006)',
32 'droAna3':'D. ananassae Genome (February 2006)',
33 'droEre2':'D. erecta Genome (February 2006)',
34 'droGri2':'D. grimshawi Genome (February 2006)',
35 'droMoj3':'D. mojavensis Genome (February 2006)',
36 'droPer1':'D. persimilis Genome (October 2005)',
37 'droSec1':'D. sechellia Genome (October 2005)',
38 'droSim1':'D. simulans Genome (April 2005)',
39 'droVir3':'D. virilis Genome (February 2006)',
40 'droWil1':'D. willistoni Genome (February 2006)',
41 'droYak2':'D. yakuba Genome (November 2005)',
42 'triCas2':'T. castaneum Genome (September 2005)',
45 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
46 msaSpeciesList
= ['anoGam1', 'apiMel2', 'dm2', 'dp4', 'droAna3', 'droEre2', 'droGri2', 'droMoj3', \
47 'droPer1', 'droSec1', 'droSim1', 'droVir3', 'droWil1', 'droYak2', 'triCas2']
49 class PygrBuildNLMSAMegabase(unittest
.TestCase
):
50 'restrict megatest to an initially empty directory, need large space to perform'
51 def setUp(self
, testDir
= None):
53 tmpList
= [c
for c
in 'PygrBuildNLMSAMegabase']
54 random
.shuffle(tmpList
)
55 testDir
= os
.path
.join(testOutputBaseDir
, 'TEST_' + ''.join(tmpList
)) # FOR TEST, SHOULD BE DELETED
56 if testDir
is None: testDir
= 'TEST_' + ''.join(tmpList
) # NOT SPECIFIED, USE CURRENT DIRECTORY
59 testDir
= os
.path
.realpath(testDir
)
64 tmpFileName
= os
.path
.join(testDir
, 'DELETE_THIS_TEMP_FILE')
65 open(tmpFileName
, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
68 pygr
.Data
.update(self
.path
)
69 from pygr
import seqdb
70 for orgstr
in msaSpeciesList
:
71 genome
= seqdb
.BlastDB(os
.path
.join(seqDir
, orgstr
))
72 genome
.__doc
__ = docStringDict
[orgstr
]
73 pygr
.Data
.addResource('TEST.Seq.Genome.' + orgstr
, genome
)
75 def copyFile(self
, filename
): # COPY A FILE INTO TEST DIRECTORY
76 newname
= os
.path
.join(self
.path
, os
.path
.basename(filename
))
77 open(newname
, 'w').write(open(filename
, 'r').read())
80 'delete the temporary directory and files, restore pygr.Data path'
81 for dirpath
, subdirs
, files
in os
.walk(self
.path
, topdown
= False): # SHOULD BE DELETED BOTTOM-UP FASHION
82 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
83 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
84 for filename
in files
:
85 os
.remove(os
.path
.join(dirpath
, filename
))
87 # Restore original pygr.Data path to remedy lack of isolation
88 # between tests from the same run
89 pygr
.Data
.update(None)
92 class Build_Test(PygrBuildNLMSAMegabase
):
94 'Check pygr.Data contents'
95 l
= pygr
.Data
.dir('TEST')
96 preList
= ['TEST.Seq.Genome.' + orgstr
for orgstr
in msaSpeciesList
]
99 'Test building an NLMSA and querying results'
100 from pygr
import seqdb
, cnestedlist
102 for orgstr
in msaSpeciesList
:
103 genomedict
[orgstr
] = pygr
.Data
.getResource('TEST.Seq.Genome.' + orgstr
)
104 uniondict
= seqdb
.PrefixUnionDict(genomedict
)
106 maflist
= glob
.glob(os
.path
.join(mafDir
, 'chr4h.maf')) # CHR4H TESTING
108 msaname
= os
.path
.join(self
.path
, 'dm2_multiz15way')
109 msa1
= cnestedlist
.NLMSA(msaname
, 'w', uniondict
, maflist
, maxlen
= 536870912, maxint
= 22369620) # 500MB VERSION
110 msa1
.__doc__
= 'TEST NLMSA for dm2 multiz15way'
111 pygr
.Data
.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1
)
113 msa
= pygr
.Data
.getResource('TEST.MSA.UCSC.dm2_multiz15way')
114 outfileName
= os
.path
.join(testInputDir
, 'splicesite_dm2_chr4h.txt') # CHR4H TESTING
115 outputName
= os
.path
.join(testInputDir
, 'splicesite_dm2_chr4h_multiz15way.txt') # CHR4H TESTING
116 newOutputName
= os
.path
.join(self
.path
, 'splicesite_new1.txt')
117 tmpInputName
= self
.copyFile(outfileName
)
118 tmpOutputName
= self
.copyFile(outputName
)
119 outfile
= open(newOutputName
, 'w')
120 for lines
in open(tmpInputName
, 'r').xreadlines():
121 chrid
, intstart
, intend
, nobs
= string
.split(lines
.strip(), '\t')
122 intstart
, intend
, nobs
= int(intstart
), int(intend
), int(nobs
)
123 site1
= msa
.seqDict
['dm2' + '.' + chrid
][intstart
:intstart
+2]
124 site2
= msa
.seqDict
['dm2' + '.' + chrid
][intend
-2:intend
]
125 edges1
= msa
[site1
].edges()
126 edges2
= msa
[site2
].edges()
127 if len(edges1
) == 0: # EMPTY EDGES
128 wlist
= str(site1
), 'dm2', chrid
, intstart
, intstart
+2, '', '', '', '', ''
129 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
130 if len(edges2
) == 0: # EMPTY EDGES
131 wlist
= str(site2
), 'dm2', chrid
, intend
-2, intend
, '', '', '', '', ''
132 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
134 for src
, dest
, e
in edges1
:
135 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
136 dotindex
= (~msa
.seqDict
)[src
].index('.')
137 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
138 dotindex
= (~msa
.seqDict
)[dest
].index('.')
139 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
140 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
141 destspecies
, dest1
, dest
.start
, dest
.stop
142 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
143 for src
, dest
, e
in edges2
:
144 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
145 dotindex
= (~msa
.seqDict
)[src
].index('.')
146 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
147 dotindex
= (~msa
.seqDict
)[dest
].index('.')
148 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
149 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
150 destspecies
, dest1
, dest
.start
, dest
.stop
151 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
152 saveList
.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
153 for saveline
in saveList
:
154 outfile
.write(saveline
)
156 md5old
= hashlib
.md5()
157 md5old
.update(open(newOutputName
, 'r').read())
158 md5new
= hashlib
.md5()
159 md5new
.update(open(tmpOutputName
, 'r').read())
160 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
163 msafilelist
= glob
.glob(msaname
+ '*')
165 cnestedlist
.dump_textfile(msaname
, os
.path
.join(self
.path
, 'dm2_multiz15way.txt'))
166 for filename
in msafilelist
: os
.remove(filename
)
167 runPath
= os
.path
.realpath(os
.curdir
)
169 cnestedlist
.textfile_to_binaries('dm2_multiz15way.txt')
172 msa1
= cnestedlist
.NLMSA(msaname
, 'r')
173 msa1
.__doc__
= 'TEST NLMSA for dm2 multiz15way'
174 pygr
.Data
.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1
)
176 msa
= pygr
.Data
.getResource('TEST.MSA.UCSC.dm2_multiz15way')
177 newOutputName
= os
.path
.join(self
.path
, 'splicesite_new2.txt')
178 tmpInputName
= self
.copyFile(outfileName
)
179 tmpOutputName
= self
.copyFile(outputName
)
180 outfile
= open(newOutputName
, 'w')
181 for lines
in open(tmpInputName
, 'r').xreadlines():
182 chrid
, intstart
, intend
, nobs
= string
.split(lines
.strip(), '\t')
183 intstart
, intend
, nobs
= int(intstart
), int(intend
), int(nobs
)
184 site1
= msa
.seqDict
['dm2' + '.' + chrid
][intstart
:intstart
+2]
185 site2
= msa
.seqDict
['dm2' + '.' + chrid
][intend
-2:intend
]
186 edges1
= msa
[site1
].edges()
187 edges2
= msa
[site2
].edges()
188 if len(edges1
) == 0: # EMPTY EDGES
189 wlist
= str(site1
), 'dm2', chrid
, intstart
, intstart
+2, '', '', '', '', ''
190 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
191 if len(edges2
) == 0: # EMPTY EDGES
192 wlist
= str(site2
), 'dm2', chrid
, intend
-2, intend
, '', '', '', '', ''
193 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
195 for src
, dest
, e
in edges1
:
196 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
197 dotindex
= (~msa
.seqDict
)[src
].index('.')
198 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
199 dotindex
= (~msa
.seqDict
)[dest
].index('.')
200 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
201 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
202 destspecies
, dest1
, dest
.start
, dest
.stop
203 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
204 for src
, dest
, e
in edges2
:
205 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
206 dotindex
= (~msa
.seqDict
)[src
].index('.')
207 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
208 dotindex
= (~msa
.seqDict
)[dest
].index('.')
209 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
210 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
211 destspecies
, dest1
, dest
.start
, dest
.stop
212 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
213 saveList
.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
214 for saveline
in saveList
:
215 outfile
.write(saveline
)
217 md5old
= hashlib
.md5()
218 md5old
.update(open(newOutputName
, 'r').read())
219 md5new
= hashlib
.md5()
220 md5new
.update(open(tmpOutputName
, 'r').read())
221 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
224 if __name__
== '__main__':
225 PygrTestProgram(verbosity
=2)