tests/nlmsa_dm2_megatest.py

   1 import unittest
   2 from testlib import testutil, PygrTestProgram
   3
   4 import ConfigParser, sys, os, string, glob
   5 import pygr.Data
   6
   7 try:
   8     import hashlib
   9 except ImportError:
  10     import md5 as hashlib
  11
  12 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
  13 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
  14 mafDir = config.get('megatests_dm2', 'mafDir')
  15 seqDir = config.get('megatests_dm2', 'seqDir')
  16 testInputDir = config.get('megatests', 'testInputDir')
  17 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
  18
  19 ## mafDir CONTAINS FOLLOWING DM2 MULTIZ15WAY MAF ALIGNMENTS
  20 ## seqDir CONTAINS FOLLOWING 15 GENOME ASSEMBLIES AND THEIR SEQDB FILES
  21 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
  22 ##        outfileName = 'splicesite_dm2.txt' # CHR4H TESTING
  23 ##        outputName = 'splicesite_dm2_multiz15way.txt' # CHR4H TESTING
  24 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
  25
  26 # DIRECTIONARY FOR DOC STRING OF SEQDB
  27 docStringDict = {
  28     'anoGam1':'A. gambiae Genome (February 2003)',
  29     'apiMel2':'A. mellifera Genome (January 2005)',
  30     'dm2':'D. melanogaster Genome (April 2004)',
  31     'dp4':'D. pseudoobscura Genome (February 2006)',
  32     'droAna3':'D. ananassae Genome (February 2006)',
  33     'droEre2':'D. erecta Genome (February 2006)',
  34     'droGri2':'D. grimshawi Genome (February 2006)',
  35     'droMoj3':'D. mojavensis Genome (February 2006)',
  36     'droPer1':'D. persimilis Genome (October 2005)',
  37     'droSec1':'D. sechellia Genome (October 2005)',
  38     'droSim1':'D. simulans Genome (April 2005)',
  39     'droVir3':'D. virilis Genome (February 2006)',
  40     'droWil1':'D. willistoni Genome (February 2006)',
  41     'droYak2':'D. yakuba Genome (November 2005)',
  42     'triCas2':'T. castaneum Genome (September 2005)',
  43     }
  44
  45 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
  46 msaSpeciesList = ['anoGam1', 'apiMel2', 'dm2', 'dp4', 'droAna3', 'droEre2', 'droGri2', 'droMoj3', \
  47                   'droPer1', 'droSec1', 'droSim1', 'droVir3', 'droWil1', 'droYak2', 'triCas2']
  48
  49 class PygrBuildNLMSAMegabase(unittest.TestCase):
  50     'restrict megatest to an initially empty directory, need large space to perform'
  51     def setUp(self, testDir = None):
  52         import random
  53         tmpList = [c for c in 'PygrBuildNLMSAMegabase']
  54         random.shuffle(tmpList)
  55         testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
  56         if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
  57         try:
  58             os.mkdir(testDir)
  59             testDir = os.path.realpath(testDir)
  60         except:
  61             raise IOError
  62         self.path = testDir
  63         try:
  64             tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
  65             open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
  66         except:
  67             raise IOError
  68         pygr.Data.update(self.path)
  69         from pygr import seqdb
  70         for orgstr in msaSpeciesList:
  71             genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
  72             genome.__doc__ = docStringDict[orgstr]
  73             pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
  74         pygr.Data.save()
  75     def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
  76         newname = os.path.join(self.path, os.path.basename(filename))
  77         open(newname, 'w').write(open(filename, 'r').read())
  78         return newname
  79     def tearDown(self):
  80         'delete the temporary directory and files, restore pygr.Data path'
  81         for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
  82             # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
  83             # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
  84             for filename in files:
  85                 os.remove(os.path.join(dirpath, filename))
  86             os.rmdir(dirpath)
  87         # Restore original pygr.Data path to remedy lack of isolation
  88         # between tests from the same run
  89         pygr.Data.update(None)
  90
  91
  92 class Build_Test(PygrBuildNLMSAMegabase):
  93     def test_seqdb(self):
  94         'Check pygr.Data contents'
  95         l = pygr.Data.dir('TEST')
  96         preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
  97         assert l == preList
  98     def test_build(self):
  99         'Test building an NLMSA and querying results'
 100         from pygr import seqdb, cnestedlist
 101         genomedict = {}
 102         for orgstr in msaSpeciesList:
 103             genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' + orgstr)
 104         uniondict = seqdb.PrefixUnionDict(genomedict)
 105         import glob
 106         maflist = glob.glob(os.path.join(mafDir, 'chr4h.maf')) # CHR4H TESTING
 107         maflist.sort()
 108         msaname = os.path.join(self.path, 'dm2_multiz15way')
 109         msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist, maxlen = 536870912, maxint = 22369620) # 500MB VERSION
 110         msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
 111         pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
 112         pygr.Data.save()
 113         msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
 114         outfileName = os.path.join(testInputDir, 'splicesite_dm2_chr4h.txt') # CHR4H TESTING
 115         outputName = os.path.join(testInputDir, 'splicesite_dm2_chr4h_multiz15way.txt') # CHR4H TESTING
 116         newOutputName = os.path.join(self.path, 'splicesite_new1.txt')
 117         tmpInputName = self.copyFile(outfileName)
 118         tmpOutputName = self.copyFile(outputName)
 119         outfile = open(newOutputName, 'w')
 120         for lines in open(tmpInputName, 'r').xreadlines():
 121             chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
 122             intstart, intend, nobs = int(intstart), int(intend), int(nobs)
 123             site1 = msa.seqDict['dm2' + '.' + chrid][intstart:intstart+2]
 124             site2 = msa.seqDict['dm2' + '.' + chrid][intend-2:intend]
 125             edges1 = msa[site1].edges()
 126             edges2 = msa[site2].edges()
 127             if len(edges1) == 0: # EMPTY EDGES
 128                 wlist = str(site1), 'dm2', chrid, intstart, intstart+2, '', '', '', '', ''
 129                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 130             if len(edges2) == 0: # EMPTY EDGES
 131                 wlist = str(site2), 'dm2', chrid, intend-2, intend, '', '', '', '', ''
 132                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 133             saveList = []
 134             for src, dest, e in edges1:
 135                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 136                 dotindex = (~msa.seqDict)[src].index('.')
 137                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 138                 dotindex = (~msa.seqDict)[dest].index('.')
 139                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 140                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 141                     destspecies, dest1, dest.start, dest.stop
 142                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 143             for src, dest, e in edges2:
 144                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 145                 dotindex = (~msa.seqDict)[src].index('.')
 146                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 147                 dotindex = (~msa.seqDict)[dest].index('.')
 148                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 149                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 150                     destspecies, dest1, dest.start, dest.stop
 151                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 152             saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
 153             for saveline in saveList:
 154                 outfile.write(saveline)
 155         outfile.close()
 156         md5old = hashlib.md5()
 157         md5old.update(open(newOutputName, 'r').read())
 158         md5new = hashlib.md5()
 159         md5new.update(open(tmpOutputName, 'r').read())
 160         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 161
 162         # TEXT<->BINARY TEST
 163         msafilelist = glob.glob(msaname + '*')
 164         msa.save_seq_dict()
 165         cnestedlist.dump_textfile(msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
 166         for filename in msafilelist: os.remove(filename)
 167         runPath = os.path.realpath(os.curdir)
 168         os.chdir(self.path)
 169         cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
 170         os.chdir(runPath)
 171
 172         msa1 = cnestedlist.NLMSA(msaname, 'r')
 173         msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
 174         pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
 175         pygr.Data.save()
 176         msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
 177         newOutputName = os.path.join(self.path, 'splicesite_new2.txt')
 178         tmpInputName = self.copyFile(outfileName)
 179         tmpOutputName = self.copyFile(outputName)
 180         outfile = open(newOutputName, 'w')
 181         for lines in open(tmpInputName, 'r').xreadlines():
 182             chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
 183             intstart, intend, nobs = int(intstart), int(intend), int(nobs)
 184             site1 = msa.seqDict['dm2' + '.' + chrid][intstart:intstart+2]
 185             site2 = msa.seqDict['dm2' + '.' + chrid][intend-2:intend]
 186             edges1 = msa[site1].edges()
 187             edges2 = msa[site2].edges()
 188             if len(edges1) == 0: # EMPTY EDGES
 189                 wlist = str(site1), 'dm2', chrid, intstart, intstart+2, '', '', '', '', ''
 190                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 191             if len(edges2) == 0: # EMPTY EDGES
 192                 wlist = str(site2), 'dm2', chrid, intend-2, intend, '', '', '', '', ''
 193                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 194             saveList = []
 195             for src, dest, e in edges1:
 196                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 197                 dotindex = (~msa.seqDict)[src].index('.')
 198                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 199                 dotindex = (~msa.seqDict)[dest].index('.')
 200                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 201                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 202                     destspecies, dest1, dest.start, dest.stop
 203                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 204             for src, dest, e in edges2:
 205                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 206                 dotindex = (~msa.seqDict)[src].index('.')
 207                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 208                 dotindex = (~msa.seqDict)[dest].index('.')
 209                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 210                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 211                     destspecies, dest1, dest.start, dest.stop
 212                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 213             saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
 214             for saveline in saveList:
 215                 outfile.write(saveline)
 216         outfile.close()
 217         md5old = hashlib.md5()
 218         md5old.update(open(newOutputName, 'r').read())
 219         md5new = hashlib.md5()
 220         md5new.update(open(tmpOutputName, 'r').read())
 221         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 222
 223
 224 if __name__ == '__main__':
 225     PygrTestProgram(verbosity=2)
 226