tests/nlmsa_hg18_megatest.py

   1 import unittest
   2 from testlib import testutil, PygrTestProgram
   3
   4 import ConfigParser, sys, os, string, glob
   5 import pygr.Data
   6
   7 try:
   8     import hashlib
   9 except ImportError:
  10     import md5 as hashlib
  11
  12 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
  13 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
  14 mafDir = config.get('megatests_hg18', 'mafDir')
  15 seqDir = config.get('megatests_hg18', 'seqDir')
  16 smallSampleKey = config.get('megatests_hg18', 'smallSampleKey')
  17 testInputDir = config.get('megatests', 'testInputDir')
  18 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
  19
  20 if smallSampleKey:
  21     smallSamplePostfix = '_' + smallSampleKey
  22 else:
  23     smallSamplePostfix = ''
  24
  25 ## mafDir CONTAINS FOLLOWING DM2 MULTIZ15WAY MAF ALIGNMENTS
  26 ## seqDir CONTAINS FOLLOWING 15 GENOME ASSEMBLIES AND THEIR SEQDB FILES
  27 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
  28 ##        outfileName = 'splicesite_hg18.txt' # CHR4H TESTING
  29 ##        outputName = 'splicesite_hg18_multiz28way.txt' # CHR4H TESTING
  30 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
  31
  32 # DIRECTIONARY FOR DOC STRING OF SEQDB
  33 docStringDict = {
  34     'anoCar1':' Lizard Genome (January 2007)',
  35     'bosTau3':'Cow Genome (August 2006)',
  36     'canFam2':'Dog Genome (May 2005)',
  37     'cavPor2':'Guinea Pig (October 2005)',
  38     'danRer4':'Zebrafish Genome (March 2006)',
  39     'dasNov1':'Armadillo Genome (May 2005)',
  40     'echTel1':'Tenrec Genome (July 2005)',
  41     'eriEur1':'European Hedgehog (Junuary 2006)',
  42     'equCab1':'Horse Genome (January 2007)',
  43     'felCat3':'Cat Genome (March 2006)',
  44     'fr2':'Fugu Genome (October 2004)',
  45     'galGal3':'Chicken Genome (May 2006)',
  46     'gasAcu1':'Stickleback Genome (February 2006)',
  47     'hg18':'Human Genome (May 2006)',
  48     'loxAfr1':'Elephant Genome (May 2005)',
  49     'mm8':'Mouse Genome (March 2006)',
  50     'monDom4':'Opossum Genome (January 2006)',
  51     'ornAna1':'Platypus Genome (March 2007)',
  52     'oryCun1':'Rabbit Genome (May 2005)',
  53     'oryLat1':'Medaka Genome (April 2006)',
  54     'otoGar1':'Bushbaby Genome (December 2006)',
  55     'panTro2':'Chimpanzee Genome (March 2006)',
  56     'rheMac2':'Rhesus Genome (January 2006)',
  57     'rn4':'Rat Genome (November 2004)',
  58     'sorAra1':'Shrew (Junuary 2006)',
  59     'tetNig1':'Tetraodon Genome (February 2004)',
  60     'tupBel1':'Tree Shrew (December 2006)',
  61     'xenTro2':'X. tropicalis Genome (August 2005)'
  62     }
  63
  64 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
  65 msaSpeciesList = ['anoCar1', 'bosTau3', 'canFam2', 'cavPor2', 'danRer4', 'dasNov1', 'echTel1', \
  66                   'equCab1', 'eriEur1', 'felCat3', 'fr2', 'galGal3', 'gasAcu1', 'hg18', 'loxAfr1', \
  67                   'mm8', 'monDom4', 'ornAna1', 'oryCun1', 'oryLat1', 'otoGar1', 'panTro2', 'rheMac2', \
  68                   'rn4', 'sorAra1', 'tetNig1', 'tupBel1', 'xenTro2']
  69
  70 class PygrBuildNLMSAMegabase(unittest.TestCase):
  71     'restrict megatest to an initially empty directory, need large space to perform'
  72     def setUp(self, testDir = None):
  73         import random
  74         tmpList = [c for c in 'PygrBuildNLMSAMegabase']
  75         random.shuffle(tmpList)
  76         testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
  77         if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
  78         try:
  79             os.mkdir(testDir)
  80             testDir = os.path.realpath(testDir)
  81         except:
  82             raise IOError
  83         self.path = testDir
  84         try:
  85             tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
  86             open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
  87         except:
  88             raise IOError
  89         pygr.Data.update(self.path)
  90         from pygr import seqdb
  91         for orgstr in msaSpeciesList:
  92             genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
  93             genome.__doc__ = docStringDict[orgstr]
  94             pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
  95         pygr.Data.save()
  96     def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
  97         newname = os.path.join(self.path, os.path.basename(filename))
  98         open(newname, 'w').write(open(filename, 'r').read())
  99         return newname
 100     def tearDown(self):
 101         'delete the temporary directory and files, restore pygr.Data path'
 102         for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
 103             # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
 104             # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
 105             for filename in files:
 106                 os.remove(os.path.join(dirpath, filename))
 107             os.rmdir(dirpath)
 108         # Restore original pygr.Data path to remedy lack of isolation
 109         # between tests from the same run
 110         pygr.Data.update(None)
 111
 112
 113 class Build_Test(PygrBuildNLMSAMegabase):
 114     def test_seqdb(self):
 115         'Check pygr.Data contents'
 116         l = pygr.Data.dir('TEST')
 117         preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
 118         assert l == preList
 119     def test_build(self):
 120         'Test building an NLMSA and querying results'
 121         from pygr import seqdb, cnestedlist
 122         genomedict = {}
 123         for orgstr in msaSpeciesList:
 124             genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' + orgstr)
 125         uniondict = seqdb.PrefixUnionDict(genomedict)
 126         if smallSampleKey:
 127             maflist = ( os.path.join(mafDir, smallSampleKey + '.maf'), )
 128         else:
 129             maflist = glob.glob(os.path.join(mafDir, '*.maf'))
 130             maflist.sort()
 131         msaname = os.path.join(self.path, 'hg18_multiz28way')
 132         msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist, maxlen = 536870912, maxint = 22369620) # 500MB VERSION
 133         msa1.__doc__ = 'TEST NLMSA for hg18 multiz28way'
 134         pygr.Data.addResource('TEST.MSA.UCSC.hg18_multiz28way', msa1)
 135         pygr.Data.save()
 136         msa = pygr.Data.getResource('TEST.MSA.UCSC.hg18_multiz28way')
 137         outfileName = os.path.join(testInputDir, 'splicesite_hg18%s.txt' % smallSamplePostfix)
 138         outputName = os.path.join(testInputDir, 'splicesite_hg18%s_multiz28way.txt' % smallSamplePostfix)
 139         newOutputName = os.path.join(self.path, 'splicesite_new1.txt')
 140         tmpInputName = self.copyFile(outfileName)
 141         tmpOutputName = self.copyFile(outputName)
 142         outfile = open(newOutputName, 'w')
 143         for lines in open(tmpInputName, 'r').xreadlines():
 144             chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
 145             intstart, intend, nobs = int(intstart), int(intend), int(nobs)
 146             site1 = msa.seqDict['hg18' + '.' + chrid][intstart:intstart+2]
 147             site2 = msa.seqDict['hg18' + '.' + chrid][intend-2:intend]
 148             edges1 = msa[site1].edges()
 149             edges2 = msa[site2].edges()
 150             if len(edges1) == 0: # EMPTY EDGES
 151                 wlist = str(site1), 'hg18', chrid, intstart, intstart+2, '', '', '', '', ''
 152                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 153             if len(edges2) == 0: # EMPTY EDGES
 154                 wlist = str(site2), 'hg18', chrid, intend-2, intend, '', '', '', '', ''
 155                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 156             saveList = []
 157             for src, dest, e in edges1:
 158                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 159                 dotindex = (~msa.seqDict)[src].index('.')
 160                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 161                 dotindex = (~msa.seqDict)[dest].index('.')
 162                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 163                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 164                     destspecies, dest1, dest.start, dest.stop
 165                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 166             for src, dest, e in edges2:
 167                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 168                 dotindex = (~msa.seqDict)[src].index('.')
 169                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 170                 dotindex = (~msa.seqDict)[dest].index('.')
 171                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 172                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 173                     destspecies, dest1, dest.start, dest.stop
 174                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 175             saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
 176             for saveline in saveList:
 177                 outfile.write(saveline)
 178         outfile.close()
 179         md5old = hashlib.md5()
 180         md5old.update(open(newOutputName, 'r').read())
 181         md5new = hashlib.md5()
 182         md5new.update(open(tmpOutputName, 'r').read())
 183         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 184
 185         # TEXT<->BINARY TEST
 186         msafilelist = glob.glob(msaname + '*')
 187         msa.save_seq_dict()
 188         cnestedlist.dump_textfile(msaname, os.path.join(self.path, 'hg18_multiz28way.txt'))
 189         for filename in msafilelist: os.remove(filename)
 190         runPath = os.path.realpath(os.curdir)
 191         os.chdir(self.path)
 192         cnestedlist.textfile_to_binaries('hg18_multiz28way.txt')
 193         os.chdir(runPath)
 194
 195         msa1 = cnestedlist.NLMSA(msaname, 'r')
 196         msa1.__doc__ = 'TEST NLMSA for hg18 multiz28way'
 197         pygr.Data.addResource('TEST.MSA.UCSC.hg18_multiz28way', msa1)
 198         pygr.Data.save()
 199         msa = pygr.Data.getResource('TEST.MSA.UCSC.hg18_multiz28way')
 200         newOutputName = os.path.join(self.path, 'splicesite_new2.txt')
 201         tmpInputName = self.copyFile(outfileName)
 202         tmpOutputName = self.copyFile(outputName)
 203         outfile = open(newOutputName, 'w')
 204         for lines in open(tmpInputName, 'r').xreadlines():
 205             chrid, intstart, intend, nobs = string.split(lines.strip(), '\t')
 206             intstart, intend, nobs = int(intstart), int(intend), int(nobs)
 207             site1 = msa.seqDict['hg18' + '.' + chrid][intstart:intstart+2]
 208             site2 = msa.seqDict['hg18' + '.' + chrid][intend-2:intend]
 209             edges1 = msa[site1].edges()
 210             edges2 = msa[site2].edges()
 211             if len(edges1) == 0: # EMPTY EDGES
 212                 wlist = str(site1), 'hg18', chrid, intstart, intstart+2, '', '', '', '', ''
 213                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 214             if len(edges2) == 0: # EMPTY EDGES
 215                 wlist = str(site2), 'hg18', chrid, intend-2, intend, '', '', '', '', ''
 216                 outfile.write('\t'.join(map(str, wlist)) + '\n')
 217             saveList = []
 218             for src, dest, e in edges1:
 219                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 220                 dotindex = (~msa.seqDict)[src].index('.')
 221                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 222                 dotindex = (~msa.seqDict)[dest].index('.')
 223                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 224                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 225                     destspecies, dest1, dest.start, dest.stop
 226                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 227             for src, dest, e in edges2:
 228                 if len(str(src)) != 2 or len(str(dest)) != 2: continue
 229                 dotindex = (~msa.seqDict)[src].index('.')
 230                 srcspecies, src1 = (~msa.seqDict)[src][:dotindex], (~msa.seqDict)[src][dotindex+1:]
 231                 dotindex = (~msa.seqDict)[dest].index('.')
 232                 destspecies, dest1 = (~msa.seqDict)[dest][:dotindex], (~msa.seqDict)[dest][dotindex+1:]
 233                 wlist = str(src), srcspecies, src1, src.start, src.stop, str(dest), \
 234                     destspecies, dest1, dest.start, dest.stop
 235                 saveList.append('\t'.join(map(str, wlist)) + '\n')
 236             saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
 237             for saveline in saveList:
 238                 outfile.write(saveline)
 239         outfile.close()
 240         md5old = hashlib.md5()
 241         md5old.update(open(newOutputName, 'r').read())
 242         md5new = hashlib.md5()
 243         md5new.update(open(tmpOutputName, 'r').read())
 244         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 245
 246
 247 if __name__ == '__main__':
 248     PygrTestProgram(verbosity=2)
 249