tests/annotation_dm2_megatest.py

   1 import unittest
   2 from testlib import testutil, PygrTestProgram
   3
   4 import ConfigParser, sys, os, string
   5 from pygr.mapping import Collection
   6 import pygr.Data
   7
   8 try:
   9     import hashlib
  10 except ImportError:
  11     import md5 as hashlib
  12
  13 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
  14 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
  15 msaDir = config.get('megatests_dm2', 'msaDir')
  16 seqDir = config.get('megatests_dm2', 'seqDir')
  17 smallSampleKey = config.get('megatests_dm2', 'smallSampleKey')
  18 testInputDB = config.get('megatests', 'testInputDB')
  19 testInputDir = config.get('megatests', 'testInputDir')
  20 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
  21
  22 if smallSampleKey:
  23     smallSamplePostfix = '_' + smallSampleKey
  24 else:
  25     smallSamplePostfix = ''
  26
  27 ## msaDir CONTAINS PRE-BUILT NLMSA
  28 ## seqDir CONTAINS GENOME ASSEMBLIES AND THEIR SEQDB FILES
  29 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
  30 ##        exonAnnotFileName = 'Annotation_ConservedElement_Exons_dm2.txt'
  31 ##        intronAnnotFileName = 'Annotation_ConservedElement_Introns_dm2.txt'
  32 ##        stopAnnotFileName = 'Annotation_ConservedElement_Stop_dm2.txt'
  33 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
  34
  35 # DIRECTIONARY FOR DOC STRING OF SEQDB
  36 docStringDict = {
  37     'anoGam1':'A. gambiae Genome (February 2003)',
  38     'apiMel2':'A. mellifera Genome (January 2005)',
  39     'dm2':'D. melanogaster Genome (April 2004)',
  40     'dp4':'D. pseudoobscura Genome (February 2006)',
  41     'droAna3':'D. ananassae Genome (February 2006)',
  42     'droEre2':'D. erecta Genome (February 2006)',
  43     'droGri2':'D. grimshawi Genome (February 2006)',
  44     'droMoj3':'D. mojavensis Genome (February 2006)',
  45     'droPer1':'D. persimilis Genome (October 2005)',
  46     'droSec1':'D. sechellia Genome (October 2005)',
  47     'droSim1':'D. simulans Genome (April 2005)',
  48     'droVir3':'D. virilis Genome (February 2006)',
  49     'droWil1':'D. willistoni Genome (February 2006)',
  50     'droYak2':'D. yakuba Genome (November 2005)',
  51     'triCas2':'T. castaneum Genome (September 2005)',
  52     }
  53
  54 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
  55 msaSpeciesList = ['anoGam1', 'apiMel2', 'dm2', 'dp4', 'droAna3', 'droEre2', 'droGri2', 'droMoj3', \
  56                   'droPer1', 'droSec1', 'droSim1', 'droVir3', 'droWil1', 'droYak2', 'triCas2']
  57
  58 class PygrBuildNLMSAMegabase(unittest.TestCase):
  59     'restrict megatest to an initially empty directory, need large space to perform'
  60     def setUp(self, testDir = None):
  61         import random
  62         tmpList = [c for c in 'PygrBuildNLMSAMegabase']
  63         random.shuffle(tmpList)
  64         testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
  65         if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
  66         try:
  67             os.mkdir(testDir)
  68             testDir = os.path.realpath(testDir)
  69         except:
  70             raise IOError
  71         self.path = testDir
  72         try:
  73             tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
  74             open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
  75         except:
  76             raise IOError
  77         pygr.Data.update(self.path)
  78         from pygr import seqdb
  79         for orgstr in msaSpeciesList:
  80             genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
  81             genome.__doc__ = docStringDict[orgstr]
  82             pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
  83         pygr.Data.save()
  84     def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
  85         newname = os.path.join(self.path, os.path.basename(filename))
  86         open(newname, 'w').write(open(filename, 'r').read())
  87         return newname
  88     def tearDown(self):
  89         'delete the temporary directory and files, restore pygr.Data path'
  90         for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
  91             # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
  92             # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
  93             for filename in files:
  94                 os.remove(os.path.join(dirpath, filename))
  95             os.rmdir(dirpath)
  96         # Restore original pygr.Data path to remedy lack of isolation
  97         # between tests from the same run
  98         pygr.Data.update(None)
  99
 100
 101 class Build_Test(PygrBuildNLMSAMegabase):
 102     def test_seqdb(self):
 103         'Check pygr.Data contents'
 104         l = pygr.Data.dir('TEST')
 105         preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
 106         assert l == preList
 107     def test_collectionannot(self):
 108         'Test building an AnnotationDB from file'
 109         from pygr import seqdb, cnestedlist, sqlgraph
 110         dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
 111         # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
 112         exon_slices = Collection(filename = os.path.join(self.path, 'refGene_exonAnnot_dm2.cdb'), \
 113             intKeys = True, mode = 'cr', writeback = False)
 114         exon_db = seqdb.AnnotationDB(exon_slices, dm2,
 115                                sliceAttrDict = dict(id = 0, exon_id = 1, orientation = 2,
 116                                                   gene_id = 3, start = 4, stop = 5))
 117         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_dm2'), 'w', \
 118             pairwiseMode = True, bidirectional = False)
 119         for lines in open(os.path.join(testInputDir, 'refGene_exonAnnot%s_dm2.txt' % smallSamplePostfix), 'r').xreadlines():
 120             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 121             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 122             exon_slices[row[1]] = row
 123             exon = exon_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 124             msa.addAnnotation(exon) # SAVE IT TO GENOME MAPPING
 125         exon_db.clear_cache() # not really necessary; cache should autoGC
 126         exon_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 127         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 128         exon_db.__doc__ = 'Exon Annotation Database for dm2'
 129         pygr.Data.addResource('TEST.Annotation.dm2.exons', exon_db)
 130         msa.__doc__ = 'NLMSA Exon for dm2'
 131         pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.exons', msa)
 132         exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db, bindAttrs = ('exon1',))
 133         exon_schema.__doc__ = 'Exon Schema for dm2'
 134         pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.exons', exon_schema)
 135         # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
 136         splice_slices = Collection(filename = os.path.join(self.path, 'refGene_spliceAnnot_dm2.cdb'), \
 137             intKeys = True, mode = 'cr', writeback = False)
 138         splice_db = seqdb.AnnotationDB(splice_slices, dm2,
 139                                sliceAttrDict = dict(id = 0, splice_id = 1, orientation = 2,
 140                                                   gene_id = 3, start = 4, stop = 5))
 141         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_dm2'), 'w', \
 142             pairwiseMode = True, bidirectional = False)
 143         for lines in open(os.path.join(testInputDir, 'refGene_spliceAnnot%s_dm2.txt' % smallSamplePostfix), 'r').xreadlines():
 144             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 145             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 146             splice_slices[row[1]] = row
 147             splice = splice_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 148             msa.addAnnotation(splice) # SAVE IT TO GENOME MAPPING
 149         splice_db.clear_cache() # not really necessary; cache should autoGC
 150         splice_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 151         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 152         splice_db.__doc__ = 'Splice Annotation Database for dm2'
 153         pygr.Data.addResource('TEST.Annotation.dm2.splices', splice_db)
 154         msa.__doc__ = 'NLMSA Splice for dm2'
 155         pygr.Data.addResource('TEST.Annotation.NLMSA.dm2.splices', msa)
 156         splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db, bindAttrs = ('splice1',))
 157         splice_schema.__doc__ = 'Splice Schema for dm2'
 158         pygr.Data.addSchema('TEST.Annotation.NLMSA.dm2.splices', splice_schema)
 159         # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
 160         ucsc_slices = Collection(filename = os.path.join(self.path, 'phastConsElements15way_dm2.cdb'), \
 161             intKeys = True, mode = 'cr', writeback = False)
 162         ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
 163                                sliceAttrDict = dict(id = 0, ucsc_id = 1, orientation = 2,
 164                                                   gene_id = 3, start = 4, stop = 5))
 165         msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements15way_dm2'), 'w', \
 166             pairwiseMode = True, bidirectional = False)
 167         for lines in open(os.path.join(testInputDir, 'phastConsElements15way%s_dm2.txt' % smallSamplePostfix), 'r').xreadlines():
 168             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 169             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 170             ucsc_slices[row[1]] = row
 171             ucsc = ucsc_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 172             msa.addAnnotation(ucsc) # SAVE IT TO GENOME MAPPING
 173         ucsc_db.clear_cache() # not really necessary; cache should autoGC
 174         ucsc_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 175         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 176         ucsc_db.__doc__ = 'Most Conserved Elements for dm2'
 177         pygr.Data.addResource('TEST.Annotation.UCSC.dm2.mostconserved', ucsc_db)
 178         msa.__doc__ = 'NLMSA for Most Conserved Elements for dm2'
 179         pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved', msa)
 180         ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db, bindAttrs = ('element1',))
 181         ucsc_schema.__doc__ = 'Schema for UCSC Most Conserved Elements for dm2'
 182         pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved', ucsc_schema)
 183         pygr.Data.save()
 184         pygr.Data.clear_cache() # force resources to reload when requested
 185
 186         # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
 187         dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
 188         exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.exons')
 189         splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.splices')
 190         conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved')
 191         exons = pygr.Data.getResource('TEST.Annotation.dm2.exons')
 192         splices = pygr.Data.getResource('TEST.Annotation.dm2.splices')
 193         mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.dm2.mostconserved')
 194
 195         # OPEN DM2_MULTIZ15WAY NLMSA
 196         msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r', trypath = [seqDir])
 197
 198         exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
 199         intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix)
 200         newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
 201         newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
 202         tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
 203         tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
 204
 205         if smallSampleKey:
 206             chrList = [ smallSampleKey ]
 207         else:
 208             chrList = dm2.seqLenDict.keys()
 209             chrList.sort()
 210
 211         outfile = open(newexonAnnotFileName, 'w')
 212         for chrid in chrList:
 213             slice = dm2[chrid]
 214             try:
 215                 ex1 = exonmsa[slice]
 216             except KeyError:
 217                 continue
 218             else:
 219                 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
 220                 exlist1.sort()
 221                 for ixx, exon in exlist1:
 222                     saveList = []
 223                     tmp = exon.sequence
 224                     tmpexon = exons[exon.exon_id]
 225                     tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
 226                     wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
 227                     try:
 228                         out1 = conservedmsa[tmp]
 229                     except KeyError:
 230                         pass
 231                     else:
 232                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 233                         elementlist.sort()
 234                         for iyy, element in elementlist:
 235                             if element.stop - element.start < 100: continue
 236                             score = int(string.split(element.gene_id, '=')[1])
 237                             if score < 100: continue
 238                             tmp2 = element.sequence
 239                             tmpelement = mostconserved[element.ucsc_id]
 240                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 241                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 242                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 243                             tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
 244                             edges = msa[tmp1].edges()
 245                             for src, dest, e in edges:
 246                                 if src.stop - src.start < 100: continue
 247                                 palign, pident = e.pAligned(), e.pIdentity()
 248                                 if palign < 0.8 or pident < 0.8: continue
 249                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 250                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 251                                     (~msa.seqDict)[dest], \
 252                                     str(dest), dest.start, dest.stop, palign, pident)
 253                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 254                         saveList.sort()
 255                         for saveline in saveList:
 256                             outfile.write(saveline)
 257         outfile.close()
 258         md5old = hashlib.md5()
 259         md5old.update(open(tmpexonAnnotFileName, 'r').read())
 260         md5new = hashlib.md5()
 261         md5new.update(open(newexonAnnotFileName, 'r').read())
 262         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 263
 264         outfile = open(newintronAnnotFileName, 'w')
 265         for chrid in chrList:
 266             slice = dm2[chrid]
 267             try:
 268                 sp1 = splicemsa[slice]
 269             except:
 270                 continue
 271             else:
 272                 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
 273                 splist1.sort()
 274                 for ixx, splice in splist1:
 275                     saveList = []
 276                     tmp = splice.sequence
 277                     tmpsplice = splices[splice.splice_id]
 278                     tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
 279                     wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
 280                     try:
 281                         out1 = conservedmsa[tmp]
 282                     except KeyError:
 283                         pass
 284                     else:
 285                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 286                         elementlist.sort()
 287                         for iyy, element in elementlist:
 288                             if element.stop - element.start < 100: continue
 289                             score = int(string.split(element.gene_id, '=')[1])
 290                             if score < 100: continue
 291                             tmp2 = element.sequence
 292                             tmpelement = mostconserved[element.ucsc_id]
 293                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 294                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 295                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 296                             tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
 297                             edges = msa[tmp1].edges()
 298                             for src, dest, e in edges:
 299                                 if src.stop - src.start < 100: continue
 300                                 palign, pident = e.pAligned(), e.pIdentity()
 301                                 if palign < 0.8 or pident < 0.8: continue
 302                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 303                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 304                                     (~msa.seqDict)[dest], \
 305                                     str(dest), dest.start, dest.stop, palign, pident)
 306                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 307                         saveList.sort()
 308                         for saveline in saveList:
 309                             outfile.write(saveline)
 310         outfile.close()
 311         md5old = hashlib.md5()
 312         md5old.update(open(tmpintronAnnotFileName, 'r').read())
 313         md5new = hashlib.md5()
 314         md5new.update(open(newintronAnnotFileName, 'r').read())
 315         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 316
 317     def test_mysqlannot(self):
 318         'Test building an AnnotationDB from MySQL'
 319         from pygr import seqdb, cnestedlist, sqlgraph
 320         dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
 321         # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
 322         exon_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_exonAnnot%s_dm2' % ( testInputDB, smallSamplePostfix ),
 323             clusterKey = 'chromosome', maxCache = 0)
 324         exon_db = seqdb.AnnotationDB(exon_slices, dm2, sliceAttrDict = dict(id = 'chromosome', \
 325             gene_id = 'name', exon_id = 'exon_id'))
 326         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_SQL_dm2'), 'w', \
 327             pairwiseMode = True, bidirectional = False)
 328         for id in exon_db:
 329             msa.addAnnotation(exon_db[id])
 330         exon_db.clear_cache() # not really necessary; cache should autoGC
 331         exon_slices.clear_cache()
 332         msa.build()
 333         exon_db.__doc__ = 'SQL Exon Annotation Database for dm2'
 334         pygr.Data.addResource('TEST.Annotation.SQL.dm2.exons', exon_db)
 335         msa.__doc__ = 'SQL NLMSA Exon for dm2'
 336         pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.exons', msa)
 337         exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db, bindAttrs = ('exon2',))
 338         exon_schema.__doc__ = 'SQL Exon Schema for dm2'
 339         pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.exons', exon_schema)
 340         # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
 341         splice_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_spliceAnnot%s_dm2' % ( testInputDB, smallSamplePostfix ),
 342             clusterKey = 'chromosome', maxCache = 0)
 343         splice_db = seqdb.AnnotationDB(splice_slices, dm2, sliceAttrDict = dict(id = 'chromosome', \
 344             gene_id = 'name', splice_id = 'splice_id'))
 345         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_SQL_dm2'), 'w', \
 346             pairwiseMode = True, bidirectional = False)
 347         for id in splice_db:
 348             msa.addAnnotation(splice_db[id])
 349         splice_db.clear_cache() # not really necessary; cache should autoGC
 350         splice_slices.clear_cache()
 351         msa.build()
 352         splice_db.__doc__ = 'SQL Splice Annotation Database for dm2'
 353         pygr.Data.addResource('TEST.Annotation.SQL.dm2.splices', splice_db)
 354         msa.__doc__ = 'SQL NLMSA Splice for dm2'
 355         pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.splices', msa)
 356         splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db, bindAttrs = ('splice2',))
 357         splice_schema.__doc__ = 'SQL Splice Schema for dm2'
 358         pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.splices', splice_schema)
 359         # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC: MYSQL VERSION
 360         ucsc_slices = sqlgraph.SQLTableClustered('%s.pygr_phastConsElements15way%s_dm2' % ( testInputDB, smallSamplePostfix ),
 361             clusterKey = 'chromosome', maxCache = 0)
 362         ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2, sliceAttrDict = dict(id = 'chromosome', \
 363             gene_id = 'name', ucsc_id = 'ucsc_id'))
 364         msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements15way_SQL_dm2'), 'w', \
 365             pairwiseMode = True, bidirectional = False)
 366         for id in ucsc_db:
 367             msa.addAnnotation(ucsc_db[id])
 368         ucsc_db.clear_cache() # not really necessary; cache should autoGC
 369         ucsc_slices.clear_cache()
 370         msa.build()
 371         ucsc_db.__doc__ = 'SQL Most Conserved Elements for dm2'
 372         pygr.Data.addResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved', ucsc_db)
 373         msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for dm2'
 374         pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', msa)
 375         ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db, bindAttrs = ('element2',))
 376         ucsc_schema.__doc__ = 'SQL Schema for UCSC Most Conserved Elements for dm2'
 377         pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', ucsc_schema)
 378         pygr.Data.save()
 379         pygr.Data.clear_cache()
 380
 381         # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
 382         dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
 383         exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.exons')
 384         splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.splices')
 385         conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved')
 386         exons = pygr.Data.getResource('TEST.Annotation.SQL.dm2.exons')
 387         splices = pygr.Data.getResource('TEST.Annotation.SQL.dm2.splices')
 388         mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved')
 389
 390         # OPEN DM2_MULTIZ15WAY NLMSA
 391         msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r', trypath = [seqDir])
 392
 393         exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
 394         intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix)
 395         newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
 396         newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
 397         tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
 398         tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
 399
 400         if smallSampleKey:
 401             chrList = [ smallSampleKey ]
 402         else:
 403             chrList = dm2.seqLenDict.keys()
 404             chrList.sort()
 405
 406         outfile = open(newexonAnnotFileName, 'w')
 407         for chrid in chrList:
 408             slice = dm2[chrid]
 409             try:
 410                 ex1 = exonmsa[slice]
 411             except KeyError:
 412                 continue
 413             else:
 414                 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
 415                 exlist1.sort()
 416                 for ixx, exon in exlist1:
 417                     saveList = []
 418                     tmp = exon.sequence
 419                     tmpexon = exons[exon.exon_id]
 420                     tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
 421                     wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
 422                     try:
 423                         out1 = conservedmsa[tmp]
 424                     except KeyError:
 425                         pass
 426                     else:
 427                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 428                         elementlist.sort()
 429                         for iyy, element in elementlist:
 430                             if element.stop - element.start < 100: continue
 431                             score = int(string.split(element.gene_id, '=')[1])
 432                             if score < 100: continue
 433                             tmp2 = element.sequence
 434                             tmpelement = mostconserved[element.ucsc_id]
 435                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 436                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 437                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 438                             tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
 439                             edges = msa[tmp1].edges()
 440                             for src, dest, e in edges:
 441                                 if src.stop - src.start < 100: continue
 442                                 palign, pident = e.pAligned(), e.pIdentity()
 443                                 if palign < 0.8 or pident < 0.8: continue
 444                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 445                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 446                                     (~msa.seqDict)[dest], \
 447                                     str(dest), dest.start, dest.stop, palign, pident)
 448                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 449                         saveList.sort()
 450                         for saveline in saveList:
 451                             outfile.write(saveline)
 452         outfile.close()
 453         md5old = hashlib.md5()
 454         md5old.update(open(tmpexonAnnotFileName, 'r').read())
 455         md5new = hashlib.md5()
 456         md5new.update(open(newexonAnnotFileName, 'r').read())
 457         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 458
 459         outfile = open(newintronAnnotFileName, 'w')
 460         for chrid in chrList:
 461             slice = dm2[chrid]
 462             try:
 463                 sp1 = splicemsa[slice]
 464             except:
 465                 continue
 466             else:
 467                 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
 468                 splist1.sort()
 469                 for ixx, splice in splist1:
 470                     saveList = []
 471                     tmp = splice.sequence
 472                     tmpsplice = splices[splice.splice_id]
 473                     tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
 474                     wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
 475                     try:
 476                         out1 = conservedmsa[tmp]
 477                     except KeyError:
 478                         pass
 479                     else:
 480                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 481                         elementlist.sort()
 482                         for iyy, element in elementlist:
 483                             if element.stop - element.start < 100: continue
 484                             score = int(string.split(element.gene_id, '=')[1])
 485                             if score < 100: continue
 486                             tmp2 = element.sequence
 487                             tmpelement = mostconserved[element.ucsc_id]
 488                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 489                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 490                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 491                             tmp1 = msa.seqDict['dm2.' + chrid][slicestart:sliceend]
 492                             edges = msa[tmp1].edges()
 493                             for src, dest, e in edges:
 494                                 if src.stop - src.start < 100: continue
 495                                 palign, pident = e.pAligned(), e.pIdentity()
 496                                 if palign < 0.8 or pident < 0.8: continue
 497                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 498                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 499                                     (~msa.seqDict)[dest], \
 500                                     str(dest), dest.start, dest.stop, palign, pident)
 501                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 502                         saveList.sort()
 503                         for saveline in saveList:
 504                             outfile.write(saveline)
 505         outfile.close()
 506         md5old = hashlib.md5()
 507         md5old.update(open(tmpintronAnnotFileName, 'r').read())
 508         md5new = hashlib.md5()
 509         md5new.update(open(newintronAnnotFileName, 'r').read())
 510         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 511
 512
 513 if __name__ == '__main__':
 514     PygrTestProgram(verbosity=2)
 515