tests/annotation_hg18_megatest.py

   1 import unittest
   2 from testlib import testutil, PygrTestProgram
   3
   4 import ConfigParser, sys, os, string
   5 from pygr.mapping import Collection
   6 import pygr.Data
   7
   8 try:
   9     import hashlib
  10 except ImportError:
  11     import md5 as hashlib
  12
  13 config = ConfigParser.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
  14 config.read([ os.path.join(os.path.expanduser('~'), '.pygrrc'), os.path.join(os.path.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
  15 msaDir = config.get('megatests_hg18', 'msaDir')
  16 seqDir = config.get('megatests_hg18', 'seqDir')
  17 smallSampleKey = config.get('megatests_hg18', 'smallSampleKey')
  18 testInputDB = config.get('megatests', 'testInputDB')
  19 testInputDir = config.get('megatests', 'testInputDir')
  20 testOutputBaseDir = config.get('megatests', 'testOutputBaseDir')
  21
  22 if smallSampleKey:
  23     smallSamplePostfix = '_' + smallSampleKey
  24 else:
  25     smallSamplePostfix = ''
  26
  27 ## msaDir CONTAINS PRE-BUILT NLMSA
  28 ## seqDir CONTAINS GENOME ASSEMBLIES AND THEIR SEQDB FILES
  29 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
  30 ##        exonAnnotFileName = 'Annotation_ConservedElement_Exons_hg18.txt'
  31 ##        intronAnnotFileName = 'Annotation_ConservedElement_Introns_hg18.txt'
  32 ##        stopAnnotFileName = 'Annotation_ConservedElement_Stop_hg18.txt'
  33 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
  34
  35 # DIRECTIONARY FOR DOC STRING OF SEQDB
  36 docStringDict = {
  37     'anoCar1':' Lizard Genome (January 2007)',
  38     'bosTau3':'Cow Genome (August 2006)',
  39     'canFam2':'Dog Genome (May 2005)',
  40     'cavPor2':'Guinea Pig (October 2005)',
  41     'danRer4':'Zebrafish Genome (March 2006)',
  42     'dasNov1':'Armadillo Genome (May 2005)',
  43     'echTel1':'Tenrec Genome (July 2005)',
  44     'eriEur1':'European Hedgehog (Junuary 2006)',
  45     'equCab1':'Horse Genome (January 2007)',
  46     'felCat3':'Cat Genome (March 2006)',
  47     'fr2':'Fugu Genome (October 2004)',
  48     'galGal3':'Chicken Genome (May 2006)',
  49     'gasAcu1':'Stickleback Genome (February 2006)',
  50     'hg18':'Human Genome (May 2006)',
  51     'loxAfr1':'Elephant Genome (May 2005)',
  52     'mm8':'Mouse Genome (March 2006)',
  53     'monDom4':'Opossum Genome (January 2006)',
  54     'ornAna1':'Platypus Genome (March 2007)',
  55     'oryCun1':'Rabbit Genome (May 2005)',
  56     'oryLat1':'Medaka Genome (April 2006)',
  57     'otoGar1':'Bushbaby Genome (December 2006)',
  58     'panTro2':'Chimpanzee Genome (March 2006)',
  59     'rheMac2':'Rhesus Genome (January 2006)',
  60     'rn4':'Rat Genome (November 2004)',
  61     'sorAra1':'Shrew (Junuary 2006)',
  62     'tetNig1':'Tetraodon Genome (February 2004)',
  63     'tupBel1':'Tree Shrew (December 2006)',
  64     'xenTro2':'X. tropicalis Genome (August 2005)'
  65     }
  66
  67 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
  68 msaSpeciesList = ['anoCar1', 'bosTau3', 'canFam2', 'cavPor2', 'danRer4', 'dasNov1', 'echTel1', \
  69                   'equCab1', 'eriEur1', 'felCat3', 'fr2', 'galGal3', 'gasAcu1', 'hg18', 'loxAfr1', \
  70                   'mm8', 'monDom4', 'ornAna1', 'oryCun1', 'oryLat1', 'otoGar1', 'panTro2', 'rheMac2', \
  71                   'rn4', 'sorAra1', 'tetNig1', 'tupBel1', 'xenTro2']
  72
  73 class PygrBuildNLMSAMegabase(unittest.TestCase):
  74     'restrict megatest to an initially empty directory, need large space to perform'
  75     def setUp(self, testDir = None):
  76         import random
  77         tmpList = [c for c in 'PygrBuildNLMSAMegabase']
  78         random.shuffle(tmpList)
  79         testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) # FOR TEST, SHOULD BE DELETED
  80         if testDir is None: testDir = 'TEST_' + ''.join(tmpList) # NOT SPECIFIED, USE CURRENT DIRECTORY
  81         try:
  82             os.mkdir(testDir)
  83             testDir = os.path.realpath(testDir)
  84         except:
  85             raise IOError
  86         self.path = testDir
  87         try:
  88             tmpFileName = os.path.join(testDir, 'DELETE_THIS_TEMP_FILE')
  89             open(tmpFileName, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
  90         except:
  91             raise IOError
  92         pygr.Data.update(self.path)
  93         from pygr import seqdb
  94         for orgstr in msaSpeciesList:
  95             genome = seqdb.BlastDB(os.path.join(seqDir, orgstr))
  96             genome.__doc__ = docStringDict[orgstr]
  97             pygr.Data.addResource('TEST.Seq.Genome.' + orgstr, genome)
  98         pygr.Data.save()
  99     def copyFile(self, filename): # COPY A FILE INTO TEST DIRECTORY
 100         newname = os.path.join(self.path, os.path.basename(filename))
 101         open(newname, 'w').write(open(filename, 'r').read())
 102         return newname
 103     def tearDown(self):
 104         'delete the temporary directory and files, restore pygr.Data path'
 105         for dirpath, subdirs, files in os.walk(self.path, topdown = False): # SHOULD BE DELETED BOTTOM-UP FASHION
 106             # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
 107             # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
 108             for filename in files:
 109                 os.remove(os.path.join(dirpath, filename))
 110             os.rmdir(dirpath)
 111         # Restore original pygr.Data path to remedy lack of isolation
 112         # between tests from the same run
 113         pygr.Data.update(None)
 114
 115
 116 class Build_Test(PygrBuildNLMSAMegabase):
 117     def test_seqdb(self):
 118         'Check pygr.Data contents'
 119         l = pygr.Data.dir('TEST')
 120         preList = ['TEST.Seq.Genome.' + orgstr for orgstr in msaSpeciesList]
 121         assert l == preList
 122     def test_collectionannot(self):
 123         'Test building an AnnotationDB from file'
 124         from pygr import seqdb, cnestedlist, sqlgraph
 125         hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
 126         # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
 127         exon_slices = Collection(filename = os.path.join(self.path, 'refGene_exonAnnot_hg18.cdb'), \
 128             intKeys = True, mode = 'cr', writeback = False)
 129         exon_db = seqdb.AnnotationDB(exon_slices, hg18,
 130                                sliceAttrDict = dict(id = 0, exon_id = 1, orientation = 2,
 131                                                   gene_id = 3, start = 4, stop = 5))
 132         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_hg18'), 'w', \
 133             pairwiseMode = True, bidirectional = False)
 134         for lines in open(os.path.join(testInputDir, 'refGene_exonAnnot%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
 135             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 136             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 137             exon_slices[row[1]] = row
 138             exon = exon_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 139             msa.addAnnotation(exon) # SAVE IT TO GENOME MAPPING
 140         exon_db.clear_cache() # not really necessary; cache should autoGC
 141         exon_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 142         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 143         exon_db.__doc__ = 'Exon Annotation Database for hg18'
 144         pygr.Data.addResource('TEST.Annotation.hg18.exons', exon_db)
 145         msa.__doc__ = 'NLMSA Exon for hg18'
 146         pygr.Data.addResource('TEST.Annotation.NLMSA.hg18.exons', msa)
 147         exon_schema = pygr.Data.ManyToManyRelation(hg18, exon_db, bindAttrs = ('exon1',))
 148         exon_schema.__doc__ = 'Exon Schema for hg18'
 149         pygr.Data.addSchema('TEST.Annotation.NLMSA.hg18.exons', exon_schema)
 150         # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
 151         splice_slices = Collection(filename = os.path.join(self.path, 'refGene_spliceAnnot_hg18.cdb'), \
 152             intKeys = True, mode = 'cr', writeback = False)
 153         splice_db = seqdb.AnnotationDB(splice_slices, hg18,
 154                                sliceAttrDict = dict(id = 0, splice_id = 1, orientation = 2,
 155                                                   gene_id = 3, start = 4, stop = 5))
 156         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_hg18'), 'w', \
 157             pairwiseMode = True, bidirectional = False)
 158         for lines in open(os.path.join(testInputDir, 'refGene_spliceAnnot%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
 159             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 160             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 161             splice_slices[row[1]] = row
 162             splice = splice_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 163             msa.addAnnotation(splice) # SAVE IT TO GENOME MAPPING
 164         splice_db.clear_cache() # not really necessary; cache should autoGC
 165         splice_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 166         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 167         splice_db.__doc__ = 'Splice Annotation Database for hg18'
 168         pygr.Data.addResource('TEST.Annotation.hg18.splices', splice_db)
 169         msa.__doc__ = 'NLMSA Splice for hg18'
 170         pygr.Data.addResource('TEST.Annotation.NLMSA.hg18.splices', msa)
 171         splice_schema = pygr.Data.ManyToManyRelation(hg18, splice_db, bindAttrs = ('splice1',))
 172         splice_schema.__doc__ = 'Splice Schema for hg18'
 173         pygr.Data.addSchema('TEST.Annotation.NLMSA.hg18.splices', splice_schema)
 174         # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
 175         cds_slices = Collection(filename = os.path.join(self.path, 'refGene_cdsAnnot_hg18.cdb'), \
 176             intKeys = True, mode = 'cr', writeback = False)
 177         cds_db = seqdb.AnnotationDB(cds_slices, hg18,
 178                                sliceAttrDict = dict(id = 0, cds_id = 1, orientation = 2,
 179                                                   gene_id = 3, start = 4, stop = 5))
 180         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_cdsAnnot_hg18'), 'w', \
 181             pairwiseMode = True, bidirectional = False)
 182         for lines in open(os.path.join(testInputDir, 'refGene_cdsAnnot%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
 183             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 184             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 185             cds_slices[row[1]] = row
 186             cds = cds_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 187             msa.addAnnotation(cds) # SAVE IT TO GENOME MAPPING
 188         cds_db.clear_cache() # not really necessary; cache should autoGC
 189         cds_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 190         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 191         cds_db.__doc__ = 'CDS Annotation Database for hg18'
 192         pygr.Data.addResource('TEST.Annotation.hg18.cdss', cds_db)
 193         msa.__doc__ = 'NLMSA CDS for hg18'
 194         pygr.Data.addResource('TEST.Annotation.NLMSA.hg18.cdss', msa)
 195         cds_schema = pygr.Data.ManyToManyRelation(hg18, cds_db, bindAttrs = ('cds1',))
 196         cds_schema.__doc__ = 'CDS Schema for hg18'
 197         pygr.Data.addSchema('TEST.Annotation.NLMSA.hg18.cdss', cds_schema)
 198         # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
 199         ucsc_slices = Collection(filename = os.path.join(self.path, 'phastConsElements28way_hg18.cdb'), \
 200             intKeys = True, mode = 'cr', writeback = False)
 201         ucsc_db = seqdb.AnnotationDB(ucsc_slices, hg18,
 202                                sliceAttrDict = dict(id = 0, ucsc_id = 1, orientation = 2,
 203                                                   gene_id = 3, start = 4, stop = 5))
 204         msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements28way_hg18'), 'w', \
 205             pairwiseMode = True, bidirectional = False)
 206         for lines in open(os.path.join(testInputDir, 'phastConsElements28way%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
 207             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 208             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 209             ucsc_slices[row[1]] = row
 210             ucsc = ucsc_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 211             msa.addAnnotation(ucsc) # SAVE IT TO GENOME MAPPING
 212         ucsc_db.clear_cache() # not really necessary; cache should autoGC
 213         ucsc_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 214         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 215         ucsc_db.__doc__ = 'Most Conserved Elements for hg18'
 216         pygr.Data.addResource('TEST.Annotation.UCSC.hg18.mostconserved', ucsc_db)
 217         msa.__doc__ = 'NLMSA for Most Conserved Elements for hg18'
 218         pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved', msa)
 219         ucsc_schema = pygr.Data.ManyToManyRelation(hg18, ucsc_db, bindAttrs = ('element1',))
 220         ucsc_schema.__doc__ = 'Schema for UCSC Most Conserved Elements for hg18'
 221         pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved', ucsc_schema)
 222         # BUILD ANNOTATION DATABASE FOR SNP126 FROM UCSC
 223         snp_slices = Collection(filename = os.path.join(self.path, 'snp126_hg18.cdb'), \
 224                                           intKeys = True, protocol = 2, mode = 'cr', writeback = False)
 225         snp_db = seqdb.AnnotationDB(snp_slices, hg18,
 226                                sliceAttrDict = dict(id = 0, snp_id = 1, orientation = 2, gene_id = 3, start = 4,
 227                                                     stop = 5, score = 6, ref_NCBI = 7, ref_UCSC = 8, observed = 9,
 228                                                     molType = 10, myClass = 11, myValid = 12, avHet = 13, avHetSE = 14,
 229                                                     myFunc = 15, locType = 16, myWeight = 17))
 230         msa = cnestedlist.NLMSA(os.path.join(self.path, 'snp126_hg18'), 'w', \
 231             pairwiseMode = True, bidirectional = False)
 232         for lines in open(os.path.join(testInputDir, 'snp126%s_hg18.txt' % smallSamplePostfix), 'r').xreadlines():
 233             row = [x for x in lines.split('\t')] # CONVERT TO LIST SO MUTABLE
 234             row[1] = int(row[1]) # CONVERT FROM STRING TO INTEGER
 235             snp_slices[row[1]] = row
 236             snp = snp_db[row[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
 237             msa.addAnnotation(snp) # SAVE IT TO GENOME MAPPING
 238         snp_db.clear_cache() # not really necessary; cache should autoGC
 239         snp_slices.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
 240         msa.build() # FINALIZE GENOME ALIGNMENT INDEXES
 241         snp_db.__doc__ = 'SNP126 for hg18'
 242         pygr.Data.addResource('TEST.Annotation.UCSC.hg18.snp126', snp_db)
 243         msa.__doc__ = 'NLMSA for SNP126 for hg18'
 244         pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.hg18.snp126', msa)
 245         snp_schema = pygr.Data.ManyToManyRelation(hg18, snp_db, bindAttrs = ('snp1',))
 246         snp_schema.__doc__ = 'Schema for UCSC SNP126 for hg18'
 247         pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.hg18.snp126', snp_schema)
 248         pygr.Data.save()
 249         pygr.Data.clear_cache()
 250
 251         # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
 252         hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
 253         exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.hg18.exons')
 254         splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.hg18.splices')
 255         conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved')
 256         snpmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.hg18.snp126')
 257         cdsmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.hg18.cdss')
 258         exons = pygr.Data.getResource('TEST.Annotation.hg18.exons')
 259         splices = pygr.Data.getResource('TEST.Annotation.hg18.splices')
 260         mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.hg18.mostconserved')
 261         snp126 = pygr.Data.getResource('TEST.Annotation.UCSC.hg18.snp126')
 262         cdss = pygr.Data.getResource('TEST.Annotation.hg18.cdss')
 263
 264         # OPEN hg18_MULTIZ28WAY NLMSA
 265         msa = cnestedlist.NLMSA(os.path.join(msaDir, 'hg18_multiz28way'), 'r', trypath = [seqDir])
 266
 267         exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_hg18.txt' % smallSamplePostfix)
 268         intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_hg18.txt' % smallSamplePostfix)
 269         stopAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Stop%s_hg18.txt' % smallSamplePostfix)
 270         newexonAnnotFileName = os.path.join(self.path, 'new_Exons_hg18.txt')
 271         newintronAnnotFileName = os.path.join(self.path, 'new_Introns_hg18.txt')
 272         newstopAnnotFileName = os.path.join(self.path, 'new_stop_hg18.txt')
 273         tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
 274         tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
 275         tmpstopAnnotFileName = self.copyFile(stopAnnotFileName)
 276
 277         if smallSampleKey:
 278             chrList = [ smallSampleKey ]
 279         else:
 280             chrList = hg18.seqLenDict.keys()
 281             chrList.sort()
 282
 283         outfile = open(newexonAnnotFileName, 'w')
 284         for chrid in chrList:
 285             slice = hg18[chrid]
 286             # EXON ANNOTATION DATABASE
 287             try:
 288                 ex1 = exonmsa[slice]
 289             except:
 290                 continue
 291             else:
 292                 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
 293                 exlist1.sort()
 294                 for ixx, exon in exlist1:
 295                     saveList = []
 296                     tmp = exon.sequence
 297                     tmpexon = exons[exon.exon_id]
 298                     tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
 299                     wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
 300                     try:
 301                         out1 = conservedmsa[tmp]
 302                     except KeyError:
 303                         pass
 304                     else:
 305                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 306                         elementlist.sort()
 307                         for iyy, element in elementlist:
 308                             if element.stop - element.start < 100: continue
 309                             score = int(string.split(element.gene_id, '=')[1])
 310                             if score < 100: continue
 311                             tmp2 = element.sequence
 312                             tmpelement = mostconserved[element.ucsc_id]
 313                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 314                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 315                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 316                             if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
 317                             tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
 318                             edges = msa[tmp1].edges()
 319                             for src, dest, e in edges:
 320                                 if src.stop - src.start < 100: continue
 321                                 palign, pident = e.pAligned(), e.pIdentity()
 322                                 if palign < 0.8 or pident < 0.8: continue
 323                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 324                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 325                                     (~msa.seqDict)[dest], \
 326                                     str(dest), dest.start, dest.stop, palign, pident)
 327                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 328                         saveList.sort()
 329                         for saveline in saveList:
 330                             outfile.write(saveline)
 331         outfile.close()
 332         md5old = hashlib.md5()
 333         md5old.update(open(tmpexonAnnotFileName, 'r').read())
 334         md5new = hashlib.md5()
 335         md5new.update(open(newexonAnnotFileName, 'r').read())
 336         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 337
 338         outfile = open(newintronAnnotFileName, 'w')
 339         for chrid in chrList:
 340             slice = hg18[chrid]
 341             # SPLICE ANNOTATION DATABASE
 342             try:
 343                 sp1 = splicemsa[slice]
 344             except:
 345                 continue
 346             else:
 347                 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
 348                 splist1.sort()
 349                 for ixx, splice in splist1:
 350                     saveList = []
 351                     tmp = splice.sequence
 352                     tmpsplice = splices[splice.splice_id]
 353                     tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
 354                     wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
 355                     try:
 356                         out1 = conservedmsa[tmp]
 357                     except KeyError:
 358                         pass
 359                     else:
 360                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 361                         elementlist.sort()
 362                         for iyy, element in elementlist:
 363                             if element.stop - element.start < 100: continue
 364                             score = int(string.split(element.gene_id, '=')[1])
 365                             if score < 100: continue
 366                             tmp2 = element.sequence
 367                             tmpelement = mostconserved[element.ucsc_id]
 368                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 369                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 370                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 371                             if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
 372                             tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
 373                             edges = msa[tmp1].edges()
 374                             for src, dest, e in edges:
 375                                 if src.stop - src.start < 100: continue
 376                                 palign, pident = e.pAligned(), e.pIdentity()
 377                                 if palign < 0.8 or pident < 0.8: continue
 378                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 379                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 380                                     (~msa.seqDict)[dest], \
 381                                     str(dest), dest.start, dest.stop, palign, pident)
 382                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 383                         saveList.sort()
 384                         for saveline in saveList:
 385                             outfile.write(saveline)
 386                     # SNP IN SPLICE SITES
 387                     saveList = []
 388                     gt = tmpslice[:2]
 389                     ag = tmpslice[-2:]
 390                     try:
 391                         gtout = snpmsa[gt]
 392                         agout = snpmsa[ag]
 393                     except KeyError:
 394                         pass
 395                     else:
 396                         gtlist = gtout.keys()
 397                         aglist = agout.keys()
 398                         for snp in gtlist:
 399                             tmpsnp = snp.sequence
 400                             annsnp = snp126[snp.snp_id]
 401                             wlist2 = ('SNP5', chrid, tmpsplice.gene_id, gt.start, gt.stop, str(gt))  \
 402                                 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
 403                                 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
 404                                 annsnp.observed, annsnp.molType, \
 405                                 annsnp.myClass, annsnp.myValid)
 406                             tmp1 = msa.seqDict['hg18.' + chrid][abs(gt.start):abs(gt.stop)]
 407                             edges = msa[tmp1].edges()
 408                             for src, dest, e in edges:
 409                                 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
 410                                 palign, pident = e.pAligned(), e.pIdentity()
 411                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 412                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 413                                     (~msa.seqDict)[dest], \
 414                                     str(dest), dest.start, dest.stop, palign, pident)
 415                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 416                         for snp in aglist:
 417                             tmpsnp = snp.sequence
 418                             annsnp = snp126[snp.snp_id]
 419                             wlist2 = ('SNP3', chrid, tmpsplice.gene_id, ag.start, ag.stop, str(ag))  \
 420                                 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
 421                                 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
 422                                 annsnp.observed, annsnp.molType, \
 423                                 annsnp.myClass, annsnp.myValid)
 424                             tmp1 = msa.seqDict['hg18.' + chrid][abs(ag.start):abs(ag.stop)]
 425                             edges = msa[tmp1].edges()
 426                             for src, dest, e in edges:
 427                                 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
 428                                 palign, pident = e.pAligned(), e.pIdentity()
 429                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 430                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 431                                     (~msa.seqDict)[dest], \
 432                                     str(dest), dest.start, dest.stop, palign, pident)
 433                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 434                         saveList.sort()
 435                         for saveline in saveList:
 436                             outfile.write(saveline)
 437         outfile.close()
 438         md5old = hashlib.md5()
 439         md5old.update(open(tmpintronAnnotFileName, 'r').read())
 440         md5new = hashlib.md5()
 441         md5new.update(open(newintronAnnotFileName, 'r').read())
 442         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 443
 444         outfile = open(newstopAnnotFileName, 'w')
 445         for chrid in chrList:
 446             slice = hg18[chrid]
 447             # STOP ANNOTATION DATABASE
 448             try:
 449                 cds1 = cdsmsa[slice]
 450             except:
 451                 continue
 452             else:
 453                 cdslist1 = [(ix.cds_id, ix) for ix in cds1.keys()]
 454                 cdslist1.sort()
 455                 for ixx, cds in cdslist1:
 456                     saveList = []
 457                     tmp = cds.sequence
 458                     tmpcds = cdss[cds.cds_id]
 459                     tmpslice = tmpcds.sequence # FOR REAL EXON COORDINATE
 460                     wlist1 = 'STOP', chrid, tmpcds.cds_id, tmpcds.gene_id, tmpslice.start, tmpslice.stop
 461                     if tmpslice.start < 0:
 462                         stopstart, stopend = -tmpslice.stop, -tmpslice.start
 463                         stop = -hg18[chrid][stopstart:stopstart+3]
 464                     else:
 465                         stopstart, stopend = tmpslice.start, tmpslice.stop
 466                         stop = hg18[chrid][stopend-3:stopend]
 467                     if str(stop).upper() not in ('TAA', 'TAG', 'TGA'): continue
 468                     try:
 469                         snp1 = snpmsa[stop]
 470                     except KeyError:
 471                         pass
 472                     else:
 473                         snplist = [(ix.snp_id, ix) for ix in snp1.keys()]
 474                         snplist.sort()
 475                         for iyy, snp in snplist:
 476                             tmpsnp = snp.sequence
 477                             annsnp = snp126[snp.snp_id]
 478                             wlist2 = wlist1 + (str(stop), stop.start, stop.stop) \
 479                                 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
 480                                 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
 481                                 annsnp.observed, annsnp.molType, \
 482                                 annsnp.myClass, annsnp.myValid)
 483                             if tmpslice.start < 0:
 484                                 tmp1 = -msa.seqDict['hg18.' + chrid][stopstart:stopstart+3]
 485                             else:
 486                                 tmp1 = msa.seqDict['hg18.' + chrid][stopend-3:stopend]
 487                             edges = msa[tmp1].edges()
 488                             for src, dest, e in edges:
 489                                 if src.stop - src.start != 3 or dest.stop - dest.start != 3: continue
 490                                 palign, pident = e.pAligned(), e.pIdentity()
 491                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 492                                 if str(dest).upper() not in ('TAA', 'TAG', 'TGA'): nonstr = 'NONSENSE'
 493                                 else: nonstr = 'STOP'
 494                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 495                                     (~msa.seqDict)[dest], \
 496                                     str(dest), dest.start, dest.stop, palign, pident, nonstr)
 497                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 498                         saveList.sort()
 499                         for saveline in saveList:
 500                             outfile.write(saveline)
 501         outfile.close()
 502         md5old = hashlib.md5()
 503         md5old.update(open(tmpstopAnnotFileName, 'r').read())
 504         md5new = hashlib.md5()
 505         md5new.update(open(newstopAnnotFileName, 'r').read())
 506         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 507
 508     def test_mysqlannot(self):
 509         'Test building an AnnotationDB from MySQL'
 510         from pygr import seqdb, cnestedlist, sqlgraph
 511         hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
 512         # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
 513         exon_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_exonAnnot%s_hg18' % ( testInputDB, smallSamplePostfix ),
 514             clusterKey = 'chromosome', maxCache = 0)
 515         exon_db = seqdb.AnnotationDB(exon_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
 516             gene_id = 'name', exon_id = 'exon_id'))
 517         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_exonAnnot_SQL_hg18'), 'w', \
 518             pairwiseMode = True, bidirectional = False)
 519         for id in exon_db:
 520             msa.addAnnotation(exon_db[id])
 521         exon_db.clear_cache() # not really necessary; cache should autoGC
 522         exon_slices.clear_cache()
 523         msa.build()
 524         exon_db.__doc__ = 'SQL Exon Annotation Database for hg18'
 525         pygr.Data.addResource('TEST.Annotation.SQL.hg18.exons', exon_db)
 526         msa.__doc__ = 'SQL NLMSA Exon for hg18'
 527         pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.hg18.exons', msa)
 528         exon_schema = pygr.Data.ManyToManyRelation(hg18, exon_db, bindAttrs = ('exon2',))
 529         exon_schema.__doc__ = 'SQL Exon Schema for hg18'
 530         pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.hg18.exons', exon_schema)
 531         # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
 532         splice_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_spliceAnnot%s_hg18' % ( testInputDB, smallSamplePostfix ),
 533             clusterKey = 'chromosome', maxCache = 0)
 534         splice_db = seqdb.AnnotationDB(splice_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
 535             gene_id = 'name', splice_id = 'splice_id'))
 536         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_spliceAnnot_SQL_hg18'), 'w', \
 537             pairwiseMode = True, bidirectional = False)
 538         for id in splice_db:
 539             msa.addAnnotation(splice_db[id])
 540         splice_db.clear_cache() # not really necessary; cache should autoGC
 541         splice_slices.clear_cache()
 542         msa.build()
 543         splice_db.__doc__ = 'SQL Splice Annotation Database for hg18'
 544         pygr.Data.addResource('TEST.Annotation.SQL.hg18.splices', splice_db)
 545         msa.__doc__ = 'SQL NLMSA Splice for hg18'
 546         pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.hg18.splices', msa)
 547         splice_schema = pygr.Data.ManyToManyRelation(hg18, splice_db, bindAttrs = ('splice2',))
 548         splice_schema.__doc__ = 'SQL Splice Schema for hg18'
 549         pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.hg18.splices', splice_schema)
 550         # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
 551         cds_slices = sqlgraph.SQLTableClustered('%s.pygr_refGene_cdsAnnot%s_hg18' % ( testInputDB, smallSamplePostfix ),
 552             clusterKey = 'chromosome', maxCache = 0)
 553         cds_db = seqdb.AnnotationDB(cds_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
 554             gene_id = 'name', cds_id = 'cds_id'))
 555         msa = cnestedlist.NLMSA(os.path.join(self.path, 'refGene_cdsAnnot_SQL_hg18'), 'w', \
 556             pairwiseMode = True, bidirectional = False)
 557         for id in cds_db:
 558             msa.addAnnotation(cds_db[id])
 559         cds_db.clear_cache() # not really necessary; cache should autoGC
 560         cds_slices.clear_cache()
 561         msa.build()
 562         cds_db.__doc__ = 'SQL CDS Annotation Database for hg18'
 563         pygr.Data.addResource('TEST.Annotation.SQL.hg18.cdss', cds_db)
 564         msa.__doc__ = 'SQL NLMSA CDS for hg18'
 565         pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.hg18.cdss', msa)
 566         cds_schema = pygr.Data.ManyToManyRelation(hg18, cds_db, bindAttrs = ('cds2',))
 567         cds_schema.__doc__ = 'SQL CDS Schema for hg18'
 568         pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.hg18.cdss', cds_schema)
 569         # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC: MYSQL VERSION
 570         ucsc_slices = sqlgraph.SQLTableClustered('%s.pygr_phastConsElements28way%s_hg18' % ( testInputDB, smallSamplePostfix ),
 571             clusterKey = 'chromosome', maxCache = 0)
 572         ucsc_db = seqdb.AnnotationDB(ucsc_slices, hg18, sliceAttrDict = dict(id = 'chromosome', \
 573             gene_id = 'name', ucsc_id = 'ucsc_id'))
 574         msa = cnestedlist.NLMSA(os.path.join(self.path, 'phastConsElements28way_SQL_hg18'), 'w', \
 575             pairwiseMode = True, bidirectional = False)
 576         for id in ucsc_db:
 577             msa.addAnnotation(ucsc_db[id])
 578         ucsc_db.clear_cache() # not really necessary; cache should autoGC
 579         ucsc_slices.clear_cache()
 580         msa.build()
 581         ucsc_db.__doc__ = 'SQL Most Conserved Elements for hg18'
 582         pygr.Data.addResource('TEST.Annotation.UCSC.SQL.hg18.mostconserved', ucsc_db)
 583         msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for hg18'
 584         pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved', msa)
 585         ucsc_schema = pygr.Data.ManyToManyRelation(hg18, ucsc_db, bindAttrs = ('element2',))
 586         ucsc_schema.__doc__ = 'SQL Schema for UCSC Most Conserved Elements for hg18'
 587         pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved', ucsc_schema)
 588         # BUILD ANNOTATION DATABASE FOR SNP126 FROM UCSC: MYSQL VERSION
 589         snp_slices = sqlgraph.SQLTableClustered('%s.pygr_snp126%s_hg18' % ( testInputDB, smallSamplePostfix ),
 590             clusterKey = 'clusterKey', maxCache = 0)
 591         snp_db = seqdb.AnnotationDB(snp_slices, hg18, sliceAttrDict = dict(id = 'chromosome', gene_id = 'name',
 592                                     snp_id = 'snp_id', score = 'score', ref_NCBI = 'ref_NCBI', ref_UCSC = 'ref_UCSC',
 593                                     observed = 'observed', molType = 'molType', myClass = 'myClass', myValid = 'myValid',
 594                                     avHet = 'avHet', avHetSE = 'avHetSE', myFunc = 'myFunc', locType = 'locType',
 595                                     myWeight = 'myWeight'))
 596         msa = cnestedlist.NLMSA(os.path.join(self.path, 'snp126_SQL_hg18'), 'w', \
 597             pairwiseMode = True, bidirectional = False)
 598         for id in snp_db:
 599             msa.addAnnotation(snp_db[id])
 600         snp_db.clear_cache() # not really necessary; cache should autoGC
 601         snp_slices.clear_cache()
 602         msa.build()
 603         snp_db.__doc__ = 'SQL SNP126 for hg18'
 604         pygr.Data.addResource('TEST.Annotation.UCSC.SQL.hg18.snp126', snp_db)
 605         msa.__doc__ = 'SQL NLMSA for SNP126 for hg18'
 606         pygr.Data.addResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126', msa)
 607         snp_schema = pygr.Data.ManyToManyRelation(hg18, snp_db, bindAttrs = ('snp2',))
 608         snp_schema.__doc__ = 'SQL Schema for UCSC SNP126 for hg18'
 609         pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126', snp_schema)
 610         pygr.Data.save()
 611         pygr.Data.clear_cache()
 612
 613         # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
 614         hg18 = pygr.Data.getResource('TEST.Seq.Genome.hg18')
 615         exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.hg18.exons')
 616         splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.hg18.splices')
 617         conservedmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved')
 618         snpmsa = pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126')
 619         cdsmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.hg18.cdss')
 620         exons = pygr.Data.getResource('TEST.Annotation.SQL.hg18.exons')
 621         splices = pygr.Data.getResource('TEST.Annotation.SQL.hg18.splices')
 622         mostconserved = pygr.Data.getResource('TEST.Annotation.UCSC.SQL.hg18.mostconserved')
 623         snp126 = pygr.Data.getResource('TEST.Annotation.UCSC.SQL.hg18.snp126')
 624         cdss = pygr.Data.getResource('TEST.Annotation.SQL.hg18.cdss')
 625
 626         # OPEN hg18_MULTIZ28WAY NLMSA
 627         msa = cnestedlist.NLMSA(os.path.join(msaDir, 'hg18_multiz28way'), 'r', trypath = [seqDir])
 628
 629         exonAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Exons%s_hg18.txt' % smallSamplePostfix)
 630         intronAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Introns%s_hg18.txt' % smallSamplePostfix)
 631         stopAnnotFileName = os.path.join(testInputDir, 'Annotation_ConservedElement_Stop%s_hg18.txt' % smallSamplePostfix)
 632         newexonAnnotFileName = os.path.join(self.path, 'new_Exons_hg18.txt')
 633         newintronAnnotFileName = os.path.join(self.path, 'new_Introns_hg18.txt')
 634         newstopAnnotFileName = os.path.join(self.path, 'new_stop_hg18.txt')
 635         tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
 636         tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)
 637         tmpstopAnnotFileName = self.copyFile(stopAnnotFileName)
 638
 639         if smallSampleKey:
 640             chrList = [ smallSampleKey ]
 641         else:
 642             chrList = hg18.seqLenDict.keys()
 643             chrList.sort()
 644
 645         outfile = open(newexonAnnotFileName, 'w')
 646         for chrid in chrList:
 647             slice = hg18[chrid]
 648             # EXON ANNOTATION DATABASE
 649             try:
 650                 ex1 = exonmsa[slice]
 651             except:
 652                 continue
 653             else:
 654                 exlist1 = [(ix.exon_id, ix) for ix in ex1.keys()]
 655                 exlist1.sort()
 656                 for ixx, exon in exlist1:
 657                     saveList = []
 658                     tmp = exon.sequence
 659                     tmpexon = exons[exon.exon_id]
 660                     tmpslice = tmpexon.sequence # FOR REAL EXON COORDINATE
 661                     wlist1 = 'EXON', chrid, tmpexon.exon_id, tmpexon.gene_id, tmpslice.start, tmpslice.stop
 662                     try:
 663                         out1 = conservedmsa[tmp]
 664                     except KeyError:
 665                         pass
 666                     else:
 667                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 668                         elementlist.sort()
 669                         for iyy, element in elementlist:
 670                             if element.stop - element.start < 100: continue
 671                             score = int(string.split(element.gene_id, '=')[1])
 672                             if score < 100: continue
 673                             tmp2 = element.sequence
 674                             tmpelement = mostconserved[element.ucsc_id]
 675                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 676                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 677                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 678                             if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
 679                             tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
 680                             edges = msa[tmp1].edges()
 681                             for src, dest, e in edges:
 682                                 if src.stop - src.start < 100: continue
 683                                 palign, pident = e.pAligned(), e.pIdentity()
 684                                 if palign < 0.8 or pident < 0.8: continue
 685                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 686                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 687                                     (~msa.seqDict)[dest], \
 688                                     str(dest), dest.start, dest.stop, palign, pident)
 689                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 690                         saveList.sort()
 691                         for saveline in saveList:
 692                             outfile.write(saveline)
 693         outfile.close()
 694         md5old = hashlib.md5()
 695         md5old.update(open(tmpexonAnnotFileName, 'r').read())
 696         md5new = hashlib.md5()
 697         md5new.update(open(newexonAnnotFileName, 'r').read())
 698         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 699
 700         outfile = open(newintronAnnotFileName, 'w')
 701         for chrid in chrList:
 702             slice = hg18[chrid]
 703             # SPLICE ANNOTATION DATABASE
 704             try:
 705                 sp1 = splicemsa[slice]
 706             except:
 707                 continue
 708             else:
 709                 splist1 = [(ix.splice_id, ix) for ix in sp1.keys()]
 710                 splist1.sort()
 711                 for ixx, splice in splist1:
 712                     saveList = []
 713                     tmp = splice.sequence
 714                     tmpsplice = splices[splice.splice_id]
 715                     tmpslice = tmpsplice.sequence # FOR REAL EXON COORDINATE
 716                     wlist1 = 'INTRON', chrid, tmpsplice.splice_id, tmpsplice.gene_id, tmpslice.start, tmpslice.stop
 717                     try:
 718                         out1 = conservedmsa[tmp]
 719                     except KeyError:
 720                         pass
 721                     else:
 722                         elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
 723                         elementlist.sort()
 724                         for iyy, element in elementlist:
 725                             if element.stop - element.start < 100: continue
 726                             score = int(string.split(element.gene_id, '=')[1])
 727                             if score < 100: continue
 728                             tmp2 = element.sequence
 729                             tmpelement = mostconserved[element.ucsc_id]
 730                             tmpslice2 = tmpelement.sequence # FOR REAL ELEMENT COORDINATE
 731                             wlist2 = wlist1 + (tmpelement.ucsc_id, tmpelement.gene_id, tmpslice2.start, tmpslice2.stop)
 732                             slicestart, sliceend = max(tmp.start, tmp2.start), min(tmp.stop, tmp2.stop)
 733                             if slicestart < 0 or sliceend < 0: sys.exit('wrong query')
 734                             tmp1 = msa.seqDict['hg18.' + chrid][slicestart:sliceend]
 735                             edges = msa[tmp1].edges()
 736                             for src, dest, e in edges:
 737                                 if src.stop - src.start < 100: continue
 738                                 palign, pident = e.pAligned(), e.pIdentity()
 739                                 if palign < 0.8 or pident < 0.8: continue
 740                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 741                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 742                                     (~msa.seqDict)[dest], \
 743                                     str(dest), dest.start, dest.stop, palign, pident)
 744                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 745                         saveList.sort()
 746                         for saveline in saveList:
 747                             outfile.write(saveline)
 748                     # SNP IN SPLICE SITES
 749                     saveList = []
 750                     gt = tmpslice[:2]
 751                     ag = tmpslice[-2:]
 752                     try:
 753                         gtout = snpmsa[gt]
 754                         agout = snpmsa[ag]
 755                     except KeyError:
 756                         pass
 757                     else:
 758                         gtlist = gtout.keys()
 759                         aglist = agout.keys()
 760                         for snp in gtlist:
 761                             tmpsnp = snp.sequence
 762                             annsnp = snp126[snp.snp_id]
 763                             wlist2 = ('SNP5', chrid, tmpsplice.gene_id, gt.start, gt.stop, str(gt))  \
 764                                 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
 765                                 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
 766                                 annsnp.observed, annsnp.molType, \
 767                                 annsnp.myClass, annsnp.myValid)
 768                             tmp1 = msa.seqDict['hg18.' + chrid][abs(gt.start):abs(gt.stop)]
 769                             edges = msa[tmp1].edges()
 770                             for src, dest, e in edges:
 771                                 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
 772                                 palign, pident = e.pAligned(), e.pIdentity()
 773                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 774                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 775                                     (~msa.seqDict)[dest], \
 776                                     str(dest), dest.start, dest.stop, palign, pident)
 777                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 778                         for snp in aglist:
 779                             tmpsnp = snp.sequence
 780                             annsnp = snp126[snp.snp_id]
 781                             wlist2 = ('SNP3', chrid, tmpsplice.gene_id, ag.start, ag.stop, str(ag))  \
 782                                 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
 783                                 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
 784                                 annsnp.observed, annsnp.molType, \
 785                                 annsnp.myClass, annsnp.myValid)
 786                             tmp1 = msa.seqDict['hg18.' + chrid][abs(ag.start):abs(ag.stop)]
 787                             edges = msa[tmp1].edges()
 788                             for src, dest, e in edges:
 789                                 if src.stop - src.start != 2 or dest.stop - dest.start != 2: continue
 790                                 palign, pident = e.pAligned(), e.pIdentity()
 791                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 792                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 793                                     (~msa.seqDict)[dest], \
 794                                     str(dest), dest.start, dest.stop, palign, pident)
 795                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 796                         saveList.sort()
 797                         for saveline in saveList:
 798                             outfile.write(saveline)
 799         outfile.close()
 800         md5old = hashlib.md5()
 801         md5old.update(open(tmpintronAnnotFileName, 'r').read())
 802         md5new = hashlib.md5()
 803         md5new.update(open(newintronAnnotFileName, 'r').read())
 804         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 805
 806         outfile = open(newstopAnnotFileName, 'w')
 807         for chrid in chrList:
 808             slice = hg18[chrid]
 809             # STOP ANNOTATION DATABASE
 810             try:
 811                 cds1 = cdsmsa[slice]
 812             except:
 813                 continue
 814             else:
 815                 cdslist1 = [(ix.cds_id, ix) for ix in cds1.keys()]
 816                 cdslist1.sort()
 817                 for ixx, cds in cdslist1:
 818                     saveList = []
 819                     tmp = cds.sequence
 820                     tmpcds = cdss[cds.cds_id]
 821                     tmpslice = tmpcds.sequence # FOR REAL EXON COORDINATE
 822                     wlist1 = 'STOP', chrid, tmpcds.cds_id, tmpcds.gene_id, tmpslice.start, tmpslice.stop
 823                     if tmpslice.start < 0:
 824                         stopstart, stopend = -tmpslice.stop, -tmpslice.start
 825                         stop = -hg18[chrid][stopstart:stopstart+3]
 826                     else:
 827                         stopstart, stopend = tmpslice.start, tmpslice.stop
 828                         stop = hg18[chrid][stopend-3:stopend]
 829                     if str(stop).upper() not in ('TAA', 'TAG', 'TGA'): continue
 830                     try:
 831                         snp1 = snpmsa[stop]
 832                     except KeyError:
 833                         pass
 834                     else:
 835                         snplist = [(ix.snp_id, ix) for ix in snp1.keys()]
 836                         snplist.sort()
 837                         for iyy, snp in snplist:
 838                             tmpsnp = snp.sequence
 839                             annsnp = snp126[snp.snp_id]
 840                             wlist2 = wlist1 + (str(stop), stop.start, stop.stop) \
 841                                 + (annsnp.snp_id, tmpsnp.start, tmpsnp.stop, \
 842                                 str(tmpsnp), annsnp.gene_id, annsnp.ref_NCBI, annsnp.ref_UCSC, \
 843                                 annsnp.observed, annsnp.molType, \
 844                                 annsnp.myClass, annsnp.myValid)
 845                             if tmpslice.start < 0:
 846                                 tmp1 = -msa.seqDict['hg18.' + chrid][stopstart:stopstart+3]
 847                             else:
 848                                 tmp1 = msa.seqDict['hg18.' + chrid][stopend-3:stopend]
 849                             edges = msa[tmp1].edges()
 850                             for src, dest, e in edges:
 851                                 if src.stop - src.start != 3 or dest.stop - dest.start != 3: continue
 852                                 palign, pident = e.pAligned(), e.pIdentity()
 853                                 palign, pident = '%.2f' % palign, '%.2f' % pident
 854                                 if str(dest).upper() not in ('TAA', 'TAG', 'TGA'): nonstr = 'NONSENSE'
 855                                 else: nonstr = 'STOP'
 856                                 wlist3 = wlist2 + ((~msa.seqDict)[src], str(src), src.start, src.stop, \
 857                                     (~msa.seqDict)[dest], \
 858                                     str(dest), dest.start, dest.stop, palign, pident, nonstr)
 859                                 saveList.append('\t'.join(map(str, wlist3)) + '\n')
 860                         saveList.sort()
 861                         for saveline in saveList:
 862                             outfile.write(saveline)
 863         outfile.close()
 864         md5old = hashlib.md5()
 865         md5old.update(open(tmpstopAnnotFileName, 'r').read())
 866         md5new = hashlib.md5()
 867         md5new.update(open(newstopAnnotFileName, 'r').read())
 868         assert md5old.digest() == md5new.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
 869
 870
 871 if __name__ == '__main__':
 872     PygrTestProgram(verbosity=2)
 873