From 74df096fea4ea3f0dc771e9fc85fb45ec116fc56 Mon Sep 17 00:00:00 2001 From: Christopher Lee Date: Tue, 1 Sep 2009 20:40:12 -0700 Subject: [PATCH] added GFF3 example of gene building --- doc/rest/guidelist.rst | 1 + doc/rest/tutorials/gene.rst | 165 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 doc/rest/tutorials/gene.rst diff --git a/doc/rest/guidelist.rst b/doc/rest/guidelist.rst index 93d30e2..e0c1fc7 100644 --- a/doc/rest/guidelist.rst +++ b/doc/rest/guidelist.rst @@ -15,6 +15,7 @@ Tutorials tutorials/sequence tutorials/alignment + tutorials/gene tutorials/db tutorials/worldbase tutorials/annotation diff --git a/doc/rest/tutorials/gene.rst b/doc/rest/tutorials/gene.rst new file mode 100644 index 0000000..ad6c06a --- /dev/null +++ b/doc/rest/tutorials/gene.rst @@ -0,0 +1,165 @@ + +=========================== +Constructing Gene Databases +=========================== + + + +Building a Simple Gene & Exon Database +-------------------------------------- + +Let's say we want to build a gene database consisting of exons and +splicing structures for all the transcript isoforms. Our first +step would be to construct a little "database" that annotates some +exons on our genome:: + + from pygr import annotation, mapping + from pygr import worldbase + hg17 = worldbase.Bio.Seq.Genome.HUMAN.hg17() + exonSlices = {1:('chr1', 1000, 1300), 2:('chr1', 2000, 2099), + 3:('chr1', 3000, 3600)} + exons = annotation.AnnotationDB(exonSlices, hg17, + sliceAttrDict=dict(id=0, start=1, stop=2)) + exon = exons[1] + print repr(exon), repr(exon.sequence), exon.sequence + +This prints:: + + annot1[0:300] cctcagtaatccgaaaagccgggatcgaccgccccttgcttgcagccgggcactacaggacccgcttgctcacggtgctgtgccagggcgccccctgctggcgactagggcaactgcagggctctcttgcttagagtggtggccagcgccccctgctggcgccggggcactgcagggccctcttgcttactgtatagtggtggcacgccgcctgctggcagctagggacattgcagggtcctcttgctcaaggtgtagtggcagcacgcccacctgctggcagctggggacactgccggg + + +Note that since ``print`` converts its arguments to strings by default, +Pygr's methods for retrieving a slice of the human genome sequence +are automatically invoked on the third argument above. + +Next, let's create a splicing graph for the exon-skip and exon-inclusion +isoforms. This is a *many-to-many mapping*, i.e. each exon can be +spliced *to* multiple downstream exons, and each exon can be spliced +*from* multiple upstream exons. A standard Python dict can only handle +X-to-one mappings (i.e. each exon maps to a unique exon), not +many-to-many relations, so Pygr adopts the convention of representing +graphs as two-level dictionary interfaces, +i.e. ``graph[node1][node2]=edge``. + +Let's say we have three exons that produce two isoforms via +exon-skipping of the central exon. +So here is a basic graph that connects +exon 1 to exons 2 and 3, and exon 2 to exon 3 (with no edge information):: + + spliceGraph = {exons[1]:{exons[2]:None, exons[3]:None}, + exons[2]:{exons[3]:None}} + + +Next let's write a simple function to generate all possible isoforms +by walking the splice graph, starting from exon 1:: + + def walk_graph(graph, node, results, path=()): + try: + for node2 in graph[node]: + walk_graph(graph, node2, results, path + (node,)) + except KeyError: # end node, so save complete path + results.append(path + (node,)) + + isoforms = [] + walk_graph(spliceGraph, exons[1], isoforms) # generate isoforms + for i,transcript in enumerate(isoforms): + s = ''.join([str(exon.sequence) for exon in transcript]) + print 'isoform %d: %s' % (i, s) + +This yields the output:: + + isoform 0: cctcagtaatccgaaaagccgggatcgaccgccccttgcttgcagccgggcactacaggacccgcttgctcacggtgctgtgccagggcgccccctgctggcgactagggcaactgcagggctctcttgcttagagtggtggccagcgccccctgctggcgccggggcactgcagggccctcttgcttactgtatagtggtggcacgccgcctgctggcagctagggacattgcagggtcctcttgctcaaggtgtagtggcagcacgcccacctgctggcagctggggacactgccgggTACCTGAGGCTGAGGAAGGAGAAGGGGATGCACTGTTGGGGAGGCAGCTGTAACTCAAAGCCTTAGCCTCTGTTCCCACGAAGGCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCTCCTGGACCAGTGATACACCCGGCACCCTGTCCTGGACACGCTGTTGGCCTGGATCTGAGCCCTGGTGGAGGTCAAAGCCACCTTTGGTTCTGCCATTGCTGCTGTGTGGAAGTTCACTCCTGCCTTTTCCTTTCCCTAGAGCCTCCACCACCCCGAGATCACATTTCTCACTGCCTTTTGTCTGCCCAGTTTCACCAGAAGTAGGCCTCTTCCTGACAGGCAGCTGCACCACTGCCTGGCGCTGTGCCCTTCCTTTGCTCTGCCCGCTGGAGACGGTGTTTGTCATGGGCCTGGTCTGCAGGGATCCTGCTACAAAGGTGAAACCCAGGAGAGTGTGGAGTCCAGAGTGTTGCCAGGACCCAGGCACAGGCATTAGTGCCCGTTGGAGAAAACAGGGGAATCCCGAAGAAATGGTGGGTCCTGGCCATCCGTGAGATCTTCCCAGGGCAGCTCCCCTCTGTGGAATCCAATCTG + isoform 1: cctcagtaatccgaaaagccgggatcgaccgccccttgcttgcagccgggcactacaggacccgcttgctcacggtgctgtgccagggcgccccctgctggcgactagggcaactgcagggctctcttgcttagagtggtggccagcgccccctgctggcgccggggcactgcagggccctcttgcttactgtatagtggtggcacgccgcctgctggcagctagggacattgcagggtcctcttgctcaaggtgtagtggcagcacgcccacctgctggcagctggggacactgccgggCTGCATGTAACTTAATACCACAACCAGGCATAGGGGAAAGATTGGAGGAAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCAGTAAGTAGTTACCTGAGGCTGAGGAAGGAGAAGGGGATGCACTGTTGGGGAGGCAGCTGTAACTCAAAGCCTTAGCCTCTGTTCCCACGAAGGCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCTCCTGGACCAGTGATACACCCGGCACCCTGTCCTGGACACGCTGTTGGCCTGGATCTGAGCCCTGGTGGAGGTCAAAGCCACCTTTGGTTCTGCCATTGCTGCTGTGTGGAAGTTCACTCCTGCCTTTTCCTTTCCCTAGAGCCTCCACCACCCCGAGATCACATTTCTCACTGCCTTTTGTCTGCCCAGTTTCACCAGAAGTAGGCCTCTTCCTGACAGGCAGCTGCACCACTGCCTGGCGCTGTGCCCTTCCTTTGCTCTGCCCGCTGGAGACGGTGTTTGTCATGGGCCTGGTCTGCAGGGATCCTGCTACAAAGGTGAAACCCAGGAGAGTGTGGAGTCCAGAGTGTTGCCAGGACCCAGGCACAGGCATTAGTGCCCGTTGGAGAAAACAGGGGAATCCCGAAGAAATGGTGGGTCCTGGCCATCCGTGAGATCTTCCCAGGGCAGCTCCCCTCTGTGGAATCCAATCTG + +Loading Gene Data from GFF3 Files +--------------------------------- + +Now let's move on from "toy examples" to something a bit more useful: +reading gene annotations in the standard GFF3 format, a tab-separated +format that looks like this:: + + ##gff-version 3 + ##sequence-region ctg123 1 1497228 + ctg123 . gene 1000 9000 . + . ID=gene00001;Name=EDEN + ctg123 . TF_binding_site 1000 1012 . + . ID=tfbs00001;Parent=gene00001 + ctg123 . mRNA 1050 9000 . + . ID=mRNA00001;Parent=gene00001;Name=EDEN.1 + ctg123 . mRNA 1050 9000 . + . ID=mRNA00002;Parent=gene00001;Name=EDEN.2 + ctg123 . mRNA 1300 9000 . + . ID=mRNA00003;Parent=gene00001;Name=EDEN.3 + ctg123 . exon 1300 1500 . + . ID=exon00001;Parent=mRNA00003 + ctg123 . exon 1050 1500 . + . ID=exon00002;Parent=mRNA00001,mRNA00002 + ctg123 . exon 3000 3902 . + . ID=exon00003;Parent=mRNA00001,mRNA00003 + ctg123 . exon 5000 5500 . + . ID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003 + ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003 + ctg123 . CDS 1201 1500 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1 + ctg123 . CDS 3000 3902 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1 + ctg123 . CDS 5000 5500 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1 + ctg123 . CDS 7000 7600 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1 + ctg123 . CDS 1201 1500 . + 0 ID=cds00002;Parent=mRNA00002;Name=edenprotein.2 + ctg123 . CDS 5000 5500 . + 0 ID=cds00002;Parent=mRNA00002;Name=edenprotein.2 + ctg123 . CDS 7000 7600 . + 0 ID=cds00002;Parent=mRNA00002;Name=edenprotein.2 + ctg123 . CDS 3301 3902 . + 0 ID=cds00003;Parent=mRNA00003;Name=edenprotein.3 + ctg123 . CDS 5000 5500 . + 1 ID=cds00003;Parent=mRNA00003;Name=edenprotein.3 + ctg123 . CDS 7000 7600 . + 2 ID=cds00003;Parent=mRNA00003;Name=edenprotein.3 + ctg123 . CDS 3391 3902 . + 0 ID=cds00004;Parent=mRNA00003;Name=edenprotein.4 + ctg123 . CDS 5000 5500 . + 1 ID=cds00004;Parent=mRNA00003;Name=edenprotein.4 + ctg123 . CDS 7000 7600 . + 2 ID=cds00004;Parent=mRNA00003;Name=edenprotein.4 + +First we need a basic GFF3 row class that Pygr can use as annotation slice +info:: + + class GFF3Row(object): + def __init__(self, line): + cols = line.split() + self.type = cols[2] + self.id = cols[0] # sequence ID + self.start = int(cols[3]) - 1 # correct for 1-based coords + self.stop = int(cols[4]) + if cols[6] == '+': # convert to Pygr convention + self.orientation = 1 + elif cols[6] == '-': + self.orientation = -1 + else: + raise ValueError('Bad strand: %s' % cols[6]) + for s in cols[8].split(';'): # parse attributes + attr,val = s.split('=') + if ',' in val: + setattr(self, attr, val.split(',')) + else: + setattr(self, attr, val) + + class GFF3File(object): + def __init__(self, filename, genome): + d = {} + self.filename = filename + self.genome = genome + ifile = file(filename) + for line in ifile: # parse all the GFF3 lines + if line.startswith('#'): # ignore this line + continue + row = GFF3Row(line) + try: + d.setdefault(row.type, {})[row.ID] = row + except AttributeError: + pass + ifile.close() + for atype,sliceDB in d.items(): # create annotation DBs + adb = annotation.AnnotationDB(sliceDB, genome) + setattr(self, atype, adb) + + +The key fields this must provide to be used as slice info +are id, start, stop, and orientation. + +TO DO +----- + +* for each mRNA, make a list of its exons + +* create splicegraph + +* create NLMSAs that reverse the mapping? + +* how to map CDS? + +* save to worldbase + -- 2.11.4.GIT