mapping tutorial just covers mappings, so removed reference to graphs
[pygr.git] / pygr / apps / splicegraph.py
blob4fd563746abda2a6c8458ca5be1ff91a31ea9997
2 from pygr.sqlgraph import *
3 from pygr.sequence import *
4 from pygr.seqdb import *
def _linkAlternativeExons(groups, graph, attr):
    """Link every pair of distinct exons inside each group, in both directions.

    groups -- iterable of lists; each list holds the exons sharing one boundary
    graph  -- graph that receives each linked exon as a node (via +=)
    attr   -- name of the exon attribute ('alt5' or 'alt3') that receives the
              partner exon (via +=)
    """
    for exons in groups:
        for i, e1 in enumerate(exons):
            for e2 in exons[i + 1:]:  # EACH UNORDERED PAIR IS VISITED ONCE
                if e1 != e2:
                    graph += e1
                    graph += e2
                    for src, dst in ((e1, e2), (e2, e1)):
                        # SAME STEPS AS "src.<attr> += dst": GET, +=, STORE BACK
                        edges = getattr(src, attr)
                        edges += dst
                        setattr(src, attr, edges)


def buildClusterSpliceGraph(c, alt5, alt3):
    """use exon/splice start and end positions to build splice graph for a cluster c.
    Also finds exons that share same start (but differ at end: alt5), or
    share the same end (but differ at start: alt3).

    c    -- cluster object providing iterable c.exons and c.splices
    alt5 -- graph receiving exons that share a genomic_start
    alt3 -- graph receiving exons that share a genomic_end

    A splice s connects exon e1 to exon e2 when e1.genomic_end equals
    s.ver_gen_start and e2.genomic_start equals s.ver_gen_end.  The splice is
    stored as the edge value e1.next[e2], and the exon pair is stored on the
    splice as s.exons; if several exon pairs match one splice, s.exons keeps
    only the last pair processed.  Returns None; all results are stored on
    the objects passed in.
    """
    start = {}  # genomic_start -> LIST OF EXONS BEGINNING THERE
    end = {}  # genomic_end -> LIST OF EXONS ENDING THERE
    for e in c.exons:
        start.setdefault(e.genomic_start, []).append(e)
        end.setdefault(e.genomic_end, []).append(e)
    for s in c.splices:
        exons1 = end.get(s.ver_gen_start, ())  # EXONS ENDING AT SPLICE START
        exons2 = start.get(s.ver_gen_end, ())  # EXONS STARTING AT SPLICE END
        for e1 in exons1:
            for e2 in exons2:
                e1.next[e2] = s  # SAVE SPLICE AS EDGE INFO...
                s.exons = (e1, e2)  # SAVE EXONS DIRECTLY ON THE SPLICE OBJECT
    _linkAlternativeExons(start.values(), alt5, 'alt5')
    _linkAlternativeExons(end.values(), alt3, 'alt3')
def loadCluster(c,exon_forms,splices,clusterExons,clusterSplices,spliceGraph,alt5,alt3):
    """Load the exon forms and splices of one cluster and build its splice graph.

    c              -- cluster object; its id selects the rows to fetch
    exon_forms     -- table queried for exon forms with this cluster_id
    splices        -- table queried for splices with this cluster_id
    clusterExons   -- graph to which the cluster is added as a node
    clusterSplices -- graph to which the cluster is added as a node
    spliceGraph    -- graph to which every fetched exon form is added
    alt5, alt3     -- graphs passed on to buildClusterSpliceGraph
    """
    cluster_filter = 'where cluster_id=%s'

    # REGISTER THE CLUSTER ITSELF AS A NODE IN BOTH CLUSTER GRAPHS
    clusterExons += c
    clusterSplices += c

    # ATTACH EACH EXON FORM TO THE CLUSTER AND TO THE SPLICE GRAPH
    for exon in exon_forms.select(cluster_filter, (c.id,)):
        c.exons += exon
        spliceGraph += exon

    # ATTACH EACH SPLICE TO THE CLUSTER
    for splice in splices.select(cluster_filter, (c.id,)):
        c.splices += splice

    buildClusterSpliceGraph(c, alt5, alt3)
class ExonForm(TupleO, SeqPath):
    """Exon row object that is also a sequence interval.

    Combines tuple-backed attribute access (TupleO) with SeqPath.
    NOTE: __init__ reads the module-level name g and indexes it by
    cluster_id, so g must be bound before any instance is created.
    """

    def __init__(self, t):
        """Initialize from row tuple t, then as an interval on g[cluster_id]."""
        # ATTRIBUTE ACCESS MUST BE SET UP FIRST: THE LINES BELOW READ
        # cluster_id, genomic_start AND genomic_end FROM self
        TupleO.__init__(self, t)
        parent_seq = g[self.cluster_id]
        interval_start = self.genomic_start - 1
        interval_stop = self.genomic_end
        SeqPath.__init__(self, parent_seq, interval_start, interval_stop)

    def __getattr__(self, attr):
        """Try TupleO's __getattr__ first; on AttributeError use SeqPath's."""
        try:
            value = TupleO.__getattr__(self, attr)
        except AttributeError:
            value = SeqPath.__getattr__(self, attr)
        return value
class Splice(TupleO):
    """Row object class for the splice table; adds nothing to TupleO."""
def loadSpliceGraph(jun03,cluster_t,exon_t,splice_t,genomic_seq_t,
                    mrna_seq_t=None,protein_seq_t=None,loadAll=True):
    """
    Build a splice graph from the specified SQL tables representing gene clusters,
    exon forms, and splices.  Each table must be specified as a DB.TABLENAME string.
    These tables are loaded into memory.
    The splice graph is built based on exact match of exon ends and splice ends.
    In addition, also builds alt5Graph (exons that match at start, but differ at end)
    and alt3Graph (exons that match at end, but differ at start).

    Loads all cluster, exon and splice data if loadAll is True.

    jun03         -- container indexed by table name to obtain each table object
    cluster_t     -- name of the cluster table
    exon_t        -- name of the exon form table
    splice_t      -- name of the splice table
    genomic_seq_t -- name of the genomic sequence table
    mrna_seq_t    -- optional name of the mRNA sequence table
    protein_seq_t -- optional name of the protein sequence table

    Returns an 11-tuple:
    (clusters, exon_forms, splices, g, spliceGraph, alt5, alt3,
     mrna, protein, clusterExons, clusterSplices)
    where g is the genomic sequence table, and mrna / protein are None when
    the corresponding table name was not given.

    Side effect: binds the module-level name g to the genomic sequence table,
    because ExonForm.__init__ looks it up there.
    """
    # ExonForm.__init__ REFERS TO g AS A GLOBAL NAME; WITHOUT THIS DECLARATION
    # g WOULD ONLY BE A LOCAL HERE AND CREATING EXON ROWS WOULD RAISE NameError
    global g

    # CREATE OUR GRAPHS
    clusterExons = dictGraph()
    clusterSplices = dictGraph()
    spliceGraph = dictGraph()
    alt5 = dictGraph()
    alt3 = dictGraph()

    class YiGenomicSequence(DNASQLSequence):
        def __len__(self):
            return self._select('length(seq)')  # USE SEQ LENGTH FROM DATABASE
    g = jun03[genomic_seq_t]
    g.objclass(YiGenomicSequence)  # FORCE GENOMIC SEQ TABLE TO USE TRANSPARENT ACCESS

    if mrna_seq_t is not None:  # ONLY PROCESS THIS IF USER PASSED US AN MRNA TABLE
        mrna = jun03[mrna_seq_t]
        mrna.objclass(SQLSequence)  # FORCE mRNA SEQ TABLE TO USE TRANSPARENT ACCESS
    else:
        mrna = None

    if protein_seq_t is not None:  # ONLY PROCESS THIS IF USER PASSED US A PROTEIN TABLE
        class YiProteinSQLSequence(ProteinSQLSequence):
            def __len__(self):
                return self.protein_length  # USE SEQ LENGTH FROM DATABASE
        protein = jun03[protein_seq_t]
        protein.objclass(YiProteinSQLSequence)  # FORCE PROTEIN SEQ TABLE TO USE TRANSPARENT ACCESS
        protein.addAttrAlias(seq='protein_seq')  # ALIAS protein_seq TO APPEAR AS seq
    else:
        protein = None

    exon_forms = jun03[exon_t]
    # BIND THE next / alt5 / alt3 ATTRIBUTES OF EXONS TO THE GRAPHS BUILT ABOVE
    ExonForm.__class_schema__ = SchemaDict(((spliceGraph,'next'),(alt5,'alt5'),(alt3,'alt3')))
    exon_forms.objclass(ExonForm)  # BIND THIS CLASS TO CONTAINER, AS THE CLASS TO USE AS "ROW OBJECTS"

    if loadAll:
        print('Loading %s...' % exon_forms)
        exon_forms.load(ExonForm)

    clusters = jun03[cluster_t]
    class Cluster(TupleO):
        __class_schema__ = SchemaDict(((clusterExons,'exons'),(clusterSplices,'splices')))
    clusters.objclass(Cluster)  # BIND THIS CLASS TO CONTAINER, AS THE CLASS TO USE AS "ROW OBJECTS"
    if loadAll:
        print('Loading %s...' % clusters)
        clusters.load(Cluster)

    splices = jun03[splice_t]
    splices.objclass(Splice)  # BIND THIS CLASS TO CONTAINER, AS THE CLASS TO USE AS "ROW OBJECTS"
    if loadAll:
        print('Loading %s...' % splices)
        splices.load(Splice)

##    print 'Saving alignment of protein to mrna isoforms...'
##    mrna_protein=PathMapping2()
##    for form_id in protein:
##        p=protein[form_id]
##        m=mrna[form_id]
##        start=3*(p.mRNA_start-1)+int(p.reading_frame)
##        end=start+3*p.protein_length
##        mrna_protein[p]=m[start:end]

    print('Adding clusters to graph...')
    for c in clusters.values():  # ADD CLUSTERS AS NODES TO GRAPH
        clusterExons += c
        clusterSplices += c

    print('Adding exons to graph...')
    for e in exon_forms.values():
        c = clusters[e.cluster_id]
        try:
            c.exons += e
            spliceGraph += e
        except IndexError:
            pass  # BAD EXON: EMPTY SEQUENCE INTERVAL... IGNORE IT

    print('Adding splices to graph...')
    for s in splices.values():
        try:
            c = clusters[s.cluster_id]
        except KeyError:  # WEIRD, ONE SPLICE WITH BLANK (NOT NULL) VALUE!
            pass
        else:
            c.splices += s

    print('Building splice graph...')
    for c in clusters.values():
        buildClusterSpliceGraph(c, alt5, alt3)

    return clusters,exon_forms,splices,g,spliceGraph,alt5,alt3,mrna,protein,\
           clusterExons,clusterSplices