2 from pygr
.sqlgraph
import *
3 from pygr
.sequence
import *
4 from pygr
.seqdb
import *
7 def buildClusterSpliceGraph(c
,alt5
,alt3
):
8 """use exon/splice start and end positions to build splice graph for a cluster c.
9 Also finds exons that share same start (but differ at end: alt5), or
10 share the same end (but differ at start: alt3).
16 if e
.genomic_start
not in start
:
17 start
[e
.genomic_start
]=[]
18 start
[e
.genomic_start
].append(e
)
19 if e
.genomic_end
not in end
:
21 end
[e
.genomic_end
].append(e
)
24 exons1
=end
[s
.ver_gen_start
]
28 exons2
=start
[s
.ver_gen_end
]
33 e1
.next
[e2
]=s
# SAVE SPLICE AS EDGE INFO...
34 s
.exons
=(e1
,e2
) # SAVE EXONS DIRECTLY ON THE SPLICE OBJECT
35 for exons
in start
.values():
43 for exons
in end
.values():
53 def loadCluster(c
,exon_forms
,splices
,clusterExons
,clusterSplices
,spliceGraph
,alt5
,alt3
):
54 """Loads data for a single cluster, and builds it into a splice graph."""
57 for e
in exon_forms
.select('where cluster_id=%s',(c
.id,)):
60 for s
in splices
.select('where cluster_id=%s',(c
.id,)):
62 buildClusterSpliceGraph(c
,alt5
,alt3
)
class ExonForm(TupleO, SeqPath):
    """Exon-form row object that is also a sequence interval.

    Inherits tuple-backed attribute access from TupleO and interval
    behaviour from SeqPath.

    NOTE(review): ``g`` is not defined inside this class; it is looked up
    as a free name each time ``__init__`` runs and is indexed by
    ``cluster_id``.  Presumably it is the genomic sequence container --
    confirm where it gets bound before relying on this class.
    """

    def __init__(self, t):
        """Initialise both base classes from the row tuple ``t``."""
        # TupleO must be initialised first: the attribute reads below
        # (cluster_id, genomic_start, genomic_end) depend on it.
        TupleO.__init__(self, t)
        parent_seq = g[self.cluster_id]
        # The interval start is genomic_start shifted down by one, while
        # genomic_end is passed through unchanged.
        # NOTE(review): looks like a 1-based -> 0-based conversion; confirm.
        interval_start = self.genomic_start - 1
        interval_stop = self.genomic_end
        SeqPath.__init__(self, parent_seq, interval_start, interval_stop)

    def __getattr__(self, attr):
        """Resolve ``attr`` through TupleO first, then through SeqPath.

        Both parent classes define ``__getattr__``, so each one has to be
        tried explicitly.  An AttributeError raised by the SeqPath lookup
        propagates to the caller.
        """
        try:
            found = TupleO.__getattr__(self, attr)
        except AttributeError:
            found = SeqPath.__getattr__(self, attr)
        return found
82 def loadSpliceGraph(jun03
,cluster_t
,exon_t
,splice_t
,genomic_seq_t
,
83 mrna_seq_t
=None,protein_seq_t
=None,loadAll
=True):
85 Build a splice graph from the specified SQL tables representing gene clusters,
86 exon forms, and splices. Each table must be specified as a DB.TABLENAME string.
87 These tables are loaded into memory.
88 The splice graph is built based on exact match of exon ends and splice ends.
89 In addition, also builds alt5Graph (exons that match at start, but differ at end)
90 and alt3Graph (exons that match at end, but differ at start).
92 Loads all cluster, exon and splice data if loadAll is True.
94 Returns tuple: clusters,exons,splices,spliceGraph,alt5Graph,alt3Graph
98 clusterExons
=dictGraph()
99 clusterSplices
=dictGraph()
100 spliceGraph
=dictGraph()
105 class YiGenomicSequence(DNASQLSequence
):
106 def __len__(self
): return self
._select
('length(seq)') # USE SEQ LENGTH FROM DATABASE
107 g
=jun03
[genomic_seq_t
]
108 g
.objclass(YiGenomicSequence
) # FORCE GENOMIC SEQ TABLE TO USE TRANSPARENT ACCESS
110 if mrna_seq_t
is not None: # ONLY PROCESS THIS IF USER PASSED US AN MRNA TABLE
111 mrna
=jun03
[mrna_seq_t
]
112 mrna
.objclass(SQLSequence
) # FORCE mRNA SEQ TABLE TO USE TRANSPARENT ACCESS
116 if protein_seq_t
is not None: # ONLY PROCESS THIS IF USER PASSED US A PROTEIN TABLE
117 class YiProteinSQLSequence(ProteinSQLSequence
):
118 def __len__(self
): return self
.protein_length
# USE SEQ LENGTH FROM DATABASE
119 protein
=jun03
[protein_seq_t
]
120 protein
.objclass(YiProteinSQLSequence
) # FORCE PROTEIN SEQ TABLE TO USE TRANSPARENT ACCESS
121 protein
.addAttrAlias(seq
='protein_seq') # ALIAS protein_seq TO APPEAR AS seq
125 exon_forms
=jun03
[exon_t
]
126 ExonForm
.__class
_schema
__=SchemaDict(((spliceGraph
,'next'),(alt5
,'alt5'),(alt3
,'alt3')))
127 exon_forms
.objclass(ExonForm
) # BIND THIS CLASS TO CONTAINER, AS THE CLASS TO USE AS "ROW OBJECTS"
130 print 'Loading %s...' % exon_forms
131 exon_forms
.load(ExonForm
)
133 clusters
=jun03
[cluster_t
]
134 class Cluster(TupleO
):
135 __class_schema__
=SchemaDict(((clusterExons
,'exons'),(clusterSplices
,'splices')))
136 clusters
.objclass(Cluster
) # BIND THIS CLASS TO CONTAINER, AS THE CLASS TO USE AS "ROW OBJECTS"
138 print 'Loading %s...' % clusters
139 clusters
.load(Cluster
)
141 splices
=jun03
[splice_t
]
142 splices
.objclass(Splice
) # BIND THIS CLASS TO CONTAINER, AS THE CLASS TO USE AS "ROW OBJECTS"
144 print 'Loading %s...' % splices
147 ## print 'Saving alignment of protein to mrna isoforms...'
148 ## mrna_protein=PathMapping2()
149 ## for form_id in protein:
150 ## p=protein[form_id]
152 ## start=3*(p.mRNA_start-1)+int(p.reading_frame)
153 ## end=start+3*p.protein_length
154 ## mrna_protein[p]=m[start:end]
156 print 'Adding clusters to graph...'
157 for c
in clusters
.values(): # ADD CLUSTERS AS NODES TO GRAPH
161 print 'Adding exons to graph...'
162 for e
in exon_forms
.values():
163 c
=clusters
[e
.cluster_id
]
168 pass # BAD EXON: EMPTY SEQUENCE INTERVAL... IGNORE IT
170 print 'Adding splices to graph...'
171 for s
in splices
.values():
173 c
=clusters
[s
.cluster_id
]
174 except KeyError: # WIERD, ONE SPLICE WITH BLANK (NOT NULL) VALUE!
179 print 'Building splice graph...'
180 for c
in clusters
.values():
181 buildClusterSpliceGraph(c
,alt5
,alt3
)
183 return clusters
,exon_forms
,splices
,g
,spliceGraph
,alt5
,alt3
,mrna
,protein
,\
184 clusterExons
,clusterSplices