1 # BioPerl module for Bio::ClusterIO::unigene
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Andrew Macgregor <andrew at cbbc.murdoch.edu.au>
7 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
8 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
9 # http://meg.otago.ac.nz
11 # You may distribute this module under the same terms as perl itself
14 # April 17, 2002 - Initial implementation by Andrew Macgregor
16 # POD documentation - main docs before the code
20 Bio::ClusterIO::unigene - UniGene input stream
24 Do not use this module directly. Use it via the Bio::ClusterIO class.
28 This object reads from UniGene *.data files downloaded from
29 ftp://ftp.ncbi.nih.gov/repository/UniGene/. It does not download or
30 decompress the file; you have to do that yourself.
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to one
38 of the Bioperl mailing lists. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 responsive experts will be able to look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via the
60 https://redmine.open-bio.org/projects/bioperl/
62 =head1 AUTHORS - Andrew Macgregor
64 Email: andrew at cbbc.murdoch.edu.au
69 The rest of the documentation details each of the object
70 methods. Internal methods are usually preceded with a _
75 # Let the code begin...
77 package Bio
::ClusterIO
::unigene
;
80 use Bio
::Cluster
::UniGene
;
81 use Bio
::Cluster
::ClusterFactory
;
83 use base
qw(Bio::ClusterIO);
86 ID
=> q
/ID\s+(\w{2,3}\.\d+)/,
87 TITLE
=> q
/TITLE\s+(\S.*)/,
88 GENE
=> q
/GENE\s+(\S.*)/,
89 CYTOBAND
=> q
/CYTOBAND\s+(\S.*)/,
90 MGI
=> q
/MGI\s+(\S.*)/,
91 LOCUSLINK
=> q
/LOCUSLINK\s+(\S.*)/,
92 HOMOL
=> q
/HOMOL\s+(\S.*)/,
93 EXPRESS
=> q
/EXPRESS\s+(\S.*)/,
94 RESTR_EXPR
=> q
/RESTR_EXPR\s+(\S.*)/,
95 GNM_TERMINUS
=> q
/GNM_TERMINUS\s+(\S.*)/,
96 CHROMOSOME
=> q
/CHROMOSOME\s+(\S.*)/,
97 STS
=> q
/STS\s+(\S.*)/,
98 TXMAP
=> q
/TXMAP\s+(\S.*)/,
99 PROTSIM
=> q
/PROTSIM\s+(\S.*)/,
100 SCOUNT
=> q
/SCOUNT\s+(\S.*)/,
101 SEQUENCE
=> q
/SEQUENCE\s+(\S.*)/,
102 ACC
=> q
/ACC=(\w+)(\.\d+)?/,
103 NID
=> q
/NID=\s*(\S.*)/,
104 PID
=> q
/PID=\s*(\S.*)/,
105 CLONE
=> q
/CLONE=\s*(\S.*)/,
106 END => q
/END=\s*(\S.*)/,
107 LID
=> q
/LID=\s*(\S.*)/,
108 MGC
=> q
/MGC=\s*(\S.*)/,
109 SEQTYPE
=> q
/SEQTYPE=\s*(\S.*)/,
110 TRACE
=> q
/TRACE=\s*(\S.*)/,
111 PERIPHERAL
=> q
/PERIPHERAL=\s*(\S.*)/,
115 # we set the right factory here
117 my($self, @args) = @_;
119 $self->SUPER::_initialize
(@args);
120 if(! $self->cluster_factory()) {
121 $self->cluster_factory(Bio
::Cluster
::ClusterFactory
->new(
122 -type
=> 'Bio::Cluster::UniGene'));
129 Usage : $unigene = $stream->next_cluster()
130 Function: returns the next unigene in the stream
131 Returns : Bio::Cluster::UniGene object
139 return unless my $entry = $self->_readline;
141 # set up the variables we'll need
142 my (%unigene,@express,@locuslink,@chromosome,
143 @sts,@txmap,@protsim,@sequence);
148 # add whitespace parsing and precompile regexes
149 #foreach (values %line_is) {
150 # $_ =~ s/\s+/\\s+/g;
151 # print STDERR "Regex is
$_\n";
155 #$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/;
157 # run each line in an entry against the regexes
158 foreach my $line (split /\n/, $entry) {
159 #print STDERR "Wanting to match
$line\n";
160 if ($line =~ /$line_is{ID}/gcx) {
163 elsif ($line =~ /$line_is{TITLE}/gcx ) {
164 #print STDERR "MATCHED with
[$1]\n";
165 $unigene{TITLE} = $1;
167 elsif ($line =~ /$line_is{GENE}/gcx) {
170 elsif ($line =~ /$line_is{CYTOBAND}/gcx) {
171 $unigene{CYTOBAND} = $1;
173 elsif ($line =~ /$line_is{MGI}/gcx) {
176 elsif ($line =~ /$line_is{LOCUSLINK}/gcx) {
177 @locuslink = split /;/, $1;
179 elsif ($line =~ /$line_is{HOMOL}/gcx) {
180 $unigene{HOMOL} = $1;
182 elsif ($line =~ /$line_is{EXPRESS}/gcx) {
184 # remove initial semicolon if present
186 @express = split /\s*;/, $express;
188 elsif ($line =~ /$line_is{RESTR_EXPR}/gcx) {
189 $unigene{RESTR_EXPR} = $1;
191 elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) {
192 $unigene{GNM_TERMINUS} = $1;
194 elsif ($line =~ /$line_is{CHROMOSOME}/gcx) {
195 push @chromosome, $1;
197 elsif ($line =~ /$line_is{TXMAP}/gcx) {
200 elsif ($line =~ /$line_is{STS}/gcx) {
203 elsif ($line =~ /$line_is{PROTSIM}/gcx) {
206 elsif ($line =~ /$line_is{SCOUNT}/gcx) {
207 $unigene{SCOUNT} = $1;
209 elsif ($line =~ /$line_is{SEQUENCE}/gcx) {
210 # parse into each sequence line
212 # add unigene id to each seq
213 #$seq->{unigene_id} = $unigene{ID};
214 my @items = split(/;/, $1);
216 if (/$line_is{ACC}/gcx) {
218 # remove leading dot if version pattern matched
219 $seq->{version} = substr($2,1) if defined $2;
221 elsif (/$line_is{NID}/gcx) {
224 elsif (/$line_is{PID}/gcx) {
227 elsif (/$line_is{CLONE}/gcx) {
230 elsif (/$line_is{END}/gcx) {
233 elsif (/$line_is{LID}/gcx) {
236 elsif (/$line_is{MGC}/gcx) {
239 elsif (/$line_is{SEQTYPE}/gcx) {
240 $seq->{seqtype} = $1;
242 elsif (/$line_is{TRACE}/gcx) {
245 elsif (/$line_is{PERIPHERAL}/gcx) {
246 $seq->{peripheral} = $1;
249 push @sequence, $seq;
251 elsif ($line =~ /$line_is{DELIMITER}/gcx) {
252 # at the end of the record, add data to the object
253 $UGobj = $self->cluster_factory->create_object(
254 -display_id => $unigene{ID},
255 -description => $unigene{TITLE},
256 -size => $unigene{SCOUNT},
257 -members => \@sequence);
258 $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE});
259 $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND});
260 $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI});
261 $UGobj->locuslink(\@locuslink);
262 $UGobj->homol($unigene{HOMOL}) if defined ($unigene{HOMOL});
263 $UGobj->express(\@express);
264 $UGobj->restr_expr($unigene{RESTR_EXPR}) if defined ($unigene{RESTR_EXPR});
265 $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS});
266 $UGobj->chromosome(\@chromosome);
268 $UGobj->txmap(\@txmap);
269 $UGobj->protsim(\@protsim);