perltidy; fixes per Peter Rice, re: duplicated accessions in the index and unmunged...
[bioperl-live.git] / Bio / ClusterIO.pm
blob4d2b7745818737205c205a2f5c4389ac031286a9
2 # BioPerl module for Bio::ClusterIO.pm
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Andrew Macgregor <andrew@anatomy.otago.ac.nz>
8 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
9 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
10 # http://anatomy.otago.ac.nz/meg
12 # You may distribute this module under the same terms as perl itself
14 # _history
16 # May 7, 2002 - changed from UniGene.pm to more generic ClusterIO.pm
17 # by Andrew Macgregor
19 # April 17, 2002 - Initial implementation by Andrew Macgregor
20 # POD documentation - main docs before the code
22 =head1 NAME
24 Bio::ClusterIO - Handler for Cluster Formats
26 =head1 SYNOPSIS
28 #NB: This example is unigene specific
30 use Bio::ClusterIO;
32 $stream = Bio::ClusterIO->new('-file' => "Hs.data",
33 '-format' => "unigene");
34 # note: we quote -format to keep older perl's from complaining.
36 while ( my $in = $stream->next_cluster() ) {
37 print $in->unigene_id() . "\n";
38 while ( my $sequence = $in->next_seq() ) {
39 print $sequence->accession_number() . "\n";
42 # Parsing errors are printed to STDERR.
44 =head1 DESCRIPTION
46 The ClusterIO module works with the ClusterIO format module to read
47 various cluster formats such as NCBI UniGene.
50 =head1 CONSTRUCTORS
52 =head2 Bio::ClusterIO-E<gt>new()
54 $str = Bio::ClusterIO->new(-file => 'filename',
55 -format=>$format);
57 The new() class method constructs a new Bio::ClusterIO object. The
58 returned object can be used to retrieve or print cluster
59 objects. new() accepts the following parameters:
61 =over 4
63 =item -file
65 A file path to be opened for reading.
67 =item -format
69 Specify the format of the file. Supported formats include:
71 unigene *.data UniGene build files.
72 dbsnp *.xml dbSNP XML files
74 If no format is specified and a filename is given, then the module
75 will attempt to deduce it from the filename. If this is unsuccessful,
76 the main UniGene build format is assumed.
78 The format name is case insensitive. 'UNIGENE', 'UniGene' and
79 'unigene' are all supported, as are dbSNP, dbsnp, and DBSNP
81 =back
83 =head1 OBJECT METHODS
85 See below for more detailed summaries. The main methods are:
87 =head2 $cluster = $str-E<gt>next_cluster()
89 Fetch the next cluster from the stream.
92 =head2 TIEHANDLE(), READLINE(), PRINT()
94 These I've left in here because they were in the SeqIO
95 module. Feedback appreciated. There they provide the tie interface.
96 See L<perltie> for more details.
98 =head1 FEEDBACK
100 =head2 Mailing Lists
102 User feedback is an integral part of the evolution of this
103 and other Bioperl modules. Send your comments and suggestions preferably
104 to one of the Bioperl mailing lists.
105 Your participation is much appreciated.
107 bioperl-l@bioperl.org - General discussion
108 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
110 =head2 Support
112 Please direct usage questions or support issues to the mailing list:
114 I<bioperl-l@bioperl.org>
116 rather than to the module maintainer directly. Many experienced and
117 reponsive experts will be able look at the problem and quickly
118 address it. Please include a thorough description of the problem
119 with code and data examples if at all possible.
121 =head2 Reporting Bugs
123 Report bugs to the Bioperl bug tracking system to help us keep track
124 the bugs and their resolution. Bug reports can be submitted via the
125 web:
127 https://redmine.open-bio.org/projects/bioperl/
129 =head1 AUTHOR - Andrew Macgregor
131 Email andrew@anatomy.otago.ac.nz
133 =head1 APPENDIX
135 The rest of the documentation details each of the object
136 methods. Internal methods are usually preceded with a _
138 =cut
141 # Let the code begin...
143 package Bio::ClusterIO;
145 use strict;
148 use base qw(Bio::Root::Root Bio::Root::IO);
152 =head2 new
154 Title : new
155 Usage : Bio::ClusterIO->new(-file => $filename, -format => 'format')
156 Function: Returns a new cluster stream
157 Returns : A Bio::ClusterIO::Handler initialised with the appropriate format
158 Args : -file => $filename
159 -format => format
161 =cut
164 my $entry = 0;
166 sub new {
167 my ($caller,@args) = @_;
168 my $class = ref($caller) || $caller;
170 # or do we want to call SUPER on an object if $caller is an
171 # object?
172 if( $class =~ /Bio::ClusterIO::(\S+)/ ) {
173 my ($self) = $class->SUPER::new(@args);
174 $self->_initialize(@args);
175 return $self;
176 } else {
178 my %param = @args;
179 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
180 my $format = $param{'-format'} ||
181 $class->_guess_format( $param{-file} || $ARGV[0] );
182 $format = "\L$format"; # normalize capitalization to lower case
184 return unless( $class->_load_format_module($format) );
185 return "Bio::ClusterIO::$format"->new(@args);
189 =head2 format
191 Title : format
192 Usage : $format = $stream->format()
193 Function: Get the cluster format
194 Returns : cluster format
195 Args : none
197 =cut
199 # format() method inherited from Bio::Root::IO
202 # _initialize is chained for all ClusterIO classes
204 sub _initialize {
205 my($self, @args) = @_;
206 # initialize the IO part
207 $self->_initialize_io(@args);
210 =head2 next_cluster
212 Title : next_cluster
213 Usage : $cluster = $stream->next_cluster()
214 Function: Reads the next cluster object from the stream and returns it.
215 Returns : a L<Bio::ClusterI> compliant object
216 Args : none
219 =cut
221 sub next_cluster {
222 my ($self, $seq) = @_;
223 $self->throw("Sorry, you cannot read from a generic Bio::ClusterIO object.");
226 =head2 cluster_factory
228 Title : cluster_factory
229 Usage : $obj->cluster_factory($newval)
230 Function: Get/set the object factory to use for creating the cluster
231 objects.
232 Example :
233 Returns : a L<Bio::Factory::ObjectFactoryI> compliant object
234 Args : on set, new value (a L<Bio::Factory::ObjectFactoryI>
235 compliant object or undef, optional)
238 =cut
240 sub cluster_factory{
241 my $self = shift;
243 return $self->{'cluster_factory'} = shift if @_;
244 return $self->{'cluster_factory'};
247 =head2 object_factory
249 Title : object_factory
250 Usage : $obj->object_factory($newval)
251 Function: This is an alias to cluster_factory with a more generic name.
252 Example :
253 Returns : a L<Bio::Factory::ObjectFactoryI> compliant object
254 Args : on set, new value (a L<Bio::Factory::ObjectFactoryI>
255 compliant object or undef, optional)
258 =cut
260 sub object_factory{
261 return shift->cluster_factory(@_);
264 =head2 _load_format_module
266 Title : _load_format_module
267 Usage : *INTERNAL ClusterIO stuff*
268 Function: Loads up (like use) a module at run time on demand
269 Example :
270 Returns :
271 Args :
273 =cut
275 sub _load_format_module {
276 my ($self,$format) = @_;
277 my $module = "Bio::ClusterIO::" . $format;
278 my $ok;
280 eval {
281 $ok = $self->_load_module($module);
283 if ( $@ ) {
284 print STDERR <<END;
285 $self: could not load $format - for more details on supported formats please see the ClusterIO docs
286 Exception $@
290 return $ok;
293 =head2 _guess_format
295 Title : _guess_format
296 Usage : $obj->_guess_format($filename)
297 Function: guess format based on file suffix
298 Example :
299 Returns : guessed format of filename (lower case)
300 Args :
301 Notes : formats that _filehandle() will guess include unigene and dbsnp
303 =cut
305 sub _guess_format {
306 my $class = shift;
307 return unless $_ = shift;
308 return 'unigene' if /\.(data)$/i;
309 return 'dbsnp' if /\.(xml)$/i;
312 sub DESTROY {
313 my $self = shift;
315 $self->close();
318 # I need some direction on these!! The module works so I haven't fiddled with them!
320 sub TIEHANDLE {
321 my ($class,$val) = @_;
322 return bless {'seqio' => $val}, $class;
325 sub READLINE {
326 my $self = shift;
327 return $self->{'seqio'}->next_seq() unless wantarray;
328 my (@list, $obj);
329 push @list, $obj while $obj = $self->{'seqio'}->next_seq();
330 return @list;
333 sub PRINT {
334 my $self = shift;
335 $self->{'seqio'}->write_seq(@_);