maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / DB / Taxonomy / greengenes.pm
blob66b61d60439d4a72b5f9b847bbe23887645ba65f
2 # BioPerl module for Bio::DB::Taxonomy::greengenes
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Florent Angly <florent.angly@gmail.com>
8 # Copyright Florent Angly
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Taxonomy::greengenes - Use the Greengenes taxonomy
18 =head1 SYNOPSIS
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(
23 -source => 'greengenes',
24 -taxofile => 'taxonomy_16S_candiv_gg_2011_1.txt'
25 );
27 =head1 DESCRIPTION
29 I<This module is in beta. Its interface or its results may change in a future update.>
31 Bio::DB::Taxonomy::greengenes is an implementation of Bio::DB::Taxonomy which
32 stores and accesses the Greengenes taxonomy of Bacteria and Archaea. Internally,
33 it keeps the taxonomy in memory by using Bio::DB::Taxonomy::list. As a
34 consequence, note that the IDs assigned to the taxonomy nodes, e.g. gg123, are
35 arbitrary, contrary to the pre-defined IDs that NCBI assigns to taxons.
37 The latest release of the Greengenes taxonomy (2011) contains about 4,600 taxa
38 and occupies about 4MB of memory once parsed into a Bio::DB::Taxonomy::greengenes
39 object. The taxonomy files taxonomy_16S_all_gg_2011_1.txt and
40 taxonomy_16S_candiv_gg_2011_1.txt that this module can use are available from
41 L<http://www.secondgenome.com/go/2011-greengenes-taxonomy/>.
43 =head1 FEEDBACK
45 =head2 Mailing Lists
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 I<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 responsive experts will be able to look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via
69 the web:
71 https://github.com/bioperl/bioperl-live/issues
73 =head1 AUTHOR - Florent Angly
75 florent.angly@gmail.com
77 =head1 APPENDIX
79 The rest of the documentation details each of the object methods.
80 Internal methods are usually preceded with a _
82 =cut
84 # Let the code begin...
86 package Bio::DB::Taxonomy::greengenes;
88 use strict;
89 use base qw(Bio::DB::Taxonomy Bio::DB::Taxonomy::list);
91 $Bio::DB::Taxonomy::list::prefix = 'gg';
94 =head2 new
96 Title : new
97 Usage : my $obj = Bio::DB::Taxonomy::greengenes->new();
98 Function: Builds a new Bio::DB::Taxonomy::greengenes object
99 Returns : an instance of Bio::DB::Taxonomy::greengenes
100 Args : -taxofile => name of the file containing the taxonomic information,
101 typically 'taxonomy_16S_candiv_gg_2011_1.txt' (mandatory)
103 =cut
sub new {
    # Overrides the constructor inherited from Bio::DB::Taxonomy.
    my ($class, @args) = @_;

    my $self       = $class->SUPER::new(@args);
    my ($taxofile) = $self->_rearrange([qw(TAXOFILE)], @args);

    # Without a taxonomy file there is nothing to parse: hand back the
    # object exactly as the parent constructor produced it.
    return $self unless $taxofile;

    # With a taxonomy file, the object handed back to the caller is the
    # one produced by _build_taxonomy().
    my $taxonomy = $self->_build_taxonomy($taxofile);
    return $taxonomy;
}
# Parse a Greengenes taxonomy file and load its lineages into a new
# Bio::DB::Taxonomy::list object.
#
# Args    : $taxofile - name of the tab-separated taxonomy file. The first
#                       column is the prokMSA id, the second the
#                       semicolon-separated lineage, e.g.
#                       348902<TAB>k__Bacteria; p__Bacteroidetes; c__Bacteroidia
# Returns : the populated Bio::DB::Taxonomy::list object
# Throws  : through $self->throw() when the file cannot be opened
sub _build_taxonomy {
    my ($self, $taxofile) = @_;

    # Rank assigned to each successive level of a lineage.
    my @all_ranks = qw(kingdom phylum class order family genus species);

    my $taxonomy = Bio::DB::Taxonomy::list->new();

    open my $fh, '<', $taxofile or $self->throw("Could not read file '$taxofile': $!");

    # Seeding the "previous" string with the second column of the header
    # (prokMSA_id<TAB>taxonomy) makes the duplicate check below skip the
    # header line.
    my $prev_taxo_string = 'taxonomy';

    # Parse taxonomy lines. Example:
    # 348902	k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__Bacteroides plebeius
    LINE:
    while (my $line = <$fh>) {
        # Strip LF as well as CRLF line endings. A plain chomp would leave
        # a trailing "\r" that defeats both the header skip and the
        # trimming of empty trailing ranks.
        $line =~ s/[\r\n]+\z//;

        my ($prokmsa_id, $taxo_string) = split /\t/, $line;

        # Blank lines and lines without a taxonomy column carry no lineage.
        next LINE if not defined $taxo_string;

        # Skip taxonomy string already seen on previous line (much faster!)
        next LINE if $taxo_string eq $prev_taxo_string;
        $prev_taxo_string = $taxo_string;

        # Remove ambiguous taxons, i.e. go from:
        #    k__Archaea; p__pMC2A384; c__; o__; f__; g__; s__
        # to:
        #    k__Archaea; p__pMC2A384
        my @names = split /;\s*/, $taxo_string;
        while ( @names && ($names[-1] || '') =~ m/__\z/ ) {
            pop @names;
        }

        next LINE if @names < 1;

        # NOTE(review): a lineage deeper than the seven known ranks gets
        # undef ranks for the extra levels, as it always did.
        $taxonomy->add_lineage(
            -ranks => [ @all_ranks[0 .. $#names] ],
            -names => \@names,
        );
    }

    close $fh;

    return $taxonomy;
}