Bio/AlignIO/arp.pm

   1 #
   2 # BioPerl module for Bio::AlignIO::arp
   3 #
   4 # Copyright Chris Fields
   5 #
   6 # You may distribute this module under the same terms as perl itself
   7 # POD documentation - main docs before the code
   8
   9 =head1 NAME
  10
  11 Bio::AlignIO::arp - ARP MSA Sequence input/output stream
  12
  13 =head1 SYNOPSIS
  14
  15 Do not use this module directly.  Use it via the L<Bio::AlignIO>
  16 class.
  17
  18 =head1 DESCRIPTION
  19
  20 This object can create L<Bio::SimpleAlign> objects from
  21 ARP flat files.  These are typically configuration-like data files
  22 for the program Arlequin.  For more information, see:
  23
  24   http://lgb.unige.ch/arlequin/
  25
  26 For the moment, this retains the allele sequence data in the DATA section and
  27 inserts them into SimpleAlign objects. ARP files that contain other data (RFLP,
  28 etc.) are not expected to parse properly.  Also, if the DNA data is actually SNP
  29 data, then the LocatableSeq object instantiation will throw an error.
  30
  31 This is now set up as a generic parser (i.e. it parses everything) and
  32 collects as much data as possible into the SimpleAlign object.  The following
  33 is a general mapping of where data can be found:
  34
  35     Tag        SimpleAlign
  36                Method
  37     ----------------------------------------------------------------------
  38     Title      description
  39     SampleName id
  40     ----------------------------------------------------------------------
  41
  42     Tag        Bio::Annotation   TagName                    Bio::Annotation
  43                Class                                        Parameters
  44     ----------------------------------------------------------------------
  45      NE        SimpleValue       pfam_family_accession      value
  46      NL        SimpleValue       sequence_start_stop        value
  47      SS        SimpleValue       sec_structure_source       value
  48      BM        SimpleValue       build_model                value
  49      RN        Reference         reference                  *
  50     ----------------------------------------------------------------------
  51   * RN is generated based on the number of Bio::Annotation::Reference objects
  52
  53 In addition, the number of samples found in the alignment is retained in a
  54 Bio::Annotation::TagTree object in the annotation collection and is accessible
  55 via:
  56
  57   ($samples) = $aln->annotation->get_Annotations('Samples');
  58   say $samples->display_text;
  59   # or use other relevant TagTree methods to retrieve data
  60
  61 =head1 FEEDBACK
  62
  63 =head2 Support
  64
  65 Please direct usage questions or support issues to the mailing list:
  66
  67 I<bioperl-l@bioperl.org>
  68
  69 rather than to the module maintainer directly. Many experienced and
  70 responsive experts will be able to look at the problem and quickly
  71 address it. Please include a thorough description of the problem
  72 with code and data examples if at all possible.
  73
  74 =head2 Reporting Bugs
  75
  76 Report bugs to the Bioperl bug tracking system to help us keep track
  77 of the bugs and their resolution.  Bug reports can be submitted via the
  78 web:
  79
  80   https://github.com/bioperl/bioperl-live/issues
  81
  82 =head1 AUTHORS
  83
  84 Chris Fields (cjfields)
  85
  86 =head1 APPENDIX
  87
  88 The rest of the documentation details each of the object
  89 methods. Internal methods are usually preceded with a _
  90
  91 =cut
  92
  93 # Let the code begin...
  94
  95 package Bio::AlignIO::arp;
  96 use strict;
  97 use base qw(Bio::AlignIO);
  98
  99 use Data::Dumper;
 100 use Bio::Annotation::AnnotationFactory;
 101
 102 =head2 next_aln
 103
 104  Title   : next_aln
 105  Usage   : $aln = $stream->next_aln
 106  Function: returns the next alignment in the stream.
 107  Returns : Bio::Align::AlignI object, or nothing (undef in scalar
 108             context) at end of file or when no sequences were read
 109  Args    : NONE (any arguments passed to next_aln are ignored by the
 110            parser)
 111
 112 See L<Bio::Align::AlignI>
 113
 114 =cut
 115
 116 sub next_aln {
 117     my $self = shift;
 118     my $aln = Bio::SimpleAlign->new(-source => 'arp');
 119     my ($data, $cur_block, $cur_type, $cur_data);
 120     SCAN:
 121     while (defined ($data = $self->_readline) ) {
 122         next if $data =~ m{^\s*$}xms;
 123         if ($data =~ m{\[{1,2}(\w+)\]{1,2}}xms) {
 124             $self->{state}->{current_block} = $1;
 125             next SCAN;
 126         }
 127         elsif ($data =~ m{^\s*(\w+)=\s?(\S[^\n]*$)}xms) {
 128             ($cur_type, $cur_data) = ($1, $2);
 129             if ($cur_data =~ m{^\s*\{\s*$}) {
 130                 $self->throw("Curly block must be embedded in a named Block")
 131                     if !exists($self->{state}->{current_block});
 132                 $self->{state}->{in_curly_block} = 1;
 133                 next SCAN;
 134             }
 135             $cur_data =~ s{[\"\']}{}g;
 136             $cur_data =~ s{\s*$}{};
 137             # per alignment annotation data (i.e. Sample Blocks) or
 138             # annotation data retained for each alignment?
 139             $self->{state}->{current_block} eq 'Samples' ?
 140                 push @{$self->{state}->{SampleAnnotation}->{$cur_type}}, $cur_data :
 141                 push @{$self->{state}->{Annotation}->{$cur_type}}, $cur_data;
 142         }
 143         elsif ($data =~ m{^\s*\}\s*$}xms) {
 144             $self->throw("Unmatched bracket in ARP file:\n$data") if
 145                 !exists($self->{state}->{in_curly_block});
 146             if ($self->{state}->{current_block} eq 'Samples') {;
 147                 my $ac = $self->_process_annotation($aln);
 148                 delete $self->{state}->{SampleAnnotation};
 149             } else {
 150                 # process other data at a later point
 151             }
 152             delete $self->{state}->{blockdata};
 153             $self->{state}->{in_curly_block} = 0;
 154             last SCAN;
 155         }
 156         else {
 157             # all other data should be in a curly block and have a block title
 158             $self->throw("Data found outside of proper block:\n$data") if
 159                 !exists($self->{state}->{current_block}) && !$self->{state}->{in_curly_block};
 160             # bypass commented stuff (but we may want to process it at a later
 161             # point, so turn back here)
 162             next if $data =~ m{^\s*\#}xms;
 163             if ($self->{state}->{current_block} eq 'Samples') {
 164                 chomp $data;
 165                 # we have two possible ways to deal with sample number, either
 166                 # clone the LocatableSeq (in which case we need to deal with ID
 167                 # duplication), or store as annotation data. I chose the latter
 168                 # route using a Bio::Annotation::TagTree. YMMV - cjfields 10-15-08
 169                 my ($ls, $samples) = $self->_process_sequence($data);
 170                 my $id = $ls->id;
 171                 push @{ $self->{state}->{SampleAnnotation}->{Samples} }, [$id => $samples];
 172                 $aln->add_seq($ls);
 173             } else {
 174                 # add elsif's for further processing
 175                 #$self->debug('Unmatched data in block '.
 176                 #             $self->{state}->{current_block}.
 177                 #             ":\n$data\n");
 178                 $self->{state}->{blockdata} .= $data;
 179             }
 180         }
 181     }
 182     # alignments only returned if they contain sequences
 183     return $aln if $aln->num_sequences;
 184     return;
 185 }
 186
 187 =head2 write_aln
 188
 189  Title   : write_aln
 190  Usage   : $stream->write_aln(@aln)
 191  Function: not implemented for the ARP format; calls throw_not_implemented
 192  Returns : nothing useful - throw_not_implemented is expected to throw
 193  Args    : L<Bio::Align::AlignI> object
 194
 195 See L<Bio::Align::AlignI>
 196
 197 =cut
 198
 199 sub write_aln {
 200     my ($self,@aln) = @_;
 201     $self->throw_not_implemented;
 202 }
 203
 204 ################ PRIVATE SUBS ################
 205
 206 sub _process_sequence {
 207     my ($self, $raw) = @_;
 208     return unless defined $raw;
 209     $raw =~ s{(?:^\s+|\s+$)}{}g;
 210     my ($id, $samples, $seq) = split(' ', $raw);
 211     my $ls = Bio::LocatableSeq->new('-seq'        => $seq,
 212                                     '-start'      => 1,
 213                                     '-display_id' => $id,
 214                                     '-alphabet'   => $self->alphabet);
 215     return($ls, $samples);
 216 }
 217
 218 sub _process_annotation {
 219     my ($self, $aln) = @_;
 220     my $coll = Bio::Annotation::Collection->new();
 221     my $factory = Bio::Annotation::AnnotationFactory->new(-type => 'Bio::Annotation::SimpleValue');
 222     for my $anntype (qw(SampleAnnotation Annotation)) {
 223         for my $key (keys %{ $self->{state}->{$anntype} }) {
 224             if ($key eq 'Title') {
 225                 $aln->description($self->{state}->{$anntype}->{$key}[0]);
 226             } elsif ($key eq 'Samples') {
 227                 $factory->type('Bio::Annotation::TagTree');
 228                 $coll->add_Annotation($key, $factory->create_object(
 229                     -value => [$key => $self->{state}->{$anntype}->{$key}]));
 230                 $factory->type('Bio::Annotation::SimpleValue');
 231             } elsif ($key eq 'SampleName') {
 232                 $aln->id($self->{state}->{$anntype}->{$key}[0]);
 233             } else {
 234                 $self->throw('Expecting an array reference') unless
 235                     ref $self->{state}->{$anntype}->{$key} eq 'ARRAY';
 236                 for my $a (@{ $self->{state}->{$anntype}->{$key} }) {
 237                     $coll->add_Annotation($key, $factory->create_object(
 238                         -value => $a) );
 239                 }
 240             }
 241         }
 242     }
 243     #$self->debug("Collection:".Dumper($coll)."\n");
 244     $aln->annotation($coll);
 245 }
 246
 247 1;