Bio/ClusterIO/unigene.pm

   1 # BioPerl module for Bio::ClusterIO::unigene
   2 #
   3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   4 #
   5 # Cared for by Andrew Macgregor <andrew at cbbc.murdoch.edu.au>
   6 #
   7 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
   8 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
   9 # http://meg.otago.ac.nz
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12 #
  13 # _history
  14 # April 17, 2002 - Initial implementation by Andrew Macgregor
  15
  16 # POD documentation - main docs before the code
  17
  18 =head1 NAME
  19
  20 Bio::ClusterIO::unigene - UniGene input stream
  21
  22 =head1 SYNOPSIS
  23
  24 Do not use this module directly.  Use it via the Bio::ClusterIO class.
  25
  26 =head1 DESCRIPTION
  27
  28 This object reads from Unigene *.data files downloaded from
  29 ftp://ftp.ncbi.nih.gov/repository/UniGene/.  It does not download and
  30 decompress the file, you have to do that yourself.
  31
  32 =head1 FEEDBACK
  33
  34 =head2 Mailing Lists
  35
  36 User feedback is an integral part of the evolution of this and other
  37 Bioperl modules. Send your comments and suggestions preferably to one
  38 of the Bioperl mailing lists.  Your participation is much appreciated.
  39
  40   bioperl-l@bioperl.org                  - General discussion
  41   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  42
  43 =head2 Support
  44
  45 Please direct usage questions or support issues to the mailing list:
  46
  47 I<bioperl-l@bioperl.org>
  48
  49 rather than to the module maintainer directly. Many experienced and
  50 reponsive experts will be able look at the problem and quickly
  51 address it. Please include a thorough description of the problem
  52 with code and data examples if at all possible.
  53
  54 =head2 Reporting Bugs
  55
  56 Report bugs to the Bioperl bug tracking system to help us keep track
  57 the bugs and their resolution.  Bug reports can be submitted via the
  58 web:
  59
  60   https://redmine.open-bio.org/projects/bioperl/
  61
  62 =head1 AUTHORS - Andrew Macgregor
  63
  64 Email: andrew at cbbc.murdoch.edu.au
  65
  66
  67 =head1 APPENDIX
  68
  69 The rest of the documentation details each of the object
  70 methods. Internal methods are usually preceded with a _
  71
  72 =cut
  73
  74 #'
  75 # Let the code begin...
  76
  77 package Bio::ClusterIO::unigene;
  78 use strict;
  79
  80 use Bio::Cluster::UniGene;
  81 use Bio::Cluster::ClusterFactory;
  82
  83 use base qw(Bio::ClusterIO);
  84
  85 my %line_is = (
  86                 ID                      =>      q/ID\s+(\w{2,3}\.\d+)/,
  87                 TITLE                   =>      q/TITLE\s+(\S.*)/,
  88                 GENE                    =>      q/GENE\s+(\S.*)/,
  89                 CYTOBAND                =>      q/CYTOBAND\s+(\S.*)/,
  90                 MGI                     =>      q/MGI\s+(\S.*)/,
  91                 LOCUSLINK               =>      q/LOCUSLINK\s+(\S.*)/,
  92                 HOMOL           =>      q/HOMOL\s+(\S.*)/,
  93                 EXPRESS                 =>      q/EXPRESS\s+(\S.*)/,
  94                 RESTR_EXPR              =>      q/RESTR_EXPR\s+(\S.*)/,
  95                 GNM_TERMINUS            =>      q/GNM_TERMINUS\s+(\S.*)/,
  96                 CHROMOSOME              =>      q/CHROMOSOME\s+(\S.*)/,
  97                 STS                     =>      q/STS\s+(\S.*)/,
  98                 TXMAP                   =>      q/TXMAP\s+(\S.*)/,
  99                 PROTSIM                 =>      q/PROTSIM\s+(\S.*)/,
 100                 SCOUNT                  =>      q/SCOUNT\s+(\S.*)/,
 101                 SEQUENCE                =>      q/SEQUENCE\s+(\S.*)/,
 102                 ACC                     =>      q/ACC=(\w+)(\.\d+)?/,
 103                 NID                     =>      q/NID=\s*(\S.*)/,
 104                 PID                     =>      q/PID=\s*(\S.*)/,
 105                 CLONE                   =>      q/CLONE=\s*(\S.*)/,
 106                 END                     =>      q/END=\s*(\S.*)/,
 107                 LID                     =>      q/LID=\s*(\S.*)/,
 108                 MGC                     =>      q/MGC=\s*(\S.*)/,
 109                 SEQTYPE         =>      q/SEQTYPE=\s*(\S.*)/,
 110                 TRACE                   =>      q/TRACE=\s*(\S.*)/,
 111                 PERIPHERAL              =>      q/PERIPHERAL=\s*(\S.*)/,
 112                 DELIMITER               =>      q{^//},
 113 );
 114
 115 # we set the right factory here
 116 sub _initialize {
 117         my($self, @args) = @_;
 118
 119         $self->SUPER::_initialize(@args);
 120         if(! $self->cluster_factory()) {
 121         $self->cluster_factory(Bio::Cluster::ClusterFactory->new(
 122                                                 -type => 'Bio::Cluster::UniGene'));
 123         }
 124 }
 125
 126 =head2 next_cluster
 127
 128  Title   : next_cluster
 129  Usage   : $unigene = $stream->next_cluster()
 130  Function: returns the next unigene in the stream
 131  Returns : Bio::Cluster::UniGene object
 132  Args    : NONE
 133
 134 =cut
 135
 136 sub next_cluster {
 137         my( $self) = @_;
 138         local $/ = "\n//";
 139         return unless my $entry = $self->_readline;
 140
 141 # set up the variables we'll need
 142         my (%unigene,@express,@locuslink,@chromosome,
 143                 @sts,@txmap,@protsim,@sequence);
 144         my $UGobj;
 145
 146 # set up the regexes
 147
 148 # add whitespace parsing and precompile regexes
 149 #foreach (values %line_is) {
 150 #       $_ =~ s/\s+/\\s+/g;
 151 #       print STDERR "Regex is $_\n";
 152 #       #$_ = qr/$_/x;
 153 #}
 154
 155 #$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/;
 156
 157 # run each line in an entry against the regexes
 158         foreach my $line (split /\n/, $entry) {
 159           #print STDERR "Wanting to match $line\n";
 160                 if ($line =~ /$line_is{ID}/gcx) {
 161                         $unigene{ID} = $1;
 162                 }
 163                 elsif ($line =~ /$line_is{TITLE}/gcx ) {
 164                   #print STDERR "MATCHED with [$1]\n";
 165                         $unigene{TITLE} = $1;
 166                 }
 167                 elsif ($line =~ /$line_is{GENE}/gcx) {
 168                         $unigene{GENE} = $1;
 169                 }
 170                 elsif ($line =~ /$line_is{CYTOBAND}/gcx) {
 171                         $unigene{CYTOBAND} = $1;
 172                 }
 173                 elsif ($line =~ /$line_is{MGI}/gcx) {
 174                         $unigene{MGI} = $1;
 175                 }
 176                 elsif ($line =~ /$line_is{LOCUSLINK}/gcx) {
 177                         @locuslink = split /;/, $1;
 178                 }
 179                 elsif ($line =~ /$line_is{HOMOL}/gcx) {
 180                         $unigene{HOMOL} = $1;
 181                 }
 182                 elsif ($line =~ /$line_is{EXPRESS}/gcx) {
 183                         my $express = $1;
 184                         # remove initial semicolon if present
 185                         $express =~ s/^;//;
 186                         @express = split /\s*;/, $express;
 187                 }
 188                 elsif ($line =~ /$line_is{RESTR_EXPR}/gcx) {
 189                         $unigene{RESTR_EXPR} = $1;
 190                 }
 191                 elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) {
 192                         $unigene{GNM_TERMINUS} = $1;
 193                 }
 194                 elsif ($line =~ /$line_is{CHROMOSOME}/gcx) {
 195                         push @chromosome, $1;
 196                 }
 197                 elsif ($line =~ /$line_is{TXMAP}/gcx) {
 198                         push @txmap, $1;
 199                 }
 200                 elsif ($line =~ /$line_is{STS}/gcx) {
 201                         push @sts, $1;
 202                 }
 203                 elsif ($line =~ /$line_is{PROTSIM}/gcx) {
 204                         push @protsim, $1;
 205                 }
 206                 elsif ($line =~ /$line_is{SCOUNT}/gcx) {
 207                         $unigene{SCOUNT} = $1;
 208                 }
 209                 elsif ($line =~ /$line_is{SEQUENCE}/gcx) {
 210                         # parse into each sequence line
 211                         my $seq = {};
 212                         # add unigene id to each seq
 213                         #$seq->{unigene_id} = $unigene{ID};
 214                         my @items = split(/;/, $1);
 215                         foreach (@items) {
 216                             if (/$line_is{ACC}/gcx) {
 217                                 $seq->{acc} = $1;
 218                                 # remove leading dot if version pattern matched
 219                                 $seq->{version} = substr($2,1) if defined $2;
 220                             }
 221                             elsif (/$line_is{NID}/gcx) {
 222                                 $seq->{nid} = $1;
 223                             }
 224                             elsif (/$line_is{PID}/gcx) {
 225                                 $seq->{pid} = $1;
 226                             }
 227                             elsif (/$line_is{CLONE}/gcx) {
 228                                 $seq->{clone} = $1;
 229                             }
 230                             elsif (/$line_is{END}/gcx) {
 231                                 $seq->{end} = $1;
 232                             }
 233                             elsif (/$line_is{LID}/gcx) {
 234                                 $seq->{lid} = $1;
 235                             }
 236                             elsif (/$line_is{MGC}/gcx) {
 237                                 $seq->{mgc} = $1;
 238                             }
 239                             elsif (/$line_is{SEQTYPE}/gcx) {
 240                                 $seq->{seqtype} = $1;
 241                             }
 242                             elsif (/$line_is{TRACE}/gcx) {
 243                                 $seq->{trace} = $1;
 244                             }
 245                             elsif (/$line_is{PERIPHERAL}/gcx) {
 246                                 $seq->{peripheral} = $1;
 247                             }
 248                         }
 249                         push @sequence, $seq;
 250                 }
 251                 elsif ($line =~ /$line_is{DELIMITER}/gcx) {
 252                         # at the end of the record, add data to the object
 253                         $UGobj = $self->cluster_factory->create_object(
 254                                   -display_id  => $unigene{ID},
 255                                   -description => $unigene{TITLE},
 256                                   -size            => $unigene{SCOUNT},
 257                                   -members         => \@sequence);
 258                         $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE});
 259                         $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND});
 260                         $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI});
 261                         $UGobj->locuslink(\@locuslink);
 262                         $UGobj->homol($unigene{HOMOL}) if defined ($unigene{HOMOL});
 263                         $UGobj->express(\@express);
 264                         $UGobj->restr_expr($unigene{RESTR_EXPR}) if defined ($unigene{RESTR_EXPR});
 265                         $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS});
 266                         $UGobj->chromosome(\@chromosome);
 267                         $UGobj->sts(\@sts);
 268                         $UGobj->txmap(\@txmap);
 269                         $UGobj->protsim(\@protsim);
 270                 }
 271         }
 272         return $UGobj;
 273 }
 274
 275 1;
 276