scripts/Bio-DB-GFF/bp_process_wormbase.pl

   1 #!/usr/bin/perl
   2
   3 use constant ACEDB => 'sace://aceserver.cshl.org:2005';
   4 use strict;
   5 use warnings;
   6 use Ace;
   7
   8 my @framework = qw(mex-3 spe-15 lin-17 unc-11 dhc-1 unc-40 smg-5
   9                    unc-13 unc-29 eat-16 lin-11 spe-9 par-6 unc-59 unc-54 mab-9 lin-42
  10                    sri-71 smu-2 vab-1 bli-2 dpy-10 him-14 mig-5 unc-4 bli-1 sqt-1 rol-1
  11                    his-14 unc-52 unc-45 par-2 let-805 sel-8 mab-21 daf-4 sma-3 lin-39
  12                    unc-32 tax-4 ced-9 tra-1 nob-1 daf-1 ced-2 lin-1 unc-17 dpy-13 unc-5
  13                    smg-7 dif-1 lin-49 elt-1 daf-14 dpy-20 dpy-26 unc-30 tra-3 sup-24
  14                    rho-1 egl-8 unc-60 srh-36 apx-1 unc-62 let-418 dpy-11 let-413 sel-9
  15                    unc-42 egl-9 sma-1 sqt-3 odr-3 hda-1 unc-76 gcy-20 skr-5 par-4 unc-51
  16                    egl-17 lim-6 fox-1 fax-1 lon-2 unc-97 unc-6 unc-18 mec-10 sop-1 mab-18
  17                    sdc-2 odr-7 unc-9 unc-3 gas-1 ace-1);
  18 my %framework = map {$_=>1} @framework;
  19 my %framework_seen = ();
  20
  21 my $USAGE = <<USAGE;
  22 This script massages the Wormbase GFF files located at
  23 ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
  24 GFF format suitable for display by the generic genome browser.  It
  25 mainly adds comments to the annotations and designates certain
  26 well-spaced genetic loci as framework landmarks.
  27
  28 This script requires the AcePerl distribution, which is available on
  29 CPAN (look for the "Ace" module).
  30
  31 To use this script, get the WormBase GFF files from the FTP site
  32 listed above and place them in a directory.  It might be a good idea
  33 to name the directory after the current release, such as WS61.  You do
  34 not need to uncompress the files.
  35
  36 Then give that directory as the argument to this script and capture
  37 the script's output to a file:
  38
  39   % process_wormbase.pl ./WS61 > wormbase.gff
  40
  41 It may take a while before you see output from this script, since it
  42 must first fetch gene and protein database from the remote AceDB
  43 running at www.wormbase.org.
  44 The wormbase.gff file can then be loaded into a Bio::DB::GFF database
  45 using the following command:
  46
  47   % bulk_load_gff.pl -d <databasename> wormbase.gff
  48 USAGE
  49 ;
  50 #'
  51
  52 die $USAGE if $ARGV[0]=~/^-?-h/i;
  53
  54 my $db = Ace->connect(-url=>ACEDB,
  55                       -query_timeout=>500) or die "Can't open ace database:",Ace->error;
  56
  57 if (-d $ARGV[0]) {
  58   @ARGV = <$ARGV[0]/*.gff.gz>;
  59 }
  60
  61 @ARGV || die $USAGE;
  62
  63 foreach (@ARGV) { # GFF FILES
  64   $_ = "gunzip -c $_ |" if /\.gz$/;
  65 }
  66
  67 my (%NOTES,%LOCUS,%GENBANK,%CONFIRMED,%ORFEOME);
  68 get_confirmed($db,\%CONFIRMED);
  69 get_genbank($db,\%GENBANK);
  70 get_loci($db,\%LOCUS);
  71 get_notes($db,\%NOTES);
  72 get_orfeome($db,\%ORFEOME);
  73
  74 while (<>) {
  75   chomp;
  76   next if /^\#/;
  77   my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split /\t/;
  78   next if $source eq 'assembly_tag';  # don't want 'em, don't need 'em
  79   $ref    =~ s/^CHROMOSOME_//;
  80   $group  =~ s/CHROMOSOME_//;
  81
  82   $source ='' if $source eq '*UNKNOWN*';
  83
  84   if ($method eq 'Sequence' && ($source eq 'curated' || $source eq 'RNA') && $group =~ /Sequence "(\w+\.\d+[a-z]?)"/) {
  85     my @notes;
  86     push @notes,map { qq(Note "$_")        } @{$NOTES{$1}}     if $NOTES{$1};
  87     push @notes,map { qq(Note "$_")        } @{$LOCUS{$1}}     if $LOCUS{$1};
  88     push @notes,qq(Confirmed_by "$CONFIRMED{$1}")              if $CONFIRMED{$1};
  89     $group = join ' ; ',$group,@notes;
  90     if (my $loci = $LOCUS{$1}) {
  91       foreach (@$loci) {
  92         print join("\t",$ref,$source,'gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n";
  93         print join("\t",$ref,'framework','gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n"
  94           if $framework{$_} && !$framework_seen{$_}++;
  95       }
  96     }
  97   }
  98
  99   if ($method eq 'Sequence' && $source eq 'Genomic_canonical' && $group =~ /Sequence "(\w+)"/) {
 100     if (my $accession = $GENBANK{$1}) {
 101       $group .= qq( ; Note "Genbank $accession");
 102       print join("\t",$ref,'Genbank',$method,$start,$stop,$score,$strand,$phase,"Genbank \"$accession\""),"\n";
 103     }
 104   }
 105
 106   if ($method eq 'reagent' && $source eq 'Orfeome_project' && $group =~ /PCR_product "([^\"]+)"/) {
 107     my $amp = $ORFEOME{$1};
 108     $group .= qq( ; Amplified $amp) if defined $amp;
 109   }
 110
 111   # fix variant fields: Variant "T" => Note "T"
 112   $group =~ s/(?:Variant|Insert) "(\w+)"/Note "$1"/;
 113
 114   # fix UTR fields
 115   if ($group =~ /UTR "([35])_UTR:(\S+)"/) {
 116     $method = 'UTR';
 117     $source = "$1_UTR";
 118     $group = qq(Sequence "$2");
 119   }
 120
 121   print join("\t",$ref,$source,$method,$start,$stop,$score,$strand,$phase,$group),"\n";
 122 }
 123
 124 sub get_loci {
 125   my ($db,$hash) = @_;  # hash keys are predicted gene names, values are one or more loci names
 126   my @genes = $db->fetch(-query=>'find Locus Genomic_sequence',-filltag=>'Genomic_sequence');
 127   foreach my $obj (@genes) {
 128     my @genomic = $obj->Genomic_sequence or next;
 129     foreach (@genomic) {
 130       push @{$hash->{$_}},$obj;
 131     }
 132   }
 133 }
 134
 135 sub get_notes {
 136   my ($db,$hash) = @_;  # hash keys are predicted gene names, values are one or more brief identifications
 137   my @genes = $db->fetch(-query=>'find Sequence Brief_identification',-filltag=>'Brief_identification');
 138   foreach my $obj (@genes) {
 139     my @notes = $obj->Brief_identification or next;
 140     $hash->{$obj} = \@notes;
 141   }
 142 }
 143
 144 sub get_genbank {
 145   my ($db,$hash) = @_;   # hash keys are cosmid names, values are genbank accessions (1 to 1)
 146   my @cosmids = $db->fetch(-query=>'find Genome_Sequence Database',-filltag=>'Database');
 147   for my $cosmid (@cosmids) {
 148     my ($database,undef,$accession) = $cosmid->Database(1)->row;
 149     next unless $accession;
 150     $hash->{$cosmid} = $accession;
 151   }
 152 }
 153
 154 sub get_confirmed {
 155   my ($db,$hash) = @_;  # hash keys are predicted gene names, values are confirmation type
 156   my @confirmed = $db->fetch(-query=>'find Sequence Confirmed_by',-filltag=>'Confirmed_by');
 157   foreach my $obj (@confirmed) {
 158     my $confirmed_by = $obj->Confirmed_by || 'Unknown';
 159     $hash->{$obj} = $confirmed_by;
 160   }
 161 }
 162
 163 sub get_orfeome {
 164   my ($db,$hash) = @_;
 165   my @mv_primers = $db->fetch(-query=>'find PCR_Product mv*',-filltag=>'Amplified');
 166   for my $obj (@mv_primers) {
 167     my $amplified = $obj->Amplified;
 168     $hash->{$obj} = $amplified;
 169   }
 170 }
 171
 172 __END__
 173
 174 =head1 NAME
 175
 176 bp_process_wormbase.pl - Massage WormBase GFF files into a version suitable for the Generic Genome Browser
 177
 178 =head1 SYNOPSIS
 179
 180   % bp_process_wormbase.pl ./WS61 > wormbase.gff
 181
 182 =head1 DESCRIPTION
 183
 184 This script massages the Wormbase GFF files located at
 185 ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
 186 GFF format suitable for display by the generic genome browser.  It
 187 mainly adds comments to the annotations and designates certain
 188 well-spaced genetic loci as framework landmarks.
 189
 190 This script requires the AcePerl distribution, which is available on
 191 CPAN (look for the "Ace" module).
 192
 193 To use this script, get the WormBase GFF files from the FTP site
 194 listed above and place them in a directory.  It might be a good idea
 195 to name the directory after the current release, such as WS61.  You do
 196 not need to uncompress the files.
 197
 198 Then give that directory as the argument to this script and capture
 199 the script's output to a file:
 200
 201   % bp_process_wormbase.pl ./WS61 > wormbase.gff
 202
 203 It may take a while before you see output from this script, since it
 204 must first fetch gene and protein database from the remote AceDB
 205 running at www.wormbase.org.
 206 The wormbase.gff file can then be loaded into a Bio::DB::GFF database
 207 using the following command:
 208
 209   % bulk_load_gff.pl -d <databasename> wormbase.gff
 210
 211 =head1 SEE ALSO
 212
 213 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
 214
 215 =head1 AUTHOR
 216
 217 Lincoln Stein E<lt>lstein@cshl.orgE<gt>
 218
 219 Copyright (c) 2002 Cold Spring Harbor Laboratory
 220
 221 This library is free software; you can redistribute it and/or modify
 222 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 223 disclaimers of warranty.
 224
 225 =cut
 226