bin/bp_process_gadfly

   1 #!/usr/bin/perl
   2 if ($ARGV[0]=~/^-?-h/ || @ARGV < 1) {
   3 die <<'USAGE';
   4
   5 This script massages the RELEASE 3 Flybase/Gadfly GFF files located at
   6 http://www.fruitfly.org/sequence/release3download.shtml into the
   7 "correct" version of the GFF format.
   8
   9 To use this script, download the whole genome FASTA file and save it
  10 to disk.  (The downloaded file will be called something like
  11 "na_whole-genome_genomic_dmel_RELEASE3.FASTA", but the link on the
  12 HTML page doesn't give the filename.)  Do the same for the whole
  13 genome GFF annotation file (the saved file will be called something
  14 like "whole-genome_annotation-feature-region_dmel_RELEASE3.GFF".)  If
  15 you wish you can download the ZIP compressed versions of these files.
  16
  17 Next run this script on the two files, indicating the name of the
  18 downloaded FASTA file first, followed by the gff file:
  19
  20  % process_gadfly.pl na_whole-genome_genomic_dmel_RELEASE3.FASTA whole-genome_annotation-feature-region_dmel_RELEASE3.GFF > fly.gff
  21
  22 The gadfly.gff file and the fasta file can now be loaded into a Bio::DB::GFF database
  23 using the following command:
  24
  25   % bulk_load_gff.pl -d fly -fasta na_whole-genome_genomic_dmel_RELEASE3.FASTA fly.gff
  26
  27 (Where "fly" is the name of the database.  Change it as appropriate.
  28 The database must already exist and be writable by you!)
  29
  30 The resulting database will have the following feature types
  31 (represented as "method:source"):
  32
  33   Component:arm              A chromosome arm
  34   Component:scaffold         A chromosome scaffold (accession #)
  35   Component:gap              A gap in the assembly
  36   clone:clonelocator         A BAC clone
  37   gene:gadfly                A gene accession number
  38   transcript:gadfly          A transcript accession number
  39   translation:gadfly         A translation
  40   codon:gadfly               Significance unknown
  41   exon:gadfly                An exon
  42   symbol:gadfly              A classical gene symbol
  43   similarity:blastn          A BLASTN hit
  44   similarity:blastx          A BLASTX hit
  45   similarity:sim4            EST->genome using SIM4
  46   similarity:groupest        EST->genome using GROUPEST
  47   similarity:repeatmasker    A repeat
  48
  49 IMPORTANT NOTE: This script will *only* work with the RELEASE3 gadfly
  50 files and will not work with earlier releases.
  51
  52 USAGE
  53 ;
  54 }
  55
  56 use strict;
  57 use warnings;
  58
  59 foreach (@ARGV) {
  60   $_ = "gunzip -c $_ |" if /\.gz$/;
  61 }
  62
  63 if ($ARGV[0] =~ /fasta/i) {
  64   process_fasta();
  65 } else {
  66   die "call as process_gadfly.pl \"release3_dna.FASTA\" \"release3_features.GFF\"";
  67 }
  68
  69 while (<>) {
  70   next if /^\#/;
  71   chomp;
  72   my ($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) = split "\t";
  73   next if $start > $stop;  # something wrong. Don't bother fixing it.
  74
  75   my $fixed_group = fix_group($csource,$cmethod,$cgroup);
  76   print join("\t",$ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$fixed_group),"\n";
  77   dump_symbol($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) if $cgroup =~ /symbol/i;
  78 }
  79
  80 sub fix_group {
  81   my ($source,$method,$group) = @_;
  82   my (@group,$gene);
  83   push @group,"Transcript $1" if $group =~ /transgrp=([^; ]+)/;
  84   push @group,"Gene $1"       if $method eq 'gene' && $group =~ /genegrp=([^; ]+)/;
  85
  86   $gene ||= qq(Note "FlyBase $1")  if $group =~ /dbxref=FlyBase:(\w+)/;
  87   $gene ||= qq(Note "GadFly $1")   if $group =~ /genegrp=([^; ]+)/;
  88   push @group,qq(Note "Symbol $1") if $group =~ /symbol=([^; ]+)/ && "Gene $1" ne $group[0];
  89   push @group,$gene;
  90   return join ' ; ',@group;
  91 }
  92
  93 # called when we encounter a gene symbol
  94 sub dump_symbol {
  95   my ($ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,$cgroup) = @_;
  96   my ($symbol) = $cgroup=~/symbol=([^;]+)/;
  97   my ($gene)   = $cgroup=~/genegrp=([^;]+)/;
  98   return if $symbol eq $gene;
  99   $cmethod = 'symbol';
 100   print join("\t",$ref,$csource,$cmethod,$start,$stop,$cscore,$strand,$cphase,qq(Symbol "$symbol")),"\n";
 101 }
 102
 103 sub process_fasta {
 104   my $file = shift @ARGV;
 105   open my $F, '<', $file or die "Could not read file '$file': $!\n";
 106   print STDERR "Reading big FASTA file, please be patient...\n";
 107   my ($current_id,%lengths);
 108   while (my $line = <$F>) {
 109     if ($line =~ /^>(\S+)/) {
 110       $current_id = $1;
 111       next;
 112     }
 113     die "this doesn't look like a fasta file to me" unless $current_id;
 114     chomp $line;
 115     $lengths{$current_id} += length $line;
 116   }
 117   close $F;
 118
 119   foreach my $id (sort keys %lengths) {
 120     print join("\t", $id, 'arm', 'Component', 1, $lengths{$id},
 121                      '.', '+', '.', qq(Sequence "$id")
 122                ), "\n";
 123   }
 124 }
 125
 126 __END__
 127
 128 =head1 NAME
 129
 130 bp_process_gadfly.pl - Massage Gadfly/FlyBase GFF files into a version suitable for the Generic Genome Browser
 131
 132 =head1 SYNOPSIS
 133
 134   % bp_process_gadfly.pl ./RELEASE2 > gadfly.gff
 135
 136 =head1 DESCRIPTION
 137
 138 This script massages the RELEASE 3 Flybase/Gadfly GFF files located at
 139 http://www.fruitfly.org/sequence/release3download.shtml into the "correct"
 140 version of the GFF format.
 141
 142 To use this script, download the whole genome FASTA file and save it
 143 to disk.  (The downloaded file will be called something like
 144 "na_whole-genome_genomic_dmel_RELEASE3.FASTA", but the link on the
 145 HTML page doesn't give the filename.)  Do the same for the whole
 146 genome GFF annotation file (the saved file will be called something
 147 like "whole-genome_annotation-feature-region_dmel_RELEASE3.GFF".)  If
 148 you wish you can download the ZIP compressed versions of these files.
 149
 150 Next run this script on the two files, indicating the name of the
 151 downloaded FASTA file first, followed by the gff file:
 152
 153  % bp_process_gadfly.pl na_whole-genome_genomic_dmel_RELEASE3.FASTA whole-genome_annotation-feature-region_dmel_RELEASE3.GFF > fly.gff
 154
 155 The gadfly.gff file and the fasta file can now be loaded into a Bio::DB::GFF database
 156 using the following command:
 157
 158   % bulk_load_gff.pl -d fly -fasta na_whole-genome_genomic_dmel_RELEASE3.FASTA fly.gff
 159
 160 (Where "fly" is the name of the database.  Change it as appropriate.
 161 The database must already exist and be writable by you!)
 162
 163 The resulting database will have the following feature types
 164 (represented as "method:source"):
 165
 166   Component:arm              A chromosome arm
 167   Component:scaffold         A chromosome scaffold (accession #)
 168   Component:gap              A gap in the assembly
 169   clone:clonelocator         A BAC clone
 170   gene:gadfly                A gene accession number
 171   transcript:gadfly          A transcript accession number
 172   translation:gadfly         A translation
 173   codon:gadfly               Significance unknown
 174   exon:gadfly                An exon
 175   symbol:gadfly              A classical gene symbol
 176   similarity:blastn          A BLASTN hit
 177   similarity:blastx          A BLASTX hit
 178   similarity:sim4            EST->genome using SIM4
 179   similarity:groupest        EST->genome using GROUPEST
 180   similarity:repeatmasker    A repeat
 181
 182 IMPORTANT NOTE: This script will *only* work with the RELEASE3 gadfly
 183 files and will not work with earlier releases.
 184
 185 =head1 SEE ALSO
 186
 187 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
 188
 189 =head1 AUTHOR
 190
 191 Lincoln Stein, lstein@cshl.org
 192
 193 Copyright (c) 2002 Cold Spring Harbor Laboratory
 194
 195 This library is free software; you can redistribute it and/or modify
 196 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 197 disclaimers of warranty.
 198
 199 =cut