scripts/Bio-DB-GFF/bp_process_sgd.pl

   1 #!/usr/bin/perl
   2
   3 # This script will convert from SGD format to GFF format
   4 # See http://db.yeastgenome.org/schema/Schema.html
   5
   6 use strict;
   7 use warnings;
   8
   9 # hard-coded length data that I couldn't get directly
  10 my %CHROMOSOMES = (I => 230_203,
  11                    II => 813_139,
  12                    III => 316_613,
  13                    IV  => 1_531_929,
  14                    V   => 576_869,
  15                    VI => 270_148,
  16                    VII => 1_090_937,
  17                    VIII => 562_639,
  18                    IX => 439_885,
  19                    X => 745_444,
  20                    XI => 666_445,
  21                    XII => 1_078_173,
  22                    XIII => 924_430,
  23                    XIV => 784_328,
  24                    XV  => 1_091_284,
  25                    XVI => 948_061,
  26                    Mit => 85_779);
  27 my @ROMAN = qw(I II III IV V VI VII VIII IX X
  28                XI XII XIII XIV XV XVI Mit);
  29
  30 if ($ARGV[0] =~ /^--?h/) {
  31   die <<USAGE;
  32  Usage: $0 <SGD features file>
  33
  34 This script massages the SGD sequence annotation flat files located at
  35 ftp://genome-ftp.stanford.edu/pub/yeast/data_dump/feature/chromosomal_features.tab
  36 into a version of the GFF format suitable for display by the generic
  37 genome browser.
  38
  39 To use this script, get the SGD chromosomal_features.tab file from the
  40 FTP site listed above, and run the following command:
  41
  42   % process_sgd.pl chromosomal_features.tab > yeast.gff
  43
  44 The yeast.gff file can then be loaded into a Bio::DB::GFF database
  45 using the following command:
  46
  47   % bulk_load_gff.pl -d <databasename> yeast.gff
  48
  49 USAGE
  50 ;
  51 }
  52
  53 # first print out chromosomes
  54 # We hard coded the lengths because they are not available in the features table.
  55 for my $chrom (sort keys %CHROMOSOMES) {
  56   print join("\t",$chrom,'chromosome','Component',1,$CHROMOSOMES{$chrom},'.','.','.',qq(Sequence "$chrom")),"\n";
  57 }
  58
  59 # this is hard because the SGD idea of a feature doesn't really map onto the GFF idea.
  60 while (<>) {
  61   chomp;
  62   my($id,$gene,$aliases,$type,$chromosome,$start,$stop,$strand,$sgdid,$sgdid2,$description,$date) = split "\t";
  63   my $ref = $ROMAN[$chromosome-1];
  64   $description =~ s/"/\\"/g;
  65   $description =~ s/;/\\;/g;
  66
  67   $strand = $strand eq 'W' ? '+' : '-';
  68   ($start,$stop) = ($stop,$start) if $strand eq '-';
  69   die "Strand logic is messed up" if $stop < $start;
  70
  71   if ($gene) {
  72      my @aliases = split(/\|/,$aliases);
  73      my $aliases = join " ; ",map {qq(Alias "$_")} @aliases;
  74      my $group = qq(Gene "$gene" ; Note "$description");
  75      $group .= " ; $aliases" if $aliases;
  76      print join("\t",$ref,'sgd','gene',$start,$stop,'.',$strand,'.',$group),"\n";
  77      $description .= "\\; AKA @aliases" if @aliases;
  78   }
  79
  80   print join("\t",$ref,'sgd',$type,$start,$stop,'.',$strand,'.',qq($type "$id" ; Note "$description")),"\n";
  81 }
  82
  83 __END__
  84
  85 =head1 NAME
  86
  87 bp_process_sgd.pl - Massage SGD annotation flat files into a version suitable for the Generic Genome Browser
  88
  89 =head1 SYNOPSIS
  90
  91   % bp_process_sgd.pl chromosomal_features.tab > yeast.gff
  92
  93 =head1 DESCRIPTION
  94
  95 This script massages the SGD sequence annotation flat files located at
  96 ftp://genome-ftp.stanford.edu/pub/yeast/data_dump/feature/chromosomal_features.tab
  97 into a version of the GFF format suitable for display by the generic
  98 genome browser.
  99
 100 To use this script, get the SGD chromosomal_features.tab file from the
 101 FTP site listed above, and run the following command:
 102
 103   % bp_process_sgd.pl chromosomal_features.tab > yeast.gff
 104
 105 The yeast.gff file can then be loaded into a Bio::DB::GFF database
 106 using the following command:
 107
 108   % bulk_load_gff.pl -d <databasename> yeast.gff
 109
 110 =head1 SEE ALSO
 111
 112 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
 113
 114 =head1 AUTHOR
 115
 116 Lincoln Stein, lstein@cshl.org
 117
 118 Copyright (c) 2002 Cold Spring Harbor Laboratory
 119
 120 This library is free software; you can redistribute it and/or modify
 121 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 122 disclaimers of warranty.
 123
 124 =cut
 125
 126