Update Roy's email
[bioperl-live.git] / scripts / Bio-DB-GFF / process_sgd.PLS
bloba529cba223fcec404cd176c93452473241013e9c
1 #!/usr/bin/perl -w
3 # $Id$
4 # This script will convert from SGD format to GFF format
5 # See http://db.yeastgenome.org/schema/Schema.html
7 use strict;
# Chromosome names, in the order used to turn the 1-based chromosome number
# found in the SGD file into a name ($ROMAN[$number - 1]).
my @ROMAN = qw(
    I    II   III  IV   V    VI   VII  VIII IX   X
    XI   XII  XIII XIV  XV   XVI  Mit
);

# Hard-coded length data that I couldn't get directly, keyed by chromosome
# name.  The values are listed in the same order as @ROMAN.
my %CHROMOSOMES;
@CHROMOSOMES{@ROMAN} = (
    230_203,      # I
    813_139,      # II
    316_613,      # III
    1_531_929,    # IV
    576_869,      # V
    270_148,      # VI
    1_090_937,    # VII
    562_639,      # VIII
    439_885,      # IX
    745_444,      # X
    666_445,      # XI
    1_078_173,    # XII
    924_430,      # XIII
    784_328,      # XIV
    1_091_284,    # XV
    948_061,      # XVI
    85_779,       # Mit
);
# Print the usage message and stop when the first command-line argument
# asks for help (anything starting with -h or --h, e.g. -h, -help, --help).
# The @ARGV test keeps the pattern match from being applied to an undefined
# value (a warning under -w) when the script is started without arguments
# and the features table is supplied on STDIN.
if (@ARGV && $ARGV[0] =~ /^--?h/) {
    die <<USAGE;
Usage: $0 <SGD features file>

This script massages the SGD sequence annotation flat files located at
ftp://genome-ftp.stanford.edu/pub/yeast/data_dump/feature/chromosomal_features.tab
into a version of the GFF format suitable for display by the generic
genome browser.

To use this script, get the SGD chromosomal_features.tab file from the
FTP site listed above, and run the following command:

% process_sgd.pl chromosomal_features.tab > yeast.gff

The yeast.gff file can then be loaded into a Bio::DB::GFF database
using the following command:

% bulk_load_gff.pl -d <databasename> yeast.gff

USAGE
}
# Start the output with one reference line per chromosome, spanning
# position 1 to the chromosome's length.  The lengths come from the
# hard-coded %CHROMOSOMES table because they are not available in the
# features table.
foreach my $name (sort keys %CHROMOSOMES) {
    my @columns = (
        $name, 'chromosome', 'Component',
        1, $CHROMOSOMES{$name},
        '.', '.', '.',
        qq(Sequence "$name"),
    );
    print join("\t", @columns), "\n";
}
# Convert every SGD feature record read from the input into GFF lines.
# This is hard because the SGD idea of a feature doesn't really map onto the
# GFF idea: a record that carries a gene name produces a "gene" line followed
# by a line of the record's own feature type; any other record produces only
# the latter.
while (my $record = <>) {
    chomp $record;

    # Only columns 1-8 and 11 are used (the two SGD ids and the date are
    # ignored).  The LIMIT of -1 stops split from discarding trailing empty
    # columns, so an empty description stays an empty string.
    my ($id, $gene, $alias_field, $type, $chromosome,
        $start, $stop, $strand, $description)
        = (split /\t/, $record, -1)[0 .. 7, 10];

    # Skip blank, truncated or otherwise unusable records instead of emitting
    # a bogus line.  A defined strand column implies every earlier column is
    # defined too.  Without the range check a chromosome number of 0 would
    # index $ROMAN[-1] and silently be reported as "Mit".
    unless (defined $strand
            && $chromosome =~ /^\d+$/
            && $chromosome >= 1 && $chromosome <= @ROMAN
            && $start =~ /^\d+$/ && $stop =~ /^\d+$/) {
        warn "Skipping malformed record at input line $.\n" if $record =~ /\S/;
        next;
    }

    my $ref = $ROMAN[$chromosome - 1];

    # Records with fewer than 11 columns have no description at all.
    $description = '' unless defined $description;

    # Escape the characters that delimit values in the GFF group column.
    $description =~ s/"/\\"/g;
    $description =~ s/;/\\;/g;

    # 'W' marks the forward strand; anything else is treated as reverse.
    # Reverse-strand records list their coordinates high-to-low, so swap
    # them to get start <= stop.
    $strand = $strand eq 'W' ? '+' : '-';
    ($start, $stop) = ($stop, $start) if $strand eq '-';
    die "Strand logic is messed up" if $stop < $start;

    if ($gene ne '') {
        my @aliases    = split /\|/, $alias_field;
        my $alias_attr = join ' ; ', map { qq(Alias "$_") } @aliases;
        my $group      = qq(Gene "$gene" ; Note "$description");
        $group .= " ; $alias_attr" if @aliases;
        print join("\t", $ref, 'sgd', 'gene', $start, $stop, '.', $strand, '.', $group), "\n";

        # The feature line printed below also lists the aliases in its note.
        $description .= "\\; AKA @aliases" if @aliases;
    }

    print join("\t", $ref, 'sgd', $type, $start, $stop, '.', $strand, '.',
               qq($type "$id" ; Note "$description")), "\n";
}
83 __END__
=head1 NAME

process_sgd.pl - Massage SGD annotation flat files into a version suitable for the Generic Genome Browser

=head1 SYNOPSIS

  % process_sgd.pl chromosomal_features.tab > yeast.gff

=head1 DESCRIPTION

This script massages the SGD sequence annotation flat files located at
ftp://genome-ftp.stanford.edu/pub/yeast/data_dump/feature/chromosomal_features.tab
into a version of the GFF format suitable for display by the generic
genome browser.

To use this script, get the SGD chromosomal_features.tab file from the
FTP site listed above, and run the following command:

  % process_sgd.pl chromosomal_features.tab > yeast.gff

The yeast.gff file can then be loaded into a Bio::DB::GFF database
using the following command:

  % bulk_load_gff.pl -d <databasename> yeast.gff

=head1 SEE ALSO

L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>

=head1 AUTHOR

Lincoln Stein, lstein@cshl.org

Copyright (c) 2002 Cold Spring Harbor Laboratory

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.  See DISCLAIMER.txt for
disclaimers of warranty.

=cut