A test to ensure Bio::PrimarySeqI->trunc() doesn't use clone() for a Bio::Seq::RichSe...
[bioperl-live.git] / scripts / Bio-DB-GFF / bp_process_wormbase.pl
blobf2ac864efc4c70b373b482dc3f06dff85baae386
1 #!/usr/bin/perl
3 use constant ACEDB => 'sace://aceserver.cshl.org:2005';
4 use strict;
5 use warnings;
6 use Ace;
8 my @framework = qw(mex-3 spe-15 lin-17 unc-11 dhc-1 unc-40 smg-5
9 unc-13 unc-29 eat-16 lin-11 spe-9 par-6 unc-59 unc-54 mab-9 lin-42
10 sri-71 smu-2 vab-1 bli-2 dpy-10 him-14 mig-5 unc-4 bli-1 sqt-1 rol-1
11 his-14 unc-52 unc-45 par-2 let-805 sel-8 mab-21 daf-4 sma-3 lin-39
12 unc-32 tax-4 ced-9 tra-1 nob-1 daf-1 ced-2 lin-1 unc-17 dpy-13 unc-5
13 smg-7 dif-1 lin-49 elt-1 daf-14 dpy-20 dpy-26 unc-30 tra-3 sup-24
14 rho-1 egl-8 unc-60 srh-36 apx-1 unc-62 let-418 dpy-11 let-413 sel-9
15 unc-42 egl-9 sma-1 sqt-3 odr-3 hda-1 unc-76 gcy-20 skr-5 par-4 unc-51
16 egl-17 lim-6 fox-1 fax-1 lon-2 unc-97 unc-6 unc-18 mec-10 sop-1 mab-18
17 sdc-2 odr-7 unc-9 unc-3 gas-1 ace-1);
18 my %framework = map {$_=>1} @framework;
19 my %framework_seen = ();
21 my $USAGE = <<USAGE;
22 This script massages the Wormbase GFF files located at
23 ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
24 GFF format suitable for display by the generic genome browser. It
25 mainly adds comments to the annotations and designates certain
26 well-spaced genetic loci as framework landmarks.
28 This script requires the AcePerl distribution, which is available on
29 CPAN (look for the "Ace" module).
31 To use this script, get the WormBase GFF files from the FTP site
32 listed above and place them in a directory. It might be a good idea
33 to name the directory after the current release, such as WS61. You do
34 not need to uncompress the files.
36 Then give that directory as the argument to this script and capture
37 the script's output to a file:
39 % process_wormbase.pl ./WS61 > wormbase.gff
41 It may take a while before you see output from this script, since it
42 must first fetch gene and protein database from the remote AceDB
43 running at www.wormbase.org.
44 The wormbase.gff file can then be loaded into a Bio::DB::GFF database
45 using the following command:
47 % bulk_load_gff.pl -d <databasename> wormbase.gff
48 USAGE
52 die $USAGE if $ARGV[0]=~/^-?-h/i;
54 my $db = Ace->connect(-url=>ACEDB,
55 -query_timeout=>500) or die "Can't open ace database:",Ace->error;
57 if (-d $ARGV[0]) {
58 @ARGV = <$ARGV[0]/*.gff.gz>;
61 @ARGV || die $USAGE;
63 foreach (@ARGV) { # GFF FILES
64 $_ = "gunzip -c $_ |" if /\.gz$/;
67 my (%NOTES,%LOCUS,%GENBANK,%CONFIRMED,%ORFEOME);
68 get_confirmed($db,\%CONFIRMED);
69 get_genbank($db,\%GENBANK);
70 get_loci($db,\%LOCUS);
71 get_notes($db,\%NOTES);
72 get_orfeome($db,\%ORFEOME);
74 while (<>) {
75 chomp;
76 next if /^\#/;
77 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split /\t/;
78 next if $source eq 'assembly_tag'; # don't want 'em, don't need 'em
79 $ref =~ s/^CHROMOSOME_//;
80 $group =~ s/CHROMOSOME_//;
82 $source ='' if $source eq '*UNKNOWN*';
84 if ($method eq 'Sequence' && ($source eq 'curated' || $source eq 'RNA') && $group =~ /Sequence "(\w+\.\d+[a-z]?)"/) {
85 my @notes;
86 push @notes,map { qq(Note "$_") } @{$NOTES{$1}} if $NOTES{$1};
87 push @notes,map { qq(Note "$_") } @{$LOCUS{$1}} if $LOCUS{$1};
88 push @notes,qq(Confirmed_by "$CONFIRMED{$1}") if $CONFIRMED{$1};
89 $group = join ' ; ',$group,@notes;
90 if (my $loci = $LOCUS{$1}) {
91 foreach (@$loci) {
92 print join("\t",$ref,$source,'gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n";
93 print join("\t",$ref,'framework','gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n"
94 if $framework{$_} && !$framework_seen{$_}++;
99 if ($method eq 'Sequence' && $source eq 'Genomic_canonical' && $group =~ /Sequence "(\w+)"/) {
100 if (my $accession = $GENBANK{$1}) {
101 $group .= qq( ; Note "Genbank $accession");
102 print join("\t",$ref,'Genbank',$method,$start,$stop,$score,$strand,$phase,"Genbank \"$accession\""),"\n";
106 if ($method eq 'reagent' && $source eq 'Orfeome_project' && $group =~ /PCR_product "([^\"]+)"/) {
107 my $amp = $ORFEOME{$1};
108 $group .= qq( ; Amplified $amp) if defined $amp;
111 # fix variant fields: Variant "T" => Note "T"
112 $group =~ s/(?:Variant|Insert) "(\w+)"/Note "$1"/;
114 # fix UTR fields
115 if ($group =~ /UTR "([35])_UTR:(\S+)"/) {
116 $method = 'UTR';
117 $source = "$1_UTR";
118 $group = qq(Sequence "$2");
121 print join("\t",$ref,$source,$method,$start,$stop,$score,$strand,$phase,$group),"\n";
124 sub get_loci {
125 my ($db,$hash) = @_; # hash keys are predicted gene names, values are one or more loci names
126 my @genes = $db->fetch(-query=>'find Locus Genomic_sequence',-filltag=>'Genomic_sequence');
127 foreach my $obj (@genes) {
128 my @genomic = $obj->Genomic_sequence or next;
129 foreach (@genomic) {
130 push @{$hash->{$_}},$obj;
135 sub get_notes {
136 my ($db,$hash) = @_; # hash keys are predicted gene names, values are one or more brief identifications
137 my @genes = $db->fetch(-query=>'find Sequence Brief_identification',-filltag=>'Brief_identification');
138 foreach my $obj (@genes) {
139 my @notes = $obj->Brief_identification or next;
140 $hash->{$obj} = \@notes;
144 sub get_genbank {
145 my ($db,$hash) = @_; # hash keys are cosmid names, values are genbank accessions (1 to 1)
146 my @cosmids = $db->fetch(-query=>'find Genome_Sequence Database',-filltag=>'Database');
147 for my $cosmid (@cosmids) {
148 my ($database,undef,$accession) = $cosmid->Database(1)->row;
149 next unless $accession;
150 $hash->{$cosmid} = $accession;
154 sub get_confirmed {
155 my ($db,$hash) = @_; # hash keys are predicted gene names, values are confirmation type
156 my @confirmed = $db->fetch(-query=>'find Sequence Confirmed_by',-filltag=>'Confirmed_by');
157 foreach my $obj (@confirmed) {
158 my $confirmed_by = $obj->Confirmed_by || 'Unknown';
159 $hash->{$obj} = $confirmed_by;
163 sub get_orfeome {
164 my ($db,$hash) = @_;
165 my @mv_primers = $db->fetch(-query=>'find PCR_Product mv*',-filltag=>'Amplified');
166 for my $obj (@mv_primers) {
167 my $amplified = $obj->Amplified;
168 $hash->{$obj} = $amplified;
172 __END__
174 =head1 NAME
176 bp_process_wormbase.pl - Massage WormBase GFF files into a version suitable for the Generic Genome Browser
178 =head1 SYNOPSIS
180 % bp_process_wormbase.pl ./WS61 > wormbase.gff
182 =head1 DESCRIPTION
184 This script massages the Wormbase GFF files located at
185 ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
186 GFF format suitable for display by the generic genome browser. It
187 mainly adds comments to the annotations and designates certain
188 well-spaced genetic loci as framework landmarks.
190 This script requires the AcePerl distribution, which is available on
191 CPAN (look for the "Ace" module).
193 To use this script, get the WormBase GFF files from the FTP site
194 listed above and place them in a directory. It might be a good idea
195 to name the directory after the current release, such as WS61. You do
196 not need to uncompress the files.
198 Then give that directory as the argument to this script and capture
199 the script's output to a file:
201 % bp_process_wormbase.pl ./WS61 > wormbase.gff
203 It may take a while before you see output from this script, since it
204 must first fetch gene and protein database from the remote AceDB
205 running at www.wormbase.org.
206 The wormbase.gff file can then be loaded into a Bio::DB::GFF database
207 using the following command:
209 % bulk_load_gff.pl -d <databasename> wormbase.gff
211 =head1 SEE ALSO
213 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
215 =head1 AUTHOR
217 Lincoln Stein E<lt>lstein@cshl.orgE<gt>
219 Copyright (c) 2002 Cold Spring Harbor Laboratory
221 This library is free software; you can redistribute it and/or modify
222 it under the same terms as Perl itself. See DISCLAIMER.txt for
223 disclaimers of warranty.
225 =cut