3 use constant ACEDB
=> 'sace://aceserver.cshl.org:2005';
8 my @framework = qw(mex-3 spe-15 lin-17 unc-11 dhc-1 unc-40 smg-5
9 unc-13 unc-29 eat-16 lin-11 spe-9 par-6 unc-59 unc-54 mab-9 lin-42
10 sri-71 smu-2 vab-1 bli-2 dpy-10 him-14 mig-5 unc-4 bli-1 sqt-1 rol-1
11 his-14 unc-52 unc-45 par-2 let-805 sel-8 mab-21 daf-4 sma-3 lin-39
12 unc-32 tax-4 ced-9 tra-1 nob-1 daf-1 ced-2 lin-1 unc-17 dpy-13 unc-5
13 smg-7 dif-1 lin-49 elt-1 daf-14 dpy-20 dpy-26 unc-30 tra-3 sup-24
14 rho-1 egl-8 unc-60 srh-36 apx-1 unc-62 let-418 dpy-11 let-413 sel-9
15 unc-42 egl-9 sma-1 sqt-3 odr-3 hda-1 unc-76 gcy-20 skr-5 par-4 unc-51
16 egl-17 lim-6 fox-1 fax-1 lon-2 unc-97 unc-6 unc-18 mec-10 sop-1 mab-18
17 sdc-2 odr-7 unc-9 unc-3 gas-1 ace-1);
18 my %framework = map {$_=>1} @framework;
19 my %framework_seen = ();
22 This script massages the Wormbase GFF files located at
23 ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
24 GFF format suitable for display by the generic genome browser. It
25 mainly adds comments to the annotations and designates certain
26 well-spaced genetic loci as framework landmarks.
28 This script requires the AcePerl distribution, which is available on
29 CPAN (look for the "Ace" module).
31 To use this script, get the WormBase GFF files from the FTP site
32 listed above and place them in a directory. It might be a good idea
33 to name the directory after the current release, such as WS61. You do
34 not need to uncompress the files.
36 Then give that directory as the argument to this script and capture
37 the script's output to a file:
39 % process_wormbase.pl ./WS61 > wormbase.gff
41 It may take a while before you see output from this script, since it
42 must first fetch gene and protein database from the remote AceDB
43 running at www.wormbase.org.
44 The wormbase.gff file can then be loaded into a Bio::DB::GFF database
45 using the following command:
47 % bulk_load_gff.pl -d <databasename> wormbase.gff
52 die $USAGE if $ARGV[0]=~/^-?-h/i;
54 my $db = Ace
->connect(-url
=>ACEDB
,
55 -query_timeout
=>500) or die "Can't open ace database:",Ace
->error;
58 @ARGV = <$ARGV[0]/*.gff
.gz
>;
63 foreach (@ARGV) { # GFF FILES
64 $_ = "gunzip -c $_ |" if /\.gz$/;
67 my (%NOTES,%LOCUS,%GENBANK,%CONFIRMED,%ORFEOME);
68 get_confirmed
($db,\
%CONFIRMED);
69 get_genbank
($db,\
%GENBANK);
70 get_loci
($db,\
%LOCUS);
71 get_notes
($db,\
%NOTES);
72 get_orfeome
($db,\
%ORFEOME);
77 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split /\t/;
78 next if $source eq 'assembly_tag'; # don't want 'em, don't need 'em
79 $ref =~ s/^CHROMOSOME_//;
80 $group =~ s/CHROMOSOME_//;
82 $source ='' if $source eq '*UNKNOWN*';
84 if ($method eq 'Sequence' && ($source eq 'curated' || $source eq 'RNA') && $group =~ /Sequence "(\w+\.\d+[a-z]?)"/) {
86 push @notes,map { qq(Note
"$_") } @
{$NOTES{$1}} if $NOTES{$1};
87 push @notes,map { qq(Note
"$_") } @
{$LOCUS{$1}} if $LOCUS{$1};
88 push @notes,qq(Confirmed_by
"$CONFIRMED{$1}") if $CONFIRMED{$1};
89 $group = join ' ; ',$group,@notes;
90 if (my $loci = $LOCUS{$1}) {
92 print join("\t",$ref,$source,'gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n";
93 print join("\t",$ref,'framework','gene',$start,$stop,$score,$strand,$phase,"Locus $_"),"\n"
94 if $framework{$_} && !$framework_seen{$_}++;
99 if ($method eq 'Sequence' && $source eq 'Genomic_canonical' && $group =~ /Sequence "(\w+)"/) {
100 if (my $accession = $GENBANK{$1}) {
101 $group .= qq( ; Note
"Genbank $accession");
102 print join("\t",$ref,'Genbank',$method,$start,$stop,$score,$strand,$phase,"Genbank \"$accession\""),"\n";
106 if ($method eq 'reagent' && $source eq 'Orfeome_project' && $group =~ /PCR_product "([^\"]+)"/) {
107 my $amp = $ORFEOME{$1};
108 $group .= qq( ; Amplified
$amp) if defined $amp;
111 # fix variant fields: Variant "T" => Note "T"
112 $group =~ s/(?:Variant|Insert) "(\w+)"/Note "$1"/;
115 if ($group =~ /UTR "([35])_UTR:(\S+)"/) {
118 $group = qq(Sequence
"$2");
121 print join("\t",$ref,$source,$method,$start,$stop,$score,$strand,$phase,$group),"\n";
125 my ($db,$hash) = @_; # hash keys are predicted gene names, values are one or more loci names
126 my @genes = $db->fetch(-query
=>'find Locus Genomic_sequence',-filltag
=>'Genomic_sequence');
127 foreach my $obj (@genes) {
128 my @genomic = $obj->Genomic_sequence or next;
130 push @
{$hash->{$_}},$obj;
136 my ($db,$hash) = @_; # hash keys are predicted gene names, values are one or more brief identifications
137 my @genes = $db->fetch(-query
=>'find Sequence Brief_identification',-filltag
=>'Brief_identification');
138 foreach my $obj (@genes) {
139 my @notes = $obj->Brief_identification or next;
140 $hash->{$obj} = \
@notes;
145 my ($db,$hash) = @_; # hash keys are cosmid names, values are genbank accessions (1 to 1)
146 my @cosmids = $db->fetch(-query
=>'find Genome_Sequence Database',-filltag
=>'Database');
147 for my $cosmid (@cosmids) {
148 my ($database,undef,$accession) = $cosmid->Database(1)->row;
149 next unless $accession;
150 $hash->{$cosmid} = $accession;
155 my ($db,$hash) = @_; # hash keys are predicted gene names, values are confirmation type
156 my @confirmed = $db->fetch(-query
=>'find Sequence Confirmed_by',-filltag
=>'Confirmed_by');
157 foreach my $obj (@confirmed) {
158 my $confirmed_by = $obj->Confirmed_by || 'Unknown';
159 $hash->{$obj} = $confirmed_by;
165 my @mv_primers = $db->fetch(-query
=>'find PCR_Product mv*',-filltag
=>'Amplified');
166 for my $obj (@mv_primers) {
167 my $amplified = $obj->Amplified;
168 $hash->{$obj} = $amplified;
176 bp_process_wormbase.pl - Massage WormBase GFF files into a version suitable for the Generic Genome Browser
180 % bp_process_wormbase.pl ./WS61 > wormbase.gff
184 This script massages the Wormbase GFF files located at
185 ftp://www.wormbase.org/pub/wormbase/GENE_DUMPS into a version of the
186 GFF format suitable for display by the generic genome browser. It
187 mainly adds comments to the annotations and designates certain
188 well-spaced genetic loci as framework landmarks.
190 This script requires the AcePerl distribution, which is available on
191 CPAN (look for the "Ace" module).
193 To use this script, get the WormBase GFF files from the FTP site
194 listed above and place them in a directory. It might be a good idea
195 to name the directory after the current release, such as WS61. You do
196 not need to uncompress the files.
198 Then give that directory as the argument to this script and capture
199 the script's output to a file:
201 % bp_process_wormbase.pl ./WS61 > wormbase.gff
203 It may take a while before you see output from this script, since it
204 must first fetch gene and protein database from the remote AceDB
205 running at www.wormbase.org.
206 The wormbase.gff file can then be loaded into a Bio::DB::GFF database
207 using the following command:
209 % bulk_load_gff.pl -d <databasename> wormbase.gff
213 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
217 Lincoln Stein E<lt>lstein@cshl.orgE<gt>
219 Copyright (c) 2002 Cold Spring Harbor Laboratory
221 This library is free software; you can redistribute it and/or modify
222 it under the same terms as Perl itself. See DISCLAIMER.txt for
223 disclaimers of warranty.