examples/tools/extract_genes.pl

   1 #!/usr/bin/perl
   2 =pod
   3
   4 =head1 NAME
   5
   6 extract_genes.pl - extract genomic sequences from NCBI files
   7 using BioPerl
   8
   9 =head1 DESCRIPTION
  10
  11 This script is a simple solution to the problem of
  12 extracting genomic regions corresponding to genes. There are other
  13 solutions, this particular approach uses genomic sequence
  14 files from NCBI and gene coordinates from Entrez Gene.
  15
  16 The first time this script is run it will be slow as it will
  17 extract species-specific data from the gene2accession file and create
  18 a storable hash (retrieving the positional data from this hash is
  19 significantly faster than reading gene2accession each time the script
  20 runs). The subsequent runs should be fast.
  21
  22 =head1 INSTALLATION
  23
  24 =head2
  25
  26 Install BioPerl, full instructions at http://bioperl.org.
  27
  28 =head2 Download gene2accession.gz
  29
  30 Download this file from ftp://ftp.ncbi.nlm.nih.gov/gene/DATA into
  31 your working directory and gunzip it.
  32
  33 =head2 Download sequence files
  34
  35 Create one or more species directories in the working directory, the
  36 directory names do not have to match those at NCBI (e.g. "Sc", "Hs").
  37
  38 Download the nucleotide fasta files for a given species from its CHR*
  39 directories at ftp://ftp.ncbi.nlm.nih.gov/genomes and put these files into a
  40 species directory. The sequence files will have the suffix ".fna" or
  41 "fa.gz", gunzip if necessary.
  42
  43 =head2 Determine Taxon id
  44
  45 Determine the taxon id for the given species. This id is the first column
  46 in the gene2accession file. Modify the %species hash in this script
  47 such that name of your species directory is a key and the taxon id is the
  48 value.
  49
  50 =head2 Command-line options
  51
  52   -i   Gene id
  53   -s   Name of species directory
  54   -h   Help
  55
  56 Example:
  57
  58   extract_genes.pl -i 850302 -s Sc
  59
  60 =cut
  61
  62 use strict;
  63 use Bio::DB::Fasta;
  64 use Getopt::Long;
  65 use Storable;
  66
  67 my %species = ( "Sc" => 4932,  # Saccharomyces cerevisiae
  68                                      "Ec" => 83333, # Escherichia coli K12
  69                                           "Hs" => 9606   # H. sapiens
  70                                    );
  71
  72 my ($help,$id,$name);
  73
  74 GetOptions( "s=s"  =>  \$name,
  75             "i=i"  =>  \$id,
  76                                 "h"    =>  \$help );
  77
  78 usage() if ($help || !$id || !$name);
  79
  80 my $storedHash = $name . ".dump";
  81
  82 # create index for a directory of fasta files
  83 my $db = Bio::DB::Fasta->new($name, -makeid => \&make_my_id);
  84
  85 # extract species-specific data from gene2accession
  86 unless (-e $storedHash) {
  87         my $ref;
  88         # extract species-specific information from gene2accession
  89         my $file = 'gene2accession';
  90         open my $MYIN, '<', $file or die "Could not read file '$file': $!\n";
  91         while (my $line = <$MYIN>) {
  92                 my @arr = split "\t", $line;
  93                 if ($arr[0] == $species{$name} && $arr[9] =~ /\d+/ && $arr[10] =~ /\d+/) {
  94                         ($ref->{$arr[1]}->{"start"}, $ref->{$arr[1]}->{"end"},
  95                          $ref->{$arr[1]}->{"strand"}, $ref->{$arr[1]}->{"id"}) =
  96                                 ($arr[9], $arr[10], $arr[11], $arr[7]);
  97                 }
  98         }
  99         close $MYIN;
 100         # save species-specific information using Storable
 101         store $ref, $storedHash;
 102 }
 103
 104 # retrieve the species-specific data from a stored hash
 105 my $ref = retrieve($storedHash);
 106
 107 # retrieve sequence and sub-sequence
 108 if (defined $ref->{$id}) {
 109         my $chr = $db->get_Seq_by_id($ref->{$id}->{"id"});
 110         my $seq = $chr->trunc($ref->{$id}->{"start"},$ref->{$id}->{"end"});
 111         $seq = $seq->revcom if ($ref->{$id}->{"strand"} eq "-");
 112
 113         # Insert SeqIO options here...
 114         print $seq->seq,"\n";
 115 } else {
 116         print "Cannot find id: $id\n";
 117 }
 118
 119 sub make_my_id {
 120         my $line = shift;
 121         $line =~ /ref\|([^|]+)/;
 122         $1;
 123 }
 124
 125 sub usage {
 126         system "perldoc $0";
 127         exit;
 128 }
 129
 130 __END__