6 extract_genes.pl - extract genomic sequences from NCBI files
11 This script is a simple solution to the problem of
12 extracting genomic regions corresponding to genes. There are other
13 solutions, this particular approach uses genomic sequence
14 files from NCBI and gene coordinates from Entrez Gene.
16 The first time this script is run it will be slow as it will
17 extract species-specific data from the gene2accession file and create
18 a storable hash (retrieving the positional data from this hash is
19 significantly faster than reading gene2accession each time the script
20 runs). The subsequent runs should be fast.
26 Install BioPerl, full instructions at http://bioperl.org.
28 =head2 Download gene2accession.gz
30 Download this file from ftp://ftp.ncbi.nlm.nih.gov/gene/DATA into
31 your working directory and gunzip it.
33 =head2 Download sequence files
35 Create one or more species directories in the working directory, the
36 directory names do not have to match those at NCBI (e.g. "Sc", "Hs").
38 Download the nucleotide fasta files for a given species from its CHR*
39 directories at ftp://ftp.ncbi.nlm.nih.gov/genomes and put these files into a
40 species directory. The sequence files will have the suffix ".fna" or
41 "fa.gz", gunzip if necessary.
43 =head2 Determine Taxon id
45 Determine the taxon id for the given species. This id is the first column
46 in the gene2accession file. Modify the %species hash in this script
47 such that name of your species directory is a key and the taxon id is the
50 =head2 Command-line options
53 -s Name of species directory
58 extract_genes.pl -i 850302 -s Sc
67 my %species = ( "Sc" => 4932, # Saccharomyces cerevisiae
68 "Ec" => 83333, # Escherichia coli K12
69 "Hs" => 9606 # H. sapiens
74 GetOptions
( "s=s" => \
$name,
78 usage
() if ($help || !$id || !$name);
80 my $storedHash = $name . ".dump";
82 # create index for a directory of fasta files
83 my $db = Bio
::DB
::Fasta
->new($name, -makeid
=> \
&make_my_id
);
85 # extract species-specific data from gene2accession
86 unless (-e
$storedHash) {
88 # extract species-specific information from gene2accession
89 my $file = 'gene2accession';
90 open my $MYIN, '<', $file or die "Could not read file '$file': $!\n";
91 while (my $line = <$MYIN>) {
92 my @arr = split "\t", $line;
93 if ($arr[0] == $species{$name} && $arr[9] =~ /\d+/ && $arr[10] =~ /\d+/) {
94 ($ref->{$arr[1]}->{"start"}, $ref->{$arr[1]}->{"end"},
95 $ref->{$arr[1]}->{"strand"}, $ref->{$arr[1]}->{"id"}) =
96 ($arr[9], $arr[10], $arr[11], $arr[7]);
100 # save species-specific information using Storable
101 store
$ref, $storedHash;
104 # retrieve the species-specific data from a stored hash
105 my $ref = retrieve
($storedHash);
107 # retrieve sequence and sub-sequence
108 if (defined $ref->{$id}) {
109 my $chr = $db->get_Seq_by_id($ref->{$id}->{"id"});
110 my $seq = $chr->trunc($ref->{$id}->{"start"},$ref->{$id}->{"end"});
111 $seq = $seq->revcom if ($ref->{$id}->{"strand"} eq "-");
113 # Insert SeqIO options here...
114 print $seq->seq,"\n";
116 print "Cannot find id: $id\n";
121 $line =~ /ref\|([^|]+)/;