From 522684aed5910492f72c70799339c6f254cd9ece Mon Sep 17 00:00:00 2001
From: =?utf8?q?Carn=C3=AB=20Draug?=
Date: Tue, 25 Sep 2018 18:18:11 +0100
Subject: [PATCH] Bio::DB::GFF move into its own namespace (except Bio::DB::GFF::Util)

We don't move Bio::DB::GFF::Util because it doesn't make use of the rest
of Bio::DB::GFF (it does not even take its objects as arguments) and it
is still used by Bio::SeqFeature::Collection.
---
 Changes | 19 +-
 bin/bp_bulk_load_gff | 698 ----
 bin/bp_das_server | 528 ---
 bin/bp_fast_load_gff | 497 ---
 bin/bp_genbank2gff | 310 --
 bin/bp_generate_histogram | 137 -
 bin/bp_load_gff | 166 -
 bin/bp_meta_gff | 87 -
 lib/Bio/DB/GFF.pm | 3896 --------
 lib/Bio/DB/GFF/Adaptor/berkeleydb.pm | 1129 ------
 lib/Bio/DB/GFF/Adaptor/berkeleydb/iterator.pm | 83 -
 lib/Bio/DB/GFF/Adaptor/biofetch.pm | 355 --
 lib/Bio/DB/GFF/Adaptor/biofetch_oracle.pm | 301 --
 lib/Bio/DB/GFF/Adaptor/dbi.pm | 2474 -------
 lib/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm | 273 --
 lib/Bio/DB/GFF/Adaptor/dbi/iterator.pm | 74 -
 lib/Bio/DB/GFF/Adaptor/dbi/mysql.pm | 909 -----
 lib/Bio/DB/GFF/Adaptor/dbi/mysqlcmap.pm | 1176 ------
 lib/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm | 31 -
 lib/Bio/DB/GFF/Adaptor/dbi/oracle.pm | 1029 ------
 lib/Bio/DB/GFF/Adaptor/dbi/pg.pm | 1390 -------
 lib/Bio/DB/GFF/Adaptor/dbi/pg_fts.pm | 363 --
 lib/Bio/DB/GFF/Adaptor/memory.pm | 718 ----
 .../DB/GFF/Adaptor/memory/feature_serializer.pm | 37 -
 lib/Bio/DB/GFF/Adaptor/memory/iterator.pm | 79 -
 lib/Bio/DB/GFF/Aggregator.pm | 642 ----
 lib/Bio/DB/GFF/Aggregator/alignment.pm | 138 -
 lib/Bio/DB/GFF/Aggregator/clone.pm | 160 -
 lib/Bio/DB/GFF/Aggregator/coding.pm | 102 -
 lib/Bio/DB/GFF/Aggregator/gene.pm | 108 -
 lib/Bio/DB/GFF/Aggregator/match.pm | 105 -
 lib/Bio/DB/GFF/Aggregator/none.pm | 43 -
 lib/Bio/DB/GFF/Aggregator/orf.pm | 91 -
 lib/Bio/DB/GFF/Aggregator/processed_transcript.pm | 107 -
 lib/Bio/DB/GFF/Aggregator/so_transcript.pm | 110 -
 lib/Bio/DB/GFF/Aggregator/transcript.pm | 114 -
 lib/Bio/DB/GFF/Aggregator/ucsc_acembly.pm | 99 -
 lib/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm | 99 -
 lib/Bio/DB/GFF/Aggregator/ucsc_genscan.pm | 98 -
 lib/Bio/DB/GFF/Aggregator/ucsc_refgene.pm | 98 -
 lib/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm | 99 -
 lib/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm | 99 -
 lib/Bio/DB/GFF/Aggregator/ucsc_softberry.pm | 98 -
 lib/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm | 98 -
 lib/Bio/DB/GFF/Aggregator/ucsc_unigene.pm | 100 -
 lib/Bio/DB/GFF/Featname.pm | 153 -
 lib/Bio/DB/GFF/Feature.pm | 1374 -------
 lib/Bio/DB/GFF/Homol.pm | 100 -
 lib/Bio/DB/GFF/RelSegment.pm | 1168 ------
 lib/Bio/DB/GFF/Segment.pm | 884 -----
 lib/Bio/DB/GFF/Typename.pm | 188 -
 t/LocalDB/BioDBGFF.t | 454 ---
 t/data/biodbgff/test.gff | 42 -
 t/data/biodbgff/test.gff3 | 44 -
 54 files changed, 18 insertions(+), 23756 deletions(-)
 delete mode 100644 bin/bp_bulk_load_gff
 delete mode 100644 bin/bp_das_server
 delete mode 100644 bin/bp_fast_load_gff
 delete mode 100644 bin/bp_genbank2gff
 delete mode 100644 bin/bp_generate_histogram
 delete mode 100644 bin/bp_load_gff
 delete mode 100644 bin/bp_meta_gff
 delete mode 100644 lib/Bio/DB/GFF.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/berkeleydb.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/berkeleydb/iterator.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/biofetch.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/biofetch_oracle.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/iterator.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/mysql.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/mysqlcmap.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/oracle.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/pg.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/dbi/pg_fts.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/memory.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/memory/feature_serializer.pm
 delete mode 100644 lib/Bio/DB/GFF/Adaptor/memory/iterator.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/alignment.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/clone.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/coding.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/gene.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/match.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/none.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/orf.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/processed_transcript.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/so_transcript.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/transcript.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_acembly.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_genscan.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_refgene.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_softberry.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm
 delete mode 100644 lib/Bio/DB/GFF/Aggregator/ucsc_unigene.pm
 delete mode 100644 lib/Bio/DB/GFF/Featname.pm
 delete mode 100644 lib/Bio/DB/GFF/Feature.pm
 delete mode 100644 lib/Bio/DB/GFF/Homol.pm
 delete mode 100644 lib/Bio/DB/GFF/RelSegment.pm
 delete mode 100644 lib/Bio/DB/GFF/Segment.pm
 delete mode 100644 lib/Bio/DB/GFF/Typename.pm
 delete mode 100644 t/LocalDB/BioDBGFF.t
 delete mode 100644 t/data/biodbgff/test.gff
 delete mode 100644 t/data/biodbgff/test.gff3

diff --git a/Changes b/Changes
index 6d4d2f917..ece460bc7 100644
--- a/Changes
+++ b/Changes
@@ -53,6 +53,16 @@ be removed.
 Bio::ClusterIO::*
 Bio::DB::Expression
 Bio::DB::Expression::geo
+ Bio::DB::GFF
+ Bio::DB::GFF::Adaptor::*
+ Bio::DB::GFF::Aggregator
+ Bio::DB::GFF::Aggregator::*
+ Bio::DB::GFF::Featname
+ Bio::DB::GFF::Feature
+ Bio::DB::GFF::Homol
+ Bio::DB::GFF::RelSegment
+ Bio::DB::GFF::Segment
+ Bio::DB::GFF::Typename
 Bio::DB::SeqFeature
 Bio::DB::SeqFeature::*
 Bio::Index::Stockholm
@@ -66,15 +76,22 @@ be removed.
 Bio::SeqIO::exp
 Bio::SeqIO::pln
 Bio::SeqIO::ztr
- Bio::Variation::*
 Bio::Tools::AlignFactory
 Bio::Tools::Phylo::Gumby
 Bio::Tools::dpAlign
 Bio::Tools::pSW
+ Bio::Variation::*
 
 * The following programs have been removed:
 
+ bp_bulk_load_gff
+ bp_das_server
+ bp_fast_load_gff
 bp_flanks
+ bp_genbank2gff
+ bp_generate_histogram
+ bp_load_gff
+ bp_meta_gff
 bp_netinstall
 bp_process_wormbase
 bp_seqfeature_delete
diff --git a/bin/bp_bulk_load_gff b/bin/bp_bulk_load_gff
deleted file mode 100644
index c4ad73a4f..000000000
--- a/bin/bp_bulk_load_gff
+++ /dev/null
@@ -1,698 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-# use lib './blib/lib';
-use DBI;
-use IO::File;
-use File::Spec;
-use Getopt::Long;
-use Bio::DB::GFF;
-use Bio::DB::GFF::Util::Binning 'bin';
-
-use constant MYSQL => 'mysql';
-use constant FDATA => 'fdata';
-use constant FTYPE => 'ftype';
-use constant FGROUP => 'fgroup';
-use constant FDNA => 'fdna';
-use constant FATTRIBUTE => 'fattribute';
-use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature';
-
-=head1 NAME
-
-bp_bulk_load_gff.pl - Bulk-load a Bio::DB::GFF database from GFF files.
-
-=head1 SYNOPSIS
-
- % bp_bulk_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
-
-=head1 DESCRIPTION
-
-This script loads a Bio::DB::GFF database with the features contained
-in a list of GFF files and/or FASTA sequence files. You must use the
-exact variant of GFF described in L<Bio::DB::GFF>. Various
-command-line options allow you to control which database to load and
-whether to allow an existing database to be overwritten.
-
-This script differs from bp_load_gff.pl in that it is hard-coded to use
-MySQL and cannot perform incremental loads. See L<bp_load_gff.pl> for an
-incremental loader that works with all databases supported by
-Bio::DB::GFF, and L<bp_fast_load_gff.pl> for a MySQL loader that supports
-fast incremental loads.
-
-=head2 NOTES
-
-If the filename is given as "-" then the input is taken from standard
-input. Compressed files (.gz, .Z, .bz2) are automatically
-uncompressed.
-
-FASTA format files are distinguished from GFF files by their filename
-extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
-uppercase variants are treated as FASTA files. Everything else is
-treated as a GFF file. If you wish to load -fasta files from STDIN,
-then use the -f command-line switch with an argument of '-', as in
-
- gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
-
-The nature of the bulk load requires that the database be on the local
-machine and that the indicated user have the "file" privilege to load
-the tables and have enough room in /usr/tmp (or whatever is specified
-by the \$TMPDIR environment variable), to hold the tables transiently.
-
-Local data may now be uploaded to a remote server via the --local option
-with the database host specified in the dsn, e.g. dbi:mysql:test:db_host
-
-The adaptor used is dbi::mysqlopt. There is currently no way to
-change this.
-
-About maxfeature: the default value is 100,000,000 bases. If you have
-features that are close to or greater than 100Mb in length, then the
-value of maxfeature should be increased to 1,000,000,000. This value
-must be a power of 10.
-
-Note that Windows users must use the --create option.
-
-If the list of GFF or fasta files exceeds the kernel limit for the
-maximum number of command-line arguments, use the
---long_list /path/to/files option.
-
-
-=head1 COMMAND-LINE OPTIONS
-
-Command-line options can be abbreviated to single-letter options.
-e.g. -d instead of --database.
- - --database Database name (default dbi:mysql:test) - --adaptor Adaptor name (default mysql) - --create Reinitialize/create data tables without asking - --user Username to log in as - --fasta File or directory containing fasta files to load - --long_list Directory containing a very large number of - GFF and/or FASTA files - --password Password to use for authentication - (Does not work with Postgres, password must be - supplied interactively or be left empty for - ident authentication) - --maxbin Set the value of the maximum bin size - --local Flag to indicate that the data source is local - --maxfeature Set the value of the maximum feature size (power of 10) - --group A list of one or more tag names (comma or space separated) - to be used for grouping in the 9th column. - --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF) - --summary Generate summary statistics for drawing coverage histograms. - This can be run on a previously loaded database or during - the load. - --Temporary Location of a writable scratch directory - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein, lstein@cshl.org - -Copyright (c) 2002 Cold Spring Harbor Laboratory - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. See DISCLAIMER.txt for -disclaimers of warranty. - -=cut - -package Bio::DB::GFF::Adaptor::fauxmysql; - -use Bio::DB::GFF::Adaptor::dbi::mysqlopt; -use vars '@ISA'; -@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt'; - -sub insert_sequence { - my $self = shift; - my ($id,$offset,$seq) = @_; - print join("\t",$id,$offset,$seq),"\n"; -}; - -package Bio::DB::GFF::Adaptor::fauxmysqlcmap; - -use Bio::DB::GFF::Adaptor::dbi::mysqlcmap; -use vars '@ISA'; -@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlcmap'; - -sub insert_sequence { - my $self = shift; - my ($id,$offset,$seq) = @_; - print join("\t",$id,$offset,$seq),"\n"; -}; - -package Bio::DB::GFF::Adaptor::fauxpg; - -use Bio::DB::GFF::Adaptor::dbi::pg; -use vars '@ISA'; -@ISA = 'Bio::DB::GFF::Adaptor::dbi::pg'; - -#these two subs are to separate the table creation from the -#index creation -sub do_initialize { - my $self = shift; - my $erase = shift; - $self->drop_all if $erase; - - my $dbh = $self->features_db; - my $schema = $self->schema; - foreach my $table_name ($self->tables) { - my $create_table_stmt = $schema->{$table_name}{table} ; - $dbh->do($create_table_stmt) || warn $dbh->errstr; - # $self->create_other_schema_objects(\%{$schema->{$table_name}}); - } - 1; -} - -sub _create_indexes_etc { - my $self = shift; - - my $dbh = $self->features_db; - my $schema = $self->schema; - foreach my $table_name ($self->tables) { - $self->create_other_schema_objects(\%{$schema->{$table_name}}); - } -} - -sub insert_sequence { - my $self = shift; - my ($id,$offset,$seq) = @_; - print "$id\t$offset\t$seq\n"; -} - -package main; - -eval "use Time::HiRes"; undef $@; -my $timer = defined &Time::HiRes::time; - -my $bWINDOWS = 0; # Boolean: is this a MSWindows operating system? 
-if ($^O =~ /MSWin32/i) { - $bWINDOWS = 1; -} - -my ($DSN,$ADAPTOR,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR); - -GetOptions ('database:s' => \$DSN, - 'adaptor:s' => \$ADAPTOR, - 'create' => \$FORCE, - 'user:s' => \$USER, - 'password:s' => \$PASSWORD, - 'fasta:s' => \$FASTA, - 'local' => \$LOCAL, - 'maxbin|maxfeature:s' => \$MAX_BIN, - 'group:s' => \$GROUP_TAG, - 'long_list:s' => \$LONG_LIST, - 'gff3_munge' => \$MUNGE, - 'Temporary:s' => \$TMPDIR, - ) or (system('pod2text', $0), exit -1); - -# If called as pg_bulk_load_gff.pl behave as that did. -if ($0 =~/pg_bulk_load_gff.pl/){ - $ADAPTOR ||= 'Pg'; - $DSN ||= 'test'; -} -$DSN ||= 'dbi:mysql:test'; -$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes - - -if ($bWINDOWS && not $FORCE) { - die "Note that Windows users must use the --create option.\n"; -} - -unless ($FORCE) { - die "This will delete all existing data in database $DSN. If you want to do this, rerun with the --create option.\n" - if $bWINDOWS; - open (TTY,"/dev/tty") or die "/dev/tty: $!\n"; #TTY use removed for win compatability - print STDERR "This operation will delete all existing data in database $DSN. Continue? "; - my $f = ; - die "Aborted\n" unless $f =~ /^[yY]/; - close TTY; -} - -# postgres DBD::Pg allows 'database', but also 'dbname', and 'db': -# and it must be Pg (not pg) -$DSN=~s/pg:database=/Pg:/i; -$DSN=~s/pg:dbname=/Pg:/i; -$DSN=~s/pg:db=/Pg:/i; - -# leave these lines for mysql -$DSN=~s/database=//i; -$DSN=~s/;host=/:/i; #cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host" - - -my($DBI,$DBD,$DBNAME,$HOST)=split /:/,$DSN; -$DBNAME=$DSN unless $DSN=~/:/; -$ADAPTOR ||= $DBD; -$ADAPTOR ||= 'mysql'; - -if ($DBD eq 'Pg') { - # rebuild DSN, DBD::Pg requires full dbname= format - $DSN = "dbi:Pg:dbname=$DBNAME"; - if ($HOST) { $DSN .= ";host=$HOST"; } -} - -my ($use_mysql,$use_mysqlcmap,$use_pg) = (0,0,0); -if ( $ADAPTOR eq 'mysqlcmap' ) { - $use_mysqlcmap = 1; -} -elsif ( $ADAPTOR =~ /^mysql/ ) { - $use_mysql = 1; -} -elsif ( $ADAPTOR eq "Pg" ) { - $use_pg = 1; -} -else{ - die "$ADAPTOR is not an acceptable database adaptor."; -} - - -my (@auth,$AUTH); -if (defined $USER) { - push @auth,(-user=>$USER); - if ( $use_mysql or $use_mysqlcmap ) { - $AUTH .= " -u$USER"; - } - elsif ( $use_pg ) { - $AUTH .= " -U $USER "; - } -} -if (defined $PASSWORD) { - push @auth,(-pass=>$PASSWORD); - if ( $use_mysql or $use_mysqlcmap ) { - $AUTH .= " -p$PASSWORD"; - } -# elsif ( $use_pg ) { -# $AUTH .= " -W $PASSWORD "; -# } -} - -if (defined $HOST) { - $AUTH .= " -h$HOST"; -} -if (defined $DBNAME) { - if ( $use_mysql or $use_mysqlcmap ) { - $AUTH .= " -D$DBNAME "; - } -} -if (defined $LOCAL) { - $LOCAL='local'; - $AUTH.=' --local-infile=1'; -}else { - $LOCAL=''; -} - -my $faux_adaptor; -if ( $use_mysqlcmap ) { - $faux_adaptor = "fauxmysqlcmap"; -} -elsif ( $use_mysql ) { - $faux_adaptor = "fauxmysql"; -} -elsif ( $use_pg ) { - $faux_adaptor = "fauxpg"; -} - -my $db = Bio::DB::GFF->new(-adaptor=>$faux_adaptor,-dsn => $DSN,@auth) - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; - -$db->gff3_name_munging(1) if $MUNGE; - -$MAX_BIN ? 
$db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1); -$MAX_BIN ||= $db->meta('max_bin') || 100_000_000; - -# deal with really long lists of files -if ($LONG_LIST) { - -d $LONG_LIST or die "The --long_list argument must be a directory\n"; - opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!"; - @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR; - closedir GFFDIR; - - if (defined $FASTA && -d $FASTA) { - opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!"; - push @ARGV, map { "$FASTA\/$_" } readdir FASTA; - closedir FASTA; - } - elsif (defined $FASTA && -f $FASTA) { - push @ARGV, $FASTA; - } -} - -foreach (@ARGV) { - $_ = "gunzip -c $_ |" if /\.gz$/; - $_ = "uncompress -c $_ |" if /\.Z$/; - $_ = "bunzip2 -c $_ |" if /\.bz2$/; -} - -my (@gff,@fasta); -foreach (@ARGV) { - if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) { - push @fasta,$_; - } else { - push @gff,$_; - } -} -@ARGV = @gff; -push @fasta,$FASTA if defined $FASTA; - -# drop everything that was there before -my %FH; -my $tmpdir = File::Spec->tmpdir() || '/tmp'; -$tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation --d $tmpdir or die <new(">$tmpdir/$_.$$") or die $_,": $!"; - $FH{$_}->autoflush; -} - -if ( $use_pg ) { - $FH{FDATA() }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n"); - $FH{FTYPE() }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n"); - $FH{FGROUP() }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n"); - $FH{FATTRIBUTE() }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n"); - $FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n"); -} -my $FID = 1; -my $GID = 1; -my $FTYPEID = 1; -my $ATTRIBUTEID = 1; -my %GROUPID = (); -my %FTYPEID = (); -my %ATTRIBUTEID = (); -my %DONE = (); -my $FEATURES = 0; - -my %tmpfiles; # keep track of temporary fasta files -my $count; -my $fasta_sequence_id; -my $gff3; -my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files - -$db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG; - -my $last = Time::HiRes::time() if $timer; -my $start = $last; - - # avoid hanging on standalone --fasta load -if (!@ARGV) { - $FH{NULL} = IO::File->new(">$tmpdir/null"); - push @ARGV, "$tmpdir/null"; -} - -my ($cmap_db); -if ($use_mysqlcmap){ - my $options = { - AutoCommit => 1, - FetchHashKeyName => 'NAME_lc', - LongReadLen => 3000, - LongTruncOk => 1, - RaiseError => 1, - }; - - $cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options ); -} -# Only load CMap::Utils if using cmap -unless (!$use_mysqlcmap or - eval { - require Bio::GMOD::CMap::Utils; - Bio::GMOD::CMap::Utils->import('next_number'); - 1; - } - ) { - print STDERR "Error loading Bio::GMOD::CMap::Utils\n"; -} - - -while (<>) { - - $current_file ||= $ARGV; - - # reset GFF3 flag if new filehandle - unless($current_file eq $ARGV){ - undef $gff3; - $current_file = $ARGV; - } - - chomp; - my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group); - - # close sequence filehandle if required - if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) { - $FH{FASTA}->close; - delete $FH{FASTA}; - } - - # print to fasta file if the handle is open - if ( defined $FH{FASTA} ) { - $FH{FASTA}->print("$_\n"); - next; - } - - elsif (/^>(\S+)/) { # uh oh, sequence coming - $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n"; - $FH{FASTA}->print("$_\n"); - 
print STDERR "Preparing embedded sequence $1\n"; - push @fasta, "$tmpdir/$1\.fa"; - push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa"; - $tmpfiles{"$tmpdir/$1\.fa"}++; - next; - } - - elsif (/^\#\#\s*gff-version\s+(\d+)/) { - $gff3 = ($1 >= 3); - $db->print_gff3_warning() if $gff3; - next; - } - - elsif (/^\#\#\s*group-tags\s+(.+)/) { - $db->preferred_groups(split(/\s+/,$1)); - next; - } - - elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line - ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = - ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1")); - } - - elsif (/^\#/) { - next; - } - - else { - ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; - } - if ( not defined( $ref ) or length ($ref) == 0) { - warn "\$ref is null. source = $source, method = $method, group = $group\n"; - next; - } - $FEATURES++; - my $size = $stop-$start+1; - warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN; - - $source = '\N' unless defined $source; - $score = '\N' if $score eq '.'; - $strand = '\N' if $strand eq '.'; - $phase = '\N' if $phase eq '.'; - - my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3); - - # GFF2/3 transition - $group_class = [$group_class] unless ref $group_class; - $group_name = [$group_name] unless ref $group_name; - - for (my $i=0; $i < @$group_name; $i++) { - $group_class->[$i] ||= '\N'; - $group_name->[$i] ||= '\N'; - $target_start ||= '\N'; - $target_stop ||= '\N'; - $method ||= '\N'; - $source ||= '\N'; - - my $fid = $FID++; - my $gid = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])} ||= $GID++; - my $ftypeid = $FTYPEID{lc join('',$source,$method)} ||= $FTYPEID++; - - my $bin = bin($start,$stop,$db->min_bin); - $FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" ); - if ($use_mysqlcmap){ - my $feature_id = next_number( - db => $cmap_db, - table_name => 'cmap_feature', - id_field => 'feature_id', - ) - or die 'No feature id'; - my $direction = $strand eq '-' ? -1:1; - $FH{ FGROUP() }->print( - join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],) - ,"\n" - ) unless $DONE{"G$gid"}++; - } - else { - $FH{ FGROUP() }->print( join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++; - } - $FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++; - - foreach (@$attributes) { - my ($key,$value) = @$_; - my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++; - $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++; - $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n"); - } - - if ( $fid % 1000 == 0) { - my $now = Time::HiRes::time() if $timer; - my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : ''; - $last = $now; - print STDERR "$fid features parsed$elapsed..."; - print STDERR -t STDOUT && !$ENV{EMACS} ? 
"\r" : "\n"; - } - } -} - -$FH{FASTA}->close if exists $FH{FASTA}; - -for my $file (@fasta) { - warn "Preparing DNA file $file....\n"; - if ($use_pg){ - $FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n"); - } - my $old = select($FH{FDNA()}); - $db->load_fasta($file) or warn "Couldn't load fasta file $file: $!"; - if ($use_pg){ - $FH{FDNA() }->print("\\.\n\n"); - } - warn "done...\n"; - select $old; - unlink $file if $tmpfiles{$file}; -} - -if ($use_pg) { - $FH{FDATA() }->print("\\.\n\n"); - $FH{FTYPE() }->print("\\.\n\n"); - $FH{FGROUP() }->print("\\.\n\n"); - $FH{FATTRIBUTE() }->print("\\.\n\n"); - $FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n"); -} - - -$_->close foreach values %FH; -printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer; -warn "Loading feature data and analyzing tables. You may see RDBMS messages here...\n"; - -if ($use_pg){ - warn "Loading feature data. You may see Postgres comments...\n"; - - foreach (@files) { - my $file = "$tmpdir/$_.$$"; - - $AUTH ? system("psql $AUTH -f $file $DBNAME") - : system('psql','-f', $file, $DBNAME); - - unlink $file; - } - - warn "Updating sequences ...\n"; - $db->update_sequences(); - - warn "Creating indexes ...\n"; - $db->_create_indexes_etc(); - - warn "done...\n"; - -} - -elsif( $use_mysql or $use_mysqlcmap ) { - $start = time(); - - my $success = 1; - my $TERMINATEDBY = $bWINDOWS ? q( LINES TERMINATED BY '\r\n') : ''; - for my $f (@files) { - my $table = function_to_table($f,$ADAPTOR); - my $sql = join ('; ', - "lock tables $table write", - "delete from $table", - "load data $LOCAL infile '$tmpdir/$f.$$' replace into table $table $TERMINATEDBY", - "unlock tables"); - my $command = MYSQL . qq[$AUTH -s -e "$sql"]; - $command =~ s/\n/ /g; - $success &&= system($command) == 0; - unlink "$tmpdir/$f.$$"; - } - printf STDERR "Total load time %5.2fs\n",(time() - $start) if $timer; - print STDERR "done...\n"; - - print STDERR "Analyzing/optimizing tables. You will see database messages...\n"; - $start = time(); - my $sql = ''; - for my $f (@files) { - my $table = function_to_table($f,$ADAPTOR); - $sql .= "analyze table $table;"; - } - my $command = MYSQL . 
qq[$AUTH -N -s -e "$sql"]; - $success &&= system($command) == 0; - printf STDERR "Optimization time time %5.2fs\n",(time() - $start); - - if ($success) { - print "$FEATURES features successfully loaded\n"; - } else { - print "FAILURE: Please see standard error for details\n"; - exit -1; - } -} - -foreach (@fasta_files_to_be_unlinked) { - unlink "$tmpdir/$_.$$"; -} - -warn "Building summary statistics for coverage histograms...\n"; -my (@args,$AUTH); -if (defined $USER) { - push @args,(-user=>$USER); - $AUTH .= " -u$USER"; -} -if (defined $PASSWORD) { - push @args,(-pass=>$PASSWORD); - $AUTH .= " -p$PASSWORD"; -} -push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG; -my $db = Bio::DB::GFF->new(-adaptor=>"dbi::$ADAPTOR",-dsn => $DSN,@args) - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; -$db->build_summary_statistics; - -exit 0; - -sub function_to_table { - my $function = shift; - my $adaptor = shift; - - if ($function eq 'fdata'){ - return 'fdata'; - } - elsif ($function eq 'ftype'){ - return 'ftype'; - } - elsif ($function eq 'fgroup'){ - return 'cmap_feature' if ($adaptor eq 'mysqlcmap'); - return 'fgroup'; - } - elsif ($function eq 'fdna'){ - return 'fdna'; - } - elsif ($function eq 'fattribute'){ - return 'fattribute'; - } - elsif ($function eq 'fattribute_to_feature'){ - return 'fattribute_to_feature'; - } - return ''; -} - -__END__ diff --git a/bin/bp_das_server b/bin/bp_das_server deleted file mode 100644 index 169b7df7b..000000000 --- a/bin/bp_das_server +++ /dev/null @@ -1,528 +0,0 @@ -#!/usr/bin/perl - -# minimal annotation server - -use strict; -use warnings; -use Apache::DBI; -use Bio::DB::GFF; -use CGI qw/header path_info param url request_method/; -use Digest::MD5 'md5_hex'; -use Carp; - -my $VERSION = 'DAS/1.00'; -(my $BASENAME = url(-absolute=>1)) =~ s!http://[^/]+/!!; - -use vars qw($DB %ERRCODES %CATEGORIES $HEADER - %DSN %TYPE2CATEGORY %TYPEOBJECTS - %EXCLUDE - ); - -# dsn description db server map master -%DSN = ( - 'chr22_transcripts' => [q(EST-predicted transcripts on chr22 from Jean Thierry-Mieg), - 'dbi:mysql:database=tm_chr22;host=brie3.cshl.org', - 'http://servlet.sanger.ac.uk:8080/das/ensembl110'] - ); -######################################################################################## - -%ERRCODES = ( - 200 => 'OK', - 400 => 'Bad command', - 401 => 'Bad data source', - 402 => 'Bad command arguments', - 403 => 'Bad reference object', - 404 => 'Bad stylesheet', - 405 => 'Coordinate error', - 500 => 'Internal server error (oops)', - 501 => 'Unimplemented feature', - ); - -%CATEGORIES = ( - component => [qw(Sequence:Contig Sequence:Link Sequence:Genomic_canonical - static_golden_path_contig:ensembl ensembl_contig:ensembl)], - transcription => [qw(Sequence:curated polyA_site stop CpG misc_signal intron exon transcript CDS)], - homology => [qw(similarity)], - repeat => [qw(Alu repeat repeat_region repeat_unit misc_feature)], - structural => [qw(Clone cosmid OLIGO PCR_product structural compression Comment Conflict)], - experimental => [qw(experimental RNAi)], -); - -%EXCLUDE = ( - 'static_golden_path_contig:ensembl' => 1, - 'ensembl_contig:ensembl' => 1, - 'Sequence:Contig' => 1, - ); - -while (my($c,$v) = each %CATEGORIES) { # invert nicely - for my $typename (@$v) { - my $typeobj = Bio::DB::GFF::Typename->new($typename); - $TYPE2CATEGORY{$typeobj} = $c; - $TYPEOBJECTS{$typeobj} = $typeobj; - } -} - -$HEADER = 0; -my ($junk,$DSN,$OPERATION) = split '/',path_info(); - -do { error_header('invalid request',400); exit 0 } 
unless $DSN; -do { list_dsns(); exit 0 } if $DSN eq 'dsn' or $OPERATION eq 'dsn'; -do { error_header('invalid data source, use the dsn command to get list',401); exit 0 } unless $DSN{$DSN}; - -do { error_header('Could not open database',500); exit 0 } - unless $DB = openDB($DSN); - -do { entry_points(); exit 0 } if $OPERATION eq 'entry_points'; -do { types(); exit 0 } if $OPERATION eq 'types'; -do { features(); exit 0 } if $OPERATION eq 'features'; -do { stylesheet(); exit 0 } if $OPERATION eq 'stylesheet'; - -error_header('invalid request',400); -exit 0; - -# ----------------------------------------------------------------- -sub openDB { - my $name = shift; - my $db = Bio::DB::GFF->new(-adaptor=>'dbi::mysqlopt',-dsn=>$DSN{$name}[1]); - $db->automerge(0); - $db->debug(0); - return $db; -} - -# ----------------------------------------------------------------- -sub list_dsns { - my $j = ' 'x3; - ok_header(); - print qq(\n\n); - print "\n"; - - for my $dsn (sort keys %DSN) { - print "$j\n"; - print qq($j$j$DSN{$dsn}[0]\n); - print qq($j$j$DSN{$dsn}[2]/\n); - print qq($j$jThis is the $DSN{$dsn}[0] database\n); - print "$j\n"; - } - print "\n"; -} - -# ----------------------------------------------------------------- -sub entry_points { - my $segments = get_segments(); - - my @parts; - if ($segments) { - @parts = map { get_segment_obj(@$_) } @$segments; - @parts = map { $_->contained_features(-types=>['Sequence:Link','Sequence:Contig','Sequence:Genomic_canonical'],-merge=>0) } @parts; - } else { - @parts = grep {$_->name =~ /^CHR/i} $DB->features(-types=>['Sequence:Link','Sequence:Contig','Sequence:Genomic_canonical'],-merge=>0); - } - - my $url = get_url(); - - ok_header(); - print < - - - -END -; - - for my $part (@parts) { - $part->absolute(1); - my $name = $part->name; - my $st = $part->start; - my $en = $part->stop; - my $class = $part->class; - my $length = $part->length; - my $orientation = $part->strand > 0 ? '+' : '-'; - my $subparts = $part->source =~ /Link|Chromosome|Contig/ ? 'yes' : 'no'; - print qq($name\n); - } - print "\n\n"; -} - -# ----------------------------------------------------------------- -# get the features for the segment indicated -sub features { - my @segments = get_segments() or return; - - my $summary = param('summary'); - my $url = get_url(); - my @filter = param('type'); - my @category = param('category'); - push @filter,make_categories(@category); - - - ok_header(); - print < - - - -END -; - - foreach (@segments) { - my ($reference,$refclass,$start,$stop) = @$_; - my $seq = get_segment_obj($reference,$refclass,$start,$stop); - unless ($seq) { - print qq(\n); - next; - } - - if (lc(param('category')) eq 'component') { - dump_framework($seq); - next; - } - - my $r = $seq->refseq; - my $s = $seq->start; - my $e = $seq->stop; - ($s,$e) = ($e,$s) if $s > $e; - - print qq(\n); - - my $iterator = $seq->features(-types=>\@filter,-merge=>0,-iterator=>1); - - while (my $f = $iterator->next_seq) { - my $type = $f->type; - next if $EXCLUDE{$type}; - - my $flabel = $f->info || $f->type; - my $source = $f->source; - my $method = $f->method; - my $start = $f->start; - my $end = $f->stop; - my $score = $f->score; - my $orientation = $f->strand; - my $phase = $f->phase; - my $group = $f->group; - my $id = $f->id; - - $phase ||= 0; - $orientation ||= 0; - $score ||= '-'; - $orientation = $orientation >= 0 ? 
'+' : '-'; - - # hack hack hack - my $category = transmute($type); - ($start,$end) = ($end,$start) if $start > $end; - - # group stuff - my $hash = $group; -# my @notes = $f->notes; - my @notes; - my $info = $f->info; - my $group_info; - - if (ref($info)) { - my $class = $info->class; - $id = "$class:$info"; - if ($DSN eq 'elegans') { - $group_info = qq($info); - } - } else { - $hash = md5_hex($group); - $group_info = join "\n",map {qq($_)} @notes; - } - - my ($target,$target_info); - if (($target = $f->target) && $target->can('start')) { - my $start = $target->start; - my $stop = $target->stop; - $target_info = qq(); - } - - if ($category eq 'component') { - my $strt = 1; - my $stp = $stop - $start + 1; - $target_info = qq(); - } - - my $map; - if ($type =~ /Segment|Link|Genomic_canonical|Contig/i) { $map = qq( reference="yes") } else { $map = qq() } - $map .= qq( subparts="yes") if $type =~ /Segment|Link/i; - - ## Need not print feature for map in annotation services - ## The next 2 lines are ucommented at Wash U: - # if (($DSN ne "welegans") && ($c eq "structural")) { - # } else { - - print < - $type - $method - $start - $end - $score - $orientation - $phase -END -; - if ($hash) { - print qq( \n); - print qq( $group_info\n) if $group_info; - print qq( $target_info\n) if $target_info; - print qq( \n); - } - print < -END - ; - # } # End Wash U if statement - } - - print qq(\n); - } - -print < - -END -} - -sub dump_framework { - my $seq = shift; - my $start = $seq->start; - my $stop = $seq->stop; - - my @parts = $seq->contained_features(-type=>['Sequence:Link','Sequence:Genomic_canonical','Sequence:Contig'],-merge=>0); - - print qq(\n); - - for my $part (@parts) { - my ($st,$en) = ($part->start,$part->stop); - my $orientation = $part->strand >= 0 ? 
'+1' : '-1'; - my $length = $part->length; - my $type = $part->type; - my $method = $type->method; - my $description = qq(category="component" reference="yes"); - $description .= qq( subparts="yes") unless $part->source eq 'Genomic_canonical'; - - print < - $part - $method - $st - $en - - - $orientation - - - - $part - - -END - ; - } - print qq(\n); -} - -sub types { - return all_types() unless param('ref') or param('segment'); - - my $type = param('entry_type') || 'Sequence'; - my $summary = param('summary'); - my $url = get_url(); - my @filter = param('type'); - - my @segments = get_segments() or return; - - ok_header(); - - print < - - - -END -; - - foreach (@segments) { - my ($reference,$class,$start,$stop) = @$_; - next unless $reference; - my $seq = get_segment_obj($reference,$class,$start,$stop) or next; - unless ($seq) { #empty section - print qq(\n); - print qq(\n); - next; - } - - my $s = $seq->start; - my $e = $seq->stop; - - # use absolute coordinates -- people expect it - my $name = $seq->refseq; - - print qq(\n); - - my @args = (-enumerate=>1); - push @args,(-types=>\@filter) if @filter; - my %histogram = $seq->types(@args); - foreach (keys %histogram) { - my ($method,$source) = split ':'; - my $count = $histogram{$_}; - my $category = transmute($_); - print qq(\t$count\n) - unless $EXCLUDE{$_}; - } - print qq(\n); - } -print < - -END -} - -# list of all the types -sub all_types { - my @methods = $DB->types; - - ok_header(); - my $url = get_url(); - print < - - - - -END - ; - - for my $id (@methods) { - next if $EXCLUDE{$id}; - my $category = transmute($id); - my $method = $id->method; - my $source = $id->source; - print qq(\t\n); - } - - print < - - -END - ; - -} - -# Big time kludge -- just outputs the prebuilt stylesheet in this -# directory. Used primarily for testing. 
-sub stylesheet { - ok_header(); - open my $STYLE, '<', "style.xml" or die "Could not read file 'style.xml': $!\n"; - while(<$STYLE>) { - print $_; - } - close $STYLE; -} - -# really, really bad shit -# calculate type and category from acedb type and method -sub transmute { - my $type = shift; - - # look in $TYPE2CATEGORY first to see if we have an exact match - my $category = $TYPE2CATEGORY{$type}; - return $category if $category; - - # otherwise do a fuzzy match using the values of %TYPEOBJECTS - for my $typeobj (values %TYPEOBJECTS) { - warn "comparing $typeobj to $type"; - - if ($typeobj->match($type)) { - $category = $TYPE2CATEGORY{$typeobj}; # fetch category for this object - $TYPE2CATEGORY{$type} = $category; # remember this match for later - return $category; - } - } - return 'miscellaneous'; # no success -} - -# ----------------------------------------------------------------- -sub get_url { - my $url = url(-path=>1, -query=>1); - $url =~ tr/&/\;/; - return $url; -} - -# ----------------------------------------------------------------- -sub error_header { - my ($message,$code) = @_; - $code ||= 500; -# $code = "$code $ERRCODES{$code}"; - print header(-type =>'text/plain', - -X_DAS_Version => $VERSION, - -X_DAS_Status => $code, - ) unless $HEADER++; - return if request_method() eq 'HEAD'; - print $message; -} - -sub ok_header { - print header(-type =>'text/plain', - -X_DAS_Version => $VERSION, - -X_DAS_Status => "200 OK", - ) unless $HEADER++; -} - -# phony dtd -sub dtd { - ok_header(); - print < -DTD -} - -# ----------------------------------------------------------------- -sub get_segments { - # extended segment argument - my @segments; - foreach (param('segment')) { - my ($ref,$start,$stop) = /^(\S+?)(?::(\d+),(\d+))?$/; - push @segments,[$ref,$start,$stop]; - } - push @segments,[scalar param('ref'),scalar param('start'),scalar param('stop')] if param('ref'); - return unless @segments; - - foreach (@segments){ - my ($reference,$start,$stop) = @$_; - my $class = param('entry_type') || 'Sequence'; - my $name = $reference; - - if ($reference =~ /^(\w+):(\S+)$/) { - $class = $1; - $name = $2; - } - my @values = ($name,$class,$start,$stop); - $_ = \@values; - } - - return wantarray ? 
@segments : \@segments; -} - -# ----------------------------------------------------------------- -sub get_segment_obj { - my ($reference,$class,$start,$stop) = @_; - my @args = (-name=>$reference); - push @args,(-class=>$class) if defined $class; - push @args,(-start=>$start) if defined $start; - push @args,(-stop=>$stop) if defined $stop; - - my $segment = $DB->segment(@args) or return; - return $segment; -} - - -# ----------------------------------------------------------------- -sub make_categories { - my @filter; - for my $category (@_) { - my $c = lc $category; - push @filter,@{$CATEGORIES{$c}} if $CATEGORIES{$c}; - push @filter,$category unless $CATEGORIES{$c}; - } - return @filter; -} diff --git a/bin/bp_fast_load_gff b/bin/bp_fast_load_gff deleted file mode 100644 index 5445d9123..000000000 --- a/bin/bp_fast_load_gff +++ /dev/null @@ -1,497 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -# use lib './blib/lib'; -use DBI; -use IO::File; -use Getopt::Long; -use Bio::DB::GFF::Util::Binning 'bin'; -use Bio::DB::GFF::Adaptor::dbi::mysqlopt; - -use constant MYSQL => 'mysql'; - -use constant FDATA => 'fdata'; -use constant FTYPE => 'ftype'; -use constant FGROUP => 'fgroup'; -use constant FDNA => 'fdna'; -use constant FATTRIBUTE => 'fattribute'; -use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature'; - -my $DO_FAST = eval "use POSIX 'WNOHANG'; 1;"; - -=head1 NAME - -bp_fast_load_gff.pl - Fast-load a Bio::DB::GFF database from GFF files. - -=head1 SYNOPSIS - - % bp_fast_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ... - -=head1 DESCRIPTION - -This script loads a Bio::DB::GFF database with the features contained -in a list of GFF files and/or FASTA sequence files. You must use the -exact variant of GFF described in L. Various -command-line options allow you to control which database to load and -whether to allow an existing database to be overwritten. - -This script is similar to load_gff.pl, but is much faster. However, -it is hard-coded to use MySQL and probably only works on Unix -platforms due to its reliance on pipes. See L for an -incremental loader that works with all databases supported by -Bio::DB::GFF, and L for a fast MySQL loader that -supports all platforms. - -=head2 NOTES - -If the filename is given as "-" then the input is taken from -standard input. Compressed files (.gz, .Z, .bz2) are automatically -uncompressed. - -FASTA format files are distinguished from GFF files by their filename -extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their -uppercase variants are treated as FASTA files. Everything else is -treated as a GFF file. If you wish to load -fasta files from STDIN, -then use the -f command-line swith with an argument of '-', as in - - gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f - - -The nature of the load requires that the database be on the local -machine and that the indicated user have the "file" privilege to load -the tables and have enough room in /usr/tmp (or whatever is specified -by the \$TMPDIR environment variable), to hold the tables transiently. -If your MySQL is version 3.22.6 and was compiled using the "load local -file" option, then you may be able to load remote databases with local -data using the --local option. - -About maxfeature: the default value is 100,000,000 bases. If you have -features that are close to or greater that 100Mb in length, then the -value of maxfeature should be increased to 1,000,000,000. This value -must be a power of 10. 
- -If the list of GFF or fasta files exceeds the kernel limit for the -maximum number of command-line arguments, use the ---long_list /path/to/files option. - -The adaptor used is dbi::mysqlopt. There is currently no way to -change this. - -=head1 COMMAND-LINE OPTIONS - -Command-line options can be abbreviated to single-letter options. -e.g. -d instead of --database. - - --database Mysql database name - --create Reinitialize/create data tables without asking - --local Try to load a remote database using local data. - --user Username to log in as - --fasta File or directory containing fasta files to load - --password Password to use for authentication - --long_list Directory containing a very large number of - GFF and/or FASTA files - --maxfeature Set the value of the maximum feature size (default 100Mb; must be a power of 10) - --group A list of one or more tag names (comma or space separated) - to be used for grouping in the 9th column. - --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF) - --summary Generate summary statistics for drawing coverage histograms. - This can be run on a previously loaded database or during - the load. - --Temporary Location of a writable scratch directory - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein, lstein@cshl.org - -Copyright (c) 2002 Cold Spring Harbor Laboratory - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. See DISCLAIMER.txt for -disclaimers of warranty. - -=cut - -package Bio::DB::GFF::Adaptor::faux; - -use Bio::DB::GFF::Adaptor::dbi::mysqlopt; -use vars '@ISA'; -@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt'; - -sub insert_sequence { - my $self = shift; - my ($id,$offset,$seq) = @_; - print join "\t",$id,$offset,$seq,"\n"; -} - -package main; - -eval "use Time::HiRes"; undef $@; -my $timer = defined &Time::HiRes::time; - -my ($DSN,$CREATE,$USER,$PASSWORD,$FASTA,$FAILED,$LOCAL,%PID,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR,$SUMMARY_STATS); - -if ($DO_FAST) { - $SIG{CHLD} = sub { - while ((my $child = waitpid(-1,&WNOHANG)) > 0) { - delete $PID{$child} or next; - $FAILED++ if $? != 0; - } - } -}; - -$SIG{INT} = $SIG{TERM} = sub {cleanup(); exit -1}; - -GetOptions ('database:s' => \$DSN, - 'create' => \$CREATE, - 'user:s' => \$USER, - 'local' => \$LOCAL, - 'password:s' => \$PASSWORD, - 'fasta:s' => \$FASTA, - 'group:s' => \$GROUP_TAG, - 'long_list:s' => \$LONG_LIST, - 'maxbin|maxfeature:s' => \$MAX_BIN, - 'gff3_munge' => \$MUNGE, - 'summary' => \$SUMMARY_STATS, - 'Temporary:s' => \$TMPDIR, - ) or (system('pod2text',$0), exit -1); - -$DSN ||= 'test'; -$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes - -my (@args,$AUTH); -if (defined $USER) { - push @args,(-user=>$USER); - $AUTH .= " -u$USER"; -} -if (defined $PASSWORD) { - push @args,(-pass=>$PASSWORD); - $AUTH .= " -p$PASSWORD"; -} -push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG; - -my $db = Bio::DB::GFF->new(-adaptor=>'faux',-dsn => $DSN,@args) - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; - -$db->gff3_name_munging(1) if $MUNGE; - -if ($CREATE) { - $SUMMARY_STATS++; - $MAX_BIN ? 
$db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1); -} - -$MAX_BIN ||= $db->meta('max_bin') || 100_000_000; - -# deal with really long lists of files -if ($LONG_LIST) { - -d $LONG_LIST or die "The --long_list argument must be a directory\n"; - opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!"; - @ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR; - closedir GFFDIR; - - if (defined $FASTA && -d $FASTA) { - opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!"; - push @ARGV, map { "$FASTA\/$_" } readdir FASTA; - closedir FASTA; - } -} - -foreach (@ARGV) { - $_ = "gunzip -c $_ |" if /\.gz$/; - $_ = "uncompress -c $_ |" if /\.Z$/; - $_ = "bunzip2 -c $_ |" if /\.bz2$/; -} -my(@fasta,@gff); -foreach (@ARGV) { - if (/\.(fa|fasta|dna|seq|fast)(?:\.|$)/i) { - push @fasta,$_; - } else { - push @gff,$_; - } -} -@ARGV = @gff; -push @fasta,$FASTA if defined $FASTA; - -# initialize state variables -my $FID = 1; -my $GID = 1; -my $FTYPEID = 1; -my $ATTRIBUTEID = 1; -my %GROUPID = (); -my %FTYPEID = (); -my %ATTRIBUTEID = (); -my %DONE = (); -my $FEATURES = 0; - -load_tables($db->dbh) unless $CREATE; -my ($major,$minor,$sub) = split /\./,$db->dbh->get_info(18); # SQL_DBMS_VER -my $can_disable_indexes = ($major >= 4 and $minor >= 0); - -# open up pipes to the database -my (%FH,%COMMAND); -my $MYSQL = MYSQL; -my $tmpdir = $TMPDIR || $ENV{TMPDIR} || $ENV{TMP} || File::Spec->tmpdir(); --d $tmpdir or die <new($file,'>') or die $_,": $!"; - print STDERR "ok\n"; - $FH{$_}->autoflush; -} - -print STDERR "Fast loading enabled\n" if $DO_FAST; - -my ($count,$gff3,$last,$start,$beginning,$current_file); - -$last = Time::HiRes::time() if $timer; -$beginning = $start = $last; - -# avoid hanging on standalone --fasta load -if (!@ARGV) { - $FH{NULL} = IO::File->new(">$tmpdir/null"); - push @ARGV, "$tmpdir/null"; -} - -while (<>) { - - # reset GFF3 flag if new filehandle - $current_file ||= $ARGV; - unless ($current_file eq $ARGV) { - undef $gff3; - $current_file = $ARGV; - } - - chomp; - my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group); - - # close sequence filehandle if required - if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) { - $FH{FASTA}->close; - delete $FH{FASTA}; - } - - # print to fasta file if the handle is open - if ( defined $FH{FASTA} ) { - $FH{FASTA}->print("$_\n"); - next; - } - - elsif (/^>(\S+)/) { # uh oh, sequence coming - $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n"; - $FH{FASTA}->print("$_\n"); - push @fasta, "$tmpdir/$1\.fa"; - push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa"; - print STDERR "Processing embedded sequence $1\n"; - next; - } - - elsif (/^\#\#\s*group-tags\s+(.+)/) { - $db->preferred_groups(split(/\s+/,$1)); - next; - } - - elsif (/^\#\#\s*gff-version\s+(\d+)/) { - $gff3 = ($1 >= 3); - $db->print_gff3_warning() if $gff3; - next; - } - - elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line - ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = - ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1")); - } - - elsif (/^\#/) { - next; - } - - else { - ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; - } - next unless defined $ref; - $FEATURES++; - - warn "Feature $group is larger than $MAX_BIN. 
You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN; - - $source = '\N' unless defined $source; - $score = '\N' if $score eq '.'; - $strand = '\N' if $strand eq '.'; - $phase = '\N' if $phase eq '.'; - - my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3); - # GFF2/3 transition - $gclass = [$gclass] unless ref $gclass; - $gname = [$gname] unless ref $gname; - - for (my $i=0; $i < @$gname; $i++) { - my $group_class = $gclass->[$i]; - my $group_name = $gname->[$i]; - $group_class ||= '\N'; - $group_name ||= '\N'; - $target_start ||= '\N'; - $target_stop ||= '\N'; - $method ||= '\N'; - $source ||= '\N'; - - my $fid = $FID++; - my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++; - my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++; - - my $bin = bin($start,$stop,$db->min_bin); - $FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" ); - $FH{ FGROUP() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++; - $FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++; - - foreach (@$attributes) { - my ($key,$value) = @$_; - my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++; - $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++; - $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n"); - } - - if ( $FEATURES % 1000 == 0) { - my $now = Time::HiRes::time() if $timer; - my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : ''; - $last = $now; - print STDERR "$fid features parsed$elapsed..."; - print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; - } - } -} - -$FH{FASTA}->close if exists $FH{FASTA}; - -printf STDERR "Feature load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer; -$start = time(); - -for my $fasta (@fasta) { - warn "Loading fasta ",(-d $fasta?"directory":"file"), " $fasta\n"; - my $old = select($FH{FDNA()}); - my $loaded = $db->load_fasta($fasta); - warn "$fasta: $loaded records loaded\n"; - select $old; -} - -printf STDERR "Fasta load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer; -$start = time(); - -my $success = 1; -if ($DO_FAST) { - warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n"; -} - -$_->close foreach values %FH; - -if (!$DO_FAST) { - warn "Loading feature data and analyzing tables. 
You may see database messages here...\n"; - $success &&= system($COMMAND{$_}) == 0 foreach @files; -} - -# wait for children -while (%PID) { - sleep; -} -$success &&= !$FAILED; - -cleanup(); - -printf STDERR "Total parse & load time %5.2fs\n",(Time::HiRes::time() - $beginning) if $timer; - -if ($success) { - print "SUCCESS: $FEATURES features successfully loaded\n"; - exit 0; -} else { - print "FAILURE: Please see standard error for details\n"; - exit -1; -} - -if ($SUMMARY_STATS) { - warn "Building summary statistics for coverage histograms...\n"; - $db->build_summary_statistics; -} - -exit 0; - -sub cleanup { - foreach (@files,@fasta_files_to_be_unlinked) { - unlink "$tmpdir/$_.$$"; - } -} - -# load copies of some of the tables into memory -sub load_tables { - my $dbh = shift; - print STDERR "loading normalized group, type and attribute information..."; - $FID = 1 + get_max_id($dbh,'fdata','fid'); - $GID = 1 + get_max_id($dbh,'fgroup','gid'); - $FTYPEID = 1 + get_max_id($dbh,'ftype','ftypeid'); - $ATTRIBUTEID = 1 + get_max_id($dbh,'fattribute','fattribute_id'); - get_ids($dbh,\%DONE,\%GROUPID,'fgroup','gid','gclass','gname'); - get_ids($dbh,\%DONE,\%FTYPEID,'ftype','ftypeid','fsource','fmethod'); - get_ids($dbh,\%DONE,\%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name'); - print STDERR "ok\n"; -} - -sub get_max_id { - my $dbh = shift; - my ($table,$id) = @_; - my $sql = "select max($id) from $table"; - my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr; - $result->[0]; -} - -sub get_ids { - my $dbh = shift; - my ($done,$idhash,$table,$id,@columns) = @_; - my $columns = join ',',$id,@columns; - my $sql = "select $columns from $table"; - my $sth = $dbh->prepare($sql) or die $dbh->errstr; - $sth->execute or die $dbh->errstr; - while (my($id,@cols) = $sth->fetchrow_array) { - my $key = lc join $;,@cols; - $idhash->{$key} = $id; - $done->{$table,$id}++; - } -} - -__END__ diff --git a/bin/bp_genbank2gff b/bin/bp_genbank2gff deleted file mode 100644 index 2e1144630..000000000 --- a/bin/bp_genbank2gff +++ /dev/null @@ -1,310 +0,0 @@ -#!/usr/bin/perl - -use lib '.'; - -use strict; -use warnings; -use Bio::DB::GFF; -use Getopt::Long; - -=head1 NAME - -bp_genbank2gff.pl - Load a Bio::DB::GFF database from GENBANK files. - -=head1 SYNOPSIS - - % bp_genbank2gff.pl -d genbank -f localfile.gb - % bp_genbank2gff.pl -d genbank --accession AP003256 - % bp_genbank2gff.pl --accession AP003256 --stdout - -=head1 DESCRIPTION - -This script loads a Bio::DB::GFF database with the features contained -in a either a local genbank file or an accession that is fetched from -genbank. Various command-line options allow you to control which -database to load and whether to allow an existing database to be -overwritten. - -The database must already have been created and the current user must -have appropriate INSERT and UPDATE privileges. The --create option -will initialize a new database with the appropriate schema, deleting -any tables that were already there. - -=head1 COMMAND-LINE OPTIONS - -Command-line options can be abbreviated to single-letter options. -e.g. -d instead of --database. 
- - --create Force creation and initialization of database - --dsn Data source (default dbi:mysql:test) - --user Username for mysql authentication - --pass Password for mysql authentication - --proxy Proxy server to use for remote access - --stdout direct output to STDOUT - --adaptor adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle) --viral the genome you are loading is viral (changes tag - choices) - --source source field for features ['genbank'] - EITHER --file Arguments that follow are Genbank/EMBL file names - OR --gb_folder What follows is a folder full of gb files to process OR --accession Arguments that follow are genbank accession numbers - (not gi!) - OR --acc_file Accession numbers (not gi!) in a file (one per line, no punc.) - OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one - per line) - - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Scott Cain, cain@cshl.org - -Copyright (c) 2003 Cold Spring Harbor Laboratory - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. See DISCLAIMER.txt for -disclaimers of warranty. - -=cut - -package Bio::DB::GFF::Adaptor::biofetch_to_stdout; -use CGI 'escape'; -use Bio::DB::GFF::Util::Rearrange; -use Bio::DB::GFF::Adaptor::biofetch; -use vars '@ISA'; -@ISA = 'Bio::DB::GFF::Adaptor::biofetch'; - -sub load_gff_line { - my ($self,$options) = @_; - # synthesize GFF3-compatible line - my @attributes; - if (my $id = $options->{gname}) { - my $parent = $id; - $parent =~ s/\..\d+$// if $options->{method} =~ /^(mRNA|transcript|exon|gene)$/; - push @attributes,"Parent=".escape($parent) if $options->{method} =~ /^(variation|exon|CDS|transcript|mRNA|coding)$/; - push @attributes,"ID=".escape($id) unless $options->{method} =~ /^(exon|CDS)$/; - } - if (my $tstart = $options->{tstart}) { - my $tstop = $options->{tstop}; - my $target = escape($options->{gname}); - push @attributes,"Target=$target+$tstart+$tstop"; - } - my %a; - if (my $attributes = $options->{attributes}) { - for my $a (@$attributes) { - my ($tag,$value) = @$a; - push @{$a{escape($tag)}},escape($value); - } - for my $a (keys %a) { - push @attributes,"$a=".join(',',@{$a{$a}}); - } - } - ${$options}{'score'} = "." unless ${$options}{'score'}; - ${$options}{'strand'} = "." unless ${$options}{'strand'}; - ${$options}{'phase'} = "." unless ${$options}{'phase'}; - my $last_column = join ';',@attributes; - if ($options->{method} eq 'origin') { - print "##sequence-region $options->{gname} $options->{start} $options->{stop}\n"; - } - print join("\t",@{$options}{qw(ref source method start stop score strand phase)},$last_column),"\n"; -} - -sub load_sequence_string { - my $self = shift; - my ($acc,$seq) = @_; - return unless $seq; - $seq =~ s/(.{1,60})/$1\n/g; - print ">$acc\n\L$seq\U\n"; -} - -sub setup_load { - my $self = shift; - print "##gff-version 3\n"; -} - -sub finish_load { } - -1; - -package main; - -my $USAGE = < ] ... -Load a Bio::DB::GFF database from GFF files. 
- - Options: - --create Force creation and initialization of database - --dsn Data source (default dbi:mysql:test) - --user Username for mysql authentication - --pass Password for mysql authentication - --proxy Proxy server to use for remote access - --stdout direct output to STDOUT - --adaptor adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle) - --viral the genome you are loading is viral (changes tag - choices) - --source source field for features ['genbank'] - EITHER --file Arguments that follow are Genbank/EMBL file names - OR --gb_folder What follows is a folder full of gb files to process - OR --accession Arguments that follow are genbank accession numbers - (not gi!) - OR --acc_file Accession numbers (not gi!) in a file (one per line, - no punc.) - OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one - per line) - - -This script loads a Bio::DB::GFF database with the features contained -in a either a local genbank file or an accession that is fetched from -genbank. Various command-line options allow you to control which -database to load and whether to allow an existing database to be -overwritten. - -USAGE -; - -my ($DSN,$ADAPTOR,$CREATE,$USER,$VIRAL,$PASSWORD,$gbFOLDER, - $FASTA,$ACC,$accFILE, $accPIPE, $FILE,$PROXY,$STDOUT,$SOURCE); - - -GetOptions ( - 'dsn:s' => \$DSN, - 'user:s' => \$USER, - 'password:s' => \$PASSWORD, - 'adaptor:s' => \$ADAPTOR, - 'accession' => \$ACC, - 'file' => \$FILE, - 'viral' => \$VIRAL, - 'acc_file' => \$accFILE, - 'acc_pipe' => \$accPIPE, - 'source:s' => \$SOURCE, - 'gb_folder=s' => \$gbFOLDER, - 'proxy:s' => \$PROXY, - 'stdout' => \$STDOUT, - 'create' => \$CREATE) or die $USAGE; - - -die $USAGE unless ($DSN || $STDOUT); # at a minimum we need to have a place to write to! - -# some local defaults -$DSN ||= 'dbi:mysql:test'; -$ADAPTOR ||= $STDOUT ? 'memory' : 'dbi::mysql'; - -# Ensure that biofetch inherits from the "right" adaptor. -# This is a horrible hack and should be fixed. -eval "use Bio::DB::GFF::Adaptor::${ADAPTOR}"; -local @Bio::DB::GFF::Adaptor::biofetch::ISA = "Bio::DB::GFF::Adaptor::${ADAPTOR}"; - -my $biofetch = $STDOUT ? 'biofetch_to_stdout' : 'biofetch'; -my @dsn = $STDOUT ? () : (-dsn => $DSN); - -my @auth; -push @auth,(-user=>$USER) if defined $USER; -push @auth,(-pass=>$PASSWORD) if defined $PASSWORD; -push @auth,(-proxy=>$PROXY) if defined $PROXY; - -my %preferred_tags = ( - strain => 10, - organism => 20, - protein_id => 40, - locus_tag => 50, - locus => 60, - gene => 70, - standard_name => 80, - ); -$preferred_tags{'product'} = 90 if $VIRAL; # added this to the default list for viral genomes - # since most functions come from post-translational processing, so the default labels are c**p! 
- -my $db = Bio::DB::GFF->new(-adaptor=>$biofetch, - @dsn, - @auth, - -preferred_tags => \%preferred_tags, - -source=> $SOURCE || 'Genbank') - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; - -if ($CREATE) { - $db->initialize(1); -} - -die "you must specify either an accession to retrieve from\nembl or a local file containing data in embl format\n" if (($FILE || $ACC) && !scalar(@ARGV)); - -if ($ACC) { - while ($_ = shift) { - status(loading => $_); - my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_); - status(done => $result); - } - exit 1; -} - -elsif ($FILE) { - while ($_ = shift) { - status('loading' => $_); - my $result = $db->load_from_file($_); - status (done => $result); - } - exit 1; -} - -elsif ($accFILE){ - my $filename = shift; - die "you must supply a filename after the --accFILE command line flag\n" unless $filename; - die "file $filename does not exist\n" unless (-e $filename && !(-d $filename)); - open my $IN, '<', $filename or die "Could not read file '$filename' for reading accession numbers: $!\n"; - while (my $line = <$IN>){ - chomp $line; - status(loading => $line); - my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $line); - status(done => $result); - } - close $IN; - exit 1; -} - -elsif ($gbFOLDER){ - my $dir = $gbFOLDER; - die "folder $dir does not exist\n" unless (-e $dir && -d $dir); - opendir DIR, "$dir" || die "can't open directory $dir for reading: $!\n"; - my @files = readdir DIR; - foreach my $file(@files){ - if (!(-e "$gbFOLDER/$file") || (-d "$gbFOLDER/$file")){ - print STDERR " $gbFOLDER/$file is not a filename! Skipping...\n"; - next - } - my $result = $db->load_from_file("$gbFOLDER/$file"); - print STDERR $result ? "ok\n" : "failed\n"; - } -} elsif ($accPIPE){ - my @accessions = ; - chomp @accessions; - foreach (@accessions){ - status(loading => $_); - my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_); - status(done => $result); - } - exit 1; -} - -else { - my $done; - while ($_ = shift) { - $done = 1; - status(loading => $_); - my $result = $db->load_from_file($_); - status(done => $result); - } - - $done || die "\n\nno source of data provided\n\n"; - exit 1; -} - -sub status { - my ($state,$msg) = @_; - return if $STDOUT; - if ($state eq 'loading') { - print STDERR "Loading $msg..."; - } elsif ($state eq 'done') { - print STDERR $msg ? "ok\n" : "failed\n"; - } -} diff --git a/bin/bp_generate_histogram b/bin/bp_generate_histogram deleted file mode 100644 index ac3976990..000000000 --- a/bin/bp_generate_histogram +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use lib '.','./blib','../../blib/lib'; -use Bio::DB::GFF; -use Getopt::Long; - -my $usage = < Data source (default dbi:mysql:test) - --adaptor Schema adaptor (default dbi::mysqlopt) - --user Username for mysql authentication - --pass Password for mysql authentication - --bin Bin size in base pairs. 
- --aggregator Comma-separated list of aggregators - --sort Sort the resulting list by type and bin - --merge Merge features with same method but different sources -USAGE -; - -my ($DSN,$ADAPTOR,$AGG,$USER,$PASSWORD,$BINSIZE,$SORT,$MERGE); -GetOptions ('dsn:s' => \$DSN, - 'adaptor:s' => \$ADAPTOR, - 'user:s' => \$USER, - 'password:s' => \$PASSWORD, - 'aggregators:s' => \$AGG, - 'bin:i' => \$BINSIZE, - 'sort' => \$SORT, - 'merge' => \$MERGE, - ) or die $usage; - -my @types = @ARGV or die $usage; - -# some local defaults -$DSN ||= 'dbi:mysql:test'; -$ADAPTOR ||= 'dbi::mysqlopt'; -$BINSIZE ||= 1_000_000; # 1 megabase bins - -my @options; -push @options,(-user=>$USER) if defined $USER; -push @options,(-pass=>$PASSWORD) if defined $PASSWORD; -push @options,(-aggregator=>[split /\s+/,$AGG]) if defined $AGG; - -my $db = Bio::DB::GFF->new(-adaptor=>$ADAPTOR,-dsn => $DSN,@options) - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; - -my @features = $db->features(-binsize=>$BINSIZE,-types=>\@types); - -if ($MERGE) { - my %MERGE; - for my $f (@features) { - my $name = $f->name; - my $class = $name->class; - $name =~ s/^(.+:.+):.+$/$1/; - $f->group(Bio::DB::GFF::Featname->new($class,$name)); - my $source = $f->source; - $source =~ s/:.+$//; - $f->source($source); - if (my $already_there = $MERGE{$f->source,$f->abs_ref,$f->abs_start}) { - $already_there->score($already_there->score + $f->score); - } else { - $MERGE{$f->source,$f->abs_ref,$f->abs_start} = $f; - } - } - @features = values %MERGE; -} - -# sort features by type, ref and start if requested -if ($SORT) { - @features = sort { - $a->type cmp $b->type - || $a->abs_ref cmp $b->abs_ref - || $a->start <=> $b->start - } - @features; -} - -for my $f (@features) { - print $f->gff_string,"\n"; -} - - -__END__ - -=head1 NAME - -bp_generate_histogram.pl -- Generate a histogram of Bio::DB::GFF features - -=head1 SYNOPSIS - - bp_generate_histogram.pl -d gadfly variation gene:curated - -=head1 DESCRIPTION - -Use this utility to generate feature density histograms from -Bio::DB::GFF databases. The result is a GFF data file that is -suitable for loading with load_gff.pl. - -=head2 OPTIONS - -The following options are recognized: - - Option Description - ------ ----------- - - --dsn Data source (default dbi:mysql:test) - --adaptor Schema adaptor (default dbi::mysqlopt) - --user Username for mysql authentication - --pass Password for mysql authentication - --aggregator Comma-separated list of aggregators - -=head1 BUGS - -Please report them. - -=head1 SEE ALSO - -L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE - -Copyright (c) 2001 Cold Spring Harbor Laboratory - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. See DISCLAIMER.txt for -disclaimers of warranty. - -=cut - diff --git a/bin/bp_load_gff b/bin/bp_load_gff deleted file mode 100644 index 947093cb2..000000000 --- a/bin/bp_load_gff +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use lib '../blib/lib'; -use Bio::DB::GFF; -use Getopt::Long; - -=head1 NAME - -bp_load_gff.pl - Load a Bio::DB::GFF database from GFF files. - -=head1 SYNOPSIS - - % bp_load_gff.pl -d testdb -u user -p pw - --dsn 'dbi:mysql:database=dmel_r5_1;host=myhost;port=myport' - dna1.fa dna2.fa features1.gff features2.gff ... - -=head1 DESCRIPTION - -This script loads a Bio::DB::GFF database with the features contained -in a list of GFF files and/or FASTA sequence files. You must use the -exact variant of GFF described in L. 
Various -command-line options allow you to control which database to load and -whether to allow an existing database to be overwritten. - -This script uses the Bio::DB::GFF interface, and so works with all -database adaptors currently supported by that module (MySQL, Oracle, -PostgreSQL soon). However, it is slow. For faster loading, see the -MySQL-specific L and L scripts. - -=head2 NOTES - -If the filename is given as "-" then the input is taken from standard -input. Compressed files (.gz, .Z, .bz2) are automatically -uncompressed. - -FASTA format files are distinguished from GFF files by their filename -extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their -uppercase variants are treated as FASTA files. Everything else is -treated as a GFF file. If you wish to load -fasta files from STDIN, -then use the -f command-line swith with an argument of '-', as in - - gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f - - -On the first load of a database, you will see a number of "unknown -table" errors. This is normal. - -About maxfeature: the default value is 100,000,000 bases. If you have -features that are close to or greater that 100Mb in length, then the -value of maxfeature should be increased to 1,000,000,000, or another -power of 10. - -=head1 COMMAND-LINE OPTIONS - -Command-line options can be abbreviated to single-letter options. -e.g. -d instead of --database. - - --dsn Data source (default dbi:mysql:test) - --adaptor Schema adaptor (default dbi::mysqlopt) - --user Username for mysql authentication - --pass Password for mysql authentication - --fasta Fasta file or directory containing fasta files for the DNA - --create Force creation and initialization of database - --maxfeature Set the value of the maximum feature size (default 100 Mb; must be a power of 10) - --group A list of one or more tag names (comma or space separated) - to be used for grouping in the 9th column. - --upgrade Upgrade existing database to current schema - --gff3_munge Activate GFF3 name munging (see Bio::DB::GFF) - --quiet No progress reports - --summary Generate summary statistics for drawing coverage histograms. - This can be run on a previously loaded database or during - the load. - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein, lstein@cshl.org - -Copyright (c) 2002 Cold Spring Harbor Laboratory - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. See DISCLAIMER.txt for -disclaimers of warranty. 
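As a worked example of the options listed above (a sketch only; the database name, group tag, and file names are placeholders in the style of the SYNOPSIS), a fresh load that raises the maximum feature size, groups on a custom tag, and builds the coverage summary might be invoked as:

  % bp_load_gff.pl --create --dsn dbi:mysql:dmel_r5_1 \
        --maxfeature 1000000000 --group Gene --summary \
        dna1.fa features1.gff features2.gff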
- -=cut - -my ($DSN,$ADAPTOR,$CREATE,$USER,$PASSWORD,$FASTA,$UPGRADE,$MAX_BIN,$GROUP_TAG,$MUNGE,$QUIET,$SUMMARY_STATS); - -GetOptions ('dsn:s' => \$DSN, - 'adaptor:s' => \$ADAPTOR, - 'u|user:s' => \$USER, - 'p|password:s' => \$PASSWORD, - 'fasta:s' => \$FASTA, - 'upgrade' => \$UPGRADE, - 'maxbin|maxfeature:s' => \$MAX_BIN, - 'group:s' => \$GROUP_TAG, - 'gff3_munge' => \$MUNGE, - 'quiet' => \$QUIET, - 'summary' => \$SUMMARY_STATS, - 'create' => \$CREATE) or (system('pod2text',$0), exit -1); - -# some local defaults -$DSN ||= 'dbi:mysql:test'; -$ADAPTOR ||= 'dbi::mysqlopt'; -$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes - -my @args; -push @args,(-user=>$USER) if defined $USER; -push @args,(-pass=>$PASSWORD) if defined $PASSWORD; -push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG; -push @args,(-create=>1) if $CREATE; -push @args,(-write=>1); - -my $db = Bio::DB::GFF->new(-adaptor=>$ADAPTOR,-dsn => $DSN,@args) - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; - -$db->gff3_name_munging(1) if $MUNGE; - -if ($CREATE) { - $SUMMARY_STATS++; - $MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : - $db->initialize(1); -} elsif ($UPGRADE) { - warn qq(expect to see several "table already exists" messages\n); - $db->initialize(0); - my $dbi = $db->dbh; # get the raw database handle - my ($count) = $dbi->selectrow_array('SELECT COUNT(*) FROM fnote'); - if (defined($count) && $count > 0) { - warn qq(fnote table detected. Translating into fattribute table. This may take a while.\n); - $dbi->do("INSERT INTO fattribute VALUES (1,'Note')") or die "failed: ",$dbi->errstr; - $dbi->do("INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) SELECT fnote.fid,1,fnote FROM fnote") or die "failed: ",$dbi->errstr; - warn qq(Schema successfully upgraded. You might want to drop the fnote table when you're sure everything's working.\n); - } -} - -my (@gff,@fasta); -foreach (@ARGV) { - if (/\.(fa|fasta|dna|seq|fast)$/i) { - push @fasta,$_; - } else { - push @gff,$_; - } -} - -for my $file (@gff) { - warn "$file: loading...\n"; - my $loaded = $db->load_gff($file,!$QUIET); - warn "$file: $loaded records loaded\n"; -} - -unshift @fasta,$FASTA if defined $FASTA; - -for my $file (@fasta) { - warn "Loading fasta ",(-d $file?"directory":"file"), " $file\n"; - my $loaded = $db->load_fasta($file,!$QUIET); - warn "$file: $loaded records loaded\n"; -} - -if ($SUMMARY_STATS) { - warn "Building summary statistics for coverage histograms...\n"; - $db->build_summary_statistics; -} diff --git a/bin/bp_meta_gff b/bin/bp_meta_gff deleted file mode 100644 index 974d2d039..000000000 --- a/bin/bp_meta_gff +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use DBI; -use Getopt::Long; -use Bio::DB::GFF; - -=head1 NAME - -bp_meta_gff.pl - Get/set Bio::DB::GFF meta-data - -=head1 SYNOPSIS - - # set the following meta data values - % bp_meta_gff.pl -d testdb tag1=value1 tag2=value2 - - # get the indicated meta data value - % bp_meta_gff.pl -d testdb tag1 tag2 - -=head1 DESCRIPTION - -This script gets or sets metadata in a Bio::DB::GFF database. Not all -adaptors support this operation! To set a series of tags, pass a set -of tag=value pairs to the script. To get the contents of a series of -tags, pass the bare tag names. - -The output from the get operation will be an easily parseable set of -tag=value pairs, one per line. - -=head1 COMMAND-LINE OPTIONS - -Command-line options can be abbreviated to single-letter options. -e.g. 
-d instead of --database. - - --database Mysql database name (default dbi:mysql:test) - --adaptor Mysql adaptor (default dbi::mysqlopt) - --user Username for mysql authentication - --pass Password for mysql authentication - -=head1 SEE ALSO - -L - -=head1 AUTHOR - -Lincoln Stein, lstein@cshl.org - -Copyright (c) 2002 Cold Spring Harbor Laboratory - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. See DISCLAIMER.txt for -disclaimers of warranty. - -=cut - -my ($DSN,$ADAPTOR,$USER,$PASSWORD); - -GetOptions ('database:s' => \$DSN, - 'adaptor:s' => \$ADAPTOR, - 'user:s' => \$USER, - 'password:s' => \$PASSWORD, - ) or (system('pod2text', $0), exit -1); - -$DSN ||= 'dbi:mysql:test'; -$ADAPTOR ||= 'dbi::mysqlopt'; - -my @args; -push @args,(-user=>$USER) if defined $USER; -push @args,(-pass=>$PASSWORD) if defined $PASSWORD; - -my $db = Bio::DB::GFF->new(-adaptor=>$ADAPTOR,-dsn => $DSN,@args) - or die "Can't open database: ",Bio::DB::GFF->error,"\n"; - -for my $pair (@ARGV) { - my ($tag,$value) = split /=/,$pair; - if ($value) { # set operation - $db->meta($tag,$value); - unless ($db->meta($tag) eq $value) { - print STDERR "value for '$tag' not set; perhaps this adaptor does not support meta data?\n"; - } - } else { - print "$tag=",$db->meta($tag),"\n"; - } -} - -__END__ diff --git a/lib/Bio/DB/GFF.pm b/lib/Bio/DB/GFF.pm deleted file mode 100644 index 16af2c483..000000000 --- a/lib/Bio/DB/GFF.pm +++ /dev/null @@ -1,3896 +0,0 @@ - -=head1 NAME - -Bio::DB::GFF -- Storage and retrieval of sequence annotation data - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt', - -dsn => 'dbi:mysql:elegans'); - - # fetch a 1 megabase segment of sequence starting at landmark "ZK909" - my $segment = $db->segment('ZK909', 1 => 1000000); - - # pull out all transcript features - my @transcripts = $segment->features('transcript'); - - # for each transcript, total the length of the introns - my %totals; - for my $t (@transcripts) { - my @introns = $t->Intron; - $totals{$t->name} += $_->length foreach @introns; - } - - # Sort the exons of the first transcript by position - my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon; - - # Get a region 1000 bp upstream of first exon - my $upstream = $exons[0]->subseq(-1000,0); - - # get its DNA - my $dna = $upstream->seq; - - # and get all curated polymorphisms inside it - @polymorphisms = $upstream->contained_features('polymorphism:curated'); - - # get all feature types in the database - my @types = $db->types; - - # count all feature types in the segment - my %type_counts = $segment->types(-enumerate=>1); - - # get an iterator on all curated features of type 'exon' or 'intron' - my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']); - - while (my $s = $iterator->next_seq) { - print $s,"\n"; - } - - # find all transcripts annotated as having function 'kinase' - my $iterator = $db->get_seq_stream(-type=>'transcript', - -attributes=>{Function=>'kinase'}); - while (my $s = $iterator->next_seq) { - print $s,"\n"; - } - -=head1 DESCRIPTION - -Bio::DB::GFF provides fast indexed access to a sequence annotation -database. It supports multiple database types (ACeDB, relational), -and multiple schemas through a system of adaptors and aggregators. 
- -The following operations are supported by this module: - - - retrieving a segment of sequence based on the ID of a landmark - - retrieving the DNA from that segment - - finding all annotations that overlap with the segment - - finding all annotations that are completely contained within the - segment - - retrieving all annotations of a particular type, either within a - segment, or globally - - conversion from absolute to relative coordinates and back again, - using any arbitrary landmark for the relative coordinates - - using a sequence segment to create new segments based on relative - offsets - -The data model used by Bio::DB::GFF is compatible with the GFF flat -file format (L). The module -can load a set of GFF files into the database, and serves objects that -have methods corresponding to GFF fields. - -The objects returned by Bio::DB::GFF are compatible with the -SeqFeatureI interface, allowing their use by the Bio::Graphics and -Bio::DAS modules. - -=head2 Auxiliary Scripts - -The bioperl distribution includes several scripts that make it easier -to work with Bio::DB::GFF databases. They are located in the scripts -directory under a subdirectory named Bio::DB::GFF: - -=over 4 - -=item * - -bp_load_gff.pl - -This script will load a Bio::DB::GFF database from a flat GFF file of -sequence annotations. Only the relational database version of -Bio::DB::GFF is supported. It can be used to create the database from -scratch, as well as to incrementally load new data. - -This script takes a --fasta argument to load raw DNA into the database -as well. However, GFF databases do not require access to the raw DNA -for most of their functionality. - -load_gff.pl also has a --upgrade option, which will perform a -non-destructive upgrade of older schemas to newer ones. - -=item * - -bp_bulk_load_gff.pl - -This script will populate a Bio::DB::GFF database from a flat GFF file -of sequence annotations. Only the MySQL database version of -Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in -order to accelerate loading considerably; however, it can only be used -for the initial load, and not for updates. - -This script takes a --fasta argument to load raw DNA into the database -as well. However, GFF databases do not require access to the raw DNA -for most of their functionality. - -=item * - -bp_fast_load_gff.pl - -This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe -tricks to allow for incremental updates. It only supports the MySQL -database version of Bio::DB::GFF and is guaranteed not to work on -non-Unix platforms. - -Arguments are the same as bp_load_gff.pl - -=item * - -gadfly_to_gff.pl - -This script will convert the GFF-like format used by the Berkeley -Drosophila Sequencing project into a format suitable for use with this -module. - -=item * - -sgd_to_gff.pl - -This script will convert the tab-delimited feature files used by the -Saccharomyces Genome Database into a format suitable for use with this -module. - -=back - -=head2 GFF Fundamentals - -The GFF format is a flat tab-delimited file, each line of which -corresponds to an annotation, or feature. Each line has nine columns -and looks like this: - - Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7" - -The 9 columns are as follows: - -=over 4 - -=item 1. - -reference sequence - -This is the ID of the sequence that is used to establish the -coordinate system of the annotation. In the example above, the -reference sequence is "Chr1". - -=item 2. - -source - -The source of the annotation. 
This field describes how the annotation -was derived. In the example above, the source is "curated" to -indicate that the feature is the result of human curation. The names -and versions of software programs are often used for the source field, -as in "tRNAScan-SE/1.2". - -=item 3. - -method - -The annotation method. This field describes the type of the -annotation, such as "CDS". Together the method and source describe -the annotation type. - -=item 4. - -start position - -The start of the annotation relative to the reference sequence. - -=item 5. - -stop position - -The stop of the annotation relative to the reference sequence. Start -is always less than or equal to stop. - -=item 6. - -score - -For annotations that are associated with a numeric score (for example, -a sequence similarity), this field describes the score. The score -units are completely unspecified, but for sequence similarities, it is -typically percent identity. Annotations that don't have a score can -use "." - -=item 7. - -strand - -For those annotations which are strand-specific, this field is the -strand on which the annotation resides. It is "+" for the forward -strand, "-" for the reverse strand, or "." for annotations that are -not stranded. - -=item 8. - -phase - -For annotations that are linked to proteins, this field describes the -phase of the annotation on the codons. It is a number from 0 to 2, or -"." for features that have no phase. - -=item 9. - -group - -GFF provides a simple way of generating annotation hierarchies ("is -composed of" relationships) by providing a group field. The group -field contains the class and ID of an annotation which is the logical -parent of the current one. In the example given above, the group is -the Transcript named "R119.7". - -The group field is also used to store information about the target of -sequence similarity hits, and miscellaneous notes. See the next -section for a description of how to describe similarity targets. - -The format of the group fields is "Class ID" with a single space (not -a tab) separating the class from the ID. It is VERY IMPORTANT to -follow this format, or grouping will not work properly. - -=back - -The sequences used to establish the coordinate system for annotations -can correspond to sequenced clones, clone fragments, contigs or -super-contigs. Thus, this module can be used throughout the lifecycle -of a sequencing project. - -In addition to a group ID, the GFF format allows annotations to have a -group class. For example, in the ACeDB representation, RNA -interference experiments have a class of "RNAi" and an ID that is -unique among the RNAi experiments. Since not all databases support -this notion, the class is optional in all calls to this module, and -defaults to "Sequence" when not provided. - -Double-quotes are sometimes used in GFF files around components of the -group field. Strictly, this is only necessary if the group name or -class contains whitespace. - -=head2 Making GFF files work with this module - -Some annotations do not need to be individually named. For example, -it is probably not useful to assign a unique name to each ALU repeat -in a vertebrate genome. Others, such as predicted genes, correspond -to named biological objects; you probably want to be able to fetch the -positions of these objects by referring to them by name. - -To accommodate named annotations, the GFF format places the object -class and name in the group field. 
The name identifies the object, -and the class prevents similarly-named objects, for example clones and -sequences, from collding. - -A named object is shown in the following excerpt from a GFF file: - - Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2 - -This object is a predicted transcript named Y95BA.2. In this case, -the group field is used to identify the class and name of the object, -even though no other annotation belongs to that group. - -It now becomes possible to retrieve the region of the genome covered -by transcript Y95B8A.2 using the segment() method: - - $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2'); - -It is not necessary for the annotation's method to correspond to the -object class, although this is commonly the case. - -As explained above, each annotation in a GFF file refers to a -reference sequence. It is important that each reference sequence also -be identified by a line in the GFF file. This allows the Bio::DB::GFF -module to determine the length and class of the reference sequence, -and makes it possible to do relative arithmetic. - -For example, if "Chr1" is used as a reference sequence, then it should -have an entry in the GFF file similar to this one: - - Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1 - -This indicates that the reference sequence named "Chr1" has length -14972282 bp, method "chromosome" and source "assembly". In addition, -as indicated by the group field, Chr1 has class "Sequence" and name -"Chr1". - -The object class "Sequence" is used by default when the class is not -specified in the segment() call. This allows you to use a shortcut -form of the segment() method: - - $segment = $db->segment('Chr1'); # whole chromosome - $segment = $db->segment('Chr1',1=>1000); # first 1000 bp - -For your convenience, if, during loading a GFF file, Bio::DB::GFF -encounters a line like the following: - - ##sequence-region Chr1 1 14972282 - -It will automatically generate the following entry: - - Chr1 reference Component 1 14972282 . + . Sequence Chr1 - -This is sufficient to use Chr1 as a reference point. -The ##sequence-region line is frequently found in the GFF files -distributed by annotation groups. - -=head2 Specifying the group tag - -A frequent problem with GFF files is the problem distinguishing -which of the several tag/value pairs in the 9th column is the grouping -pair. Ordinarily the first tag will be used for grouping, but some -GFF manipulating tools do not preserve the order of attributes. To -eliminate this ambiguity, this module provides two ways of explicitly -specifying which tag to group on: - -=over 4 - -=item * - -Using -preferred_groups - -When you create a Bio::DB::GFF object, pass it a -preferred_groups=E -argument. This specifies a tag that will be used for grouping. You -can pass an array reference to specify a list of such tags. - -=item * - -In the GFF header - -The GFF file itself can specify which tags are to be used for -grouping. Insert a comment like the following: - - ##group-tags Accession Locus - -This says to use the Accession tag for grouping. If it is not -available, use the Locus tag. If neither tag is available, use the -first pair to appear. - -=back - -These options only apply when B a GFF file into the database, -and have no effect on existing databases. - -The group-tags comment in the GFF file will *override* the preferred -groups set when you create the Bio::DB::GFF object. 
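As a sketch of the two mechanisms just described (the tag names, database name, and file name are placeholders), a database handle that prefers the Accession tag and falls back to Locus when grouping GFF2 attributes could be created and then used for loading like this; a ##group-tags comment inside the file itself would still take precedence:

  my $db = Bio::DB::GFF->new(-adaptor          => 'dbi::mysql',
                             -dsn              => 'dbi:mysql:elegans42',
                             -preferred_groups => ['Accession','Locus']);
  $db->load_gff('my_features.gff');  # a ##group-tags line in the file overrides -preferred_groups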
- -For backward compatibility, the tags Sequence and Transcript are -always treated as grouping tags unless preferred_tags are specified. -The "Target" tag is always used for grouping regardless of the -preferred_groups() setting, and the tags "tstart", "tend" and "Note" -cannot be used for grouping. These are historical artefacts coming -from various interpretations of GFF2, and cannot be changed. - -=head2 Sequence alignments - -There are two cases in which an annotation indicates the relationship -between two sequences. The first case is a similarity hit, where the -annotation indicates an alignment. The second case is a map assembly, -in which the annotation indicates that a portion of a larger sequence -is built up from one or more smaller ones. - -Both cases are indicated by using the B tag in the group -field. For example, a typical similarity hit will look like this: - - Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544 - -The group field contains the Target tag, followed by an identifier for -the biological object referred to. The GFF format uses the notation -I:I for the biological object, and even though this is -stylistically inconsistent, that's the way it's done. The object -identifier is followed by two integers indicating the start and stop -of the alignment on the target sequence. - -Unlike the main start and stop columns, it is possible for the target -start to be greater than the target end. The previous example -indicates that the the section of Chr1 from 76,953 to 77,108 aligns to -the protein SW:ABL_DROME starting at position 493 and extending to -position 544. - -A similar notation is used for sequence assembly information as shown -in this example: - - Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826 - LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742 - -This indicates that the region between bases 10922906 and 11177731 of -Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region -of LINK_H0601 between 32386 and 64122 is, in turn, composed of the -bases 5 to 31742 of cosmid F49B2. - -=head2 Attributes - -While not intended to serve as a general-purpose sequence database -(see bioperl-db for that), GFF allows you to tag features with -arbitrary attributes. Attributes appear in the Group field following -the initial class/name pair. For example: - - Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3 - -This line tags the feature named Transcript Y95B8A.2 as being "Gene" -named sma-3 and having the Alias "sma3". Features having these -attributes can be looked up using the fetch_feature_by_attribute() method. - -Two attributes have special meaning: "Note" is for backward -compatibility and is used for unstructured text remarks. "Alias" is -considered as a synonym for the feature name and will be consulted -when looking up a feature by its name. - -=head2 Adaptors and Aggregators - -This module uses a system of adaptors and aggregators in order to make -it adaptable to use with a variety of databases. - -=over 4 - -=item * - -Adaptors - -The core of the module handles the user API, annotation coordinate -arithmetic, and other common issues. The details of fetching -information from databases is handled by an adaptor, which is -specified during Bio::DB::GFF construction. The adaptor encapsulates -database-specific information such as the schema, user authentication -and access methods. 
- -There are currently five adaptors recommended for general use: - - Adaptor Name Description - ------------ ----------- - - memory A simple in-memory database suitable for testing - and small data sets. - - berkeleydb An indexed file database based on the DB_File module, - suitable for medium-sized read-only data sets. - - dbi::mysql An interface to a schema implemented in the Mysql - relational database management system. - - dbi::oracle An interface to a schema implemented in the Oracle - relational database management system. - - dbi::pg An interface to a schema implemented in the PostgreSQL - relational database management system. - -Check the Bio/DB/GFF/Adaptor directory and subdirectories for other, -more specialized adaptors, as well as experimental ones. - -=item * - -Aggregators - -The GFF format uses a "group" field to indicate aggregation properties -of individual features. For example, a set of exons and introns may -share a common transcript group, and multiple transcripts may share -the same gene group. - -Aggregators are small modules that use the group information to -rebuild the hierarchy. When a Bio::DB::GFF object is created, you -indicate that it use a set of one or more aggregators. Each -aggregator provides a new composite annotation type. Before the -database query is generated each aggregator is called to -"disaggregate" its annotation type into list of component types -contained in the database. After the query is generated, each -aggregator is called again in order to build composite annotations -from the returned components. - -For example, during disaggregation, the standard -"processed_transcript" aggregator generates a list of component -feature types including "UTR", "CDS", and "polyA_site". Later, it -aggregates these features into a set of annotations of type -"processed_transcript". - -During aggregation, the list of aggregators is called in reverse -order. This allows aggregators to collaborate to create multi-level -structures: the transcript aggregator assembles transcripts from -introns and exons; the gene aggregator then assembles genes from sets -of transcripts. - -Three default aggregators are provided: - - transcript assembles transcripts from features of type - exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA - clone assembles clones from Clone_left_end, Clone_right_end - and Sequence features. - alignment assembles gapped alignments from features of type - "similarity". - -In addition, this module provides the optional "wormbase_gene" -aggregator, which accommodates the WormBase representation of genes. -This aggregator aggregates features of method "exon", "CDS", "5'UTR", -"3'UTR", "polyA" and "TSS" into a single object. It also expects to -find a single feature of type "Sequence" that spans the entire gene. - -The existing aggregators are easily customized. - -Note that aggregation will not occur unless you specifically request -the aggregation type. For example, this call: - - @features = $segment->features('alignment'); - -will generate an array of aggregated alignment features. However, -this call: - - @features = $segment->features(); - -will return a list of unaggregated similarity segments. - -For more informnation, see the manual pages for -Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone, -etc. - -=back - -=head2 Loading GFF3 Files - -This module will accept GFF3 files, as described at -http://song.sourceforge.net/gff3.shtml. However, the implementation -has some limitations. 
- -=over 4 - -=item GFF version string is required - -The GFF file B contain the version comment: - - ##gff-version 3 - -Unless this version string is present at the top of the GFF file, the -loader will attempt to parse the file in GFF2 format, with -less-than-desirable results. - -=item Only one level of nesting allowed - -A major restriction is that Bio::DB::GFF only allows one level of -nesting of features. For nesting, the Target tag will be used -preferentially followed by the ID tag, followed by the Parent tag. -This means that if genes are represented like this: - - XXXX XXXX gene XXXX XXXX XXXX ID=myGene - XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene - XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript - XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript - -Then there will be one group called myGene containing the "gene" -feature and one group called myTranscript containing the mRNA, and two -exons. - -You can work around this restriction to some extent by using the Alias -attribute literally: - - XXXX XXXX gene XXXX XXXX XXXX ID=myGene - XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene;Alias=myGene - XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene - XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene - -This limitation will be corrected in the next version of Bio::DB::GFF. - -=back - -=head1 API - -The following is the API for Bio::DB::GFF. - -=cut - -package Bio::DB::GFF; - -use strict; - -use IO::File; -use File::Glob ':bsd_glob'; -use Bio::DB::GFF::Util::Rearrange; -use Bio::DB::GFF::RelSegment; -use Bio::DB::GFF::Feature; -use Bio::DB::GFF::Aggregator; - -use base qw(Bio::Root::Root Bio::DasI); - -my %valid_range_types = (overlaps => 1, - contains => 1, - contained_in => 1); - -=head1 Querying GFF Databases - -=head2 new - - Title : new - Usage : my $db = Bio::DB::GFF->new(@args); - Function: create a new Bio::DB::GFF object - Returns : new Bio::DB::GFF object - Args : lists of adaptors and aggregators - Status : Public - -These are the arguments: - - -adaptor Name of the adaptor module to use. If none - provided, defaults to "dbi::mysqlopt". - - -aggregator Array reference to a list of aggregators - to apply to the database. If none provided, - defaults to ['processed_transcript','alignment']. - - -preferred_groups When interpreteting the 9th column of a GFF2 file, - the indicated group names will have preference over - other attributes, even if they do not come first in - the list of attributes. This can be a scalar value - or an array reference. - - Any other named argument pairs are passed to - the adaptor for processing. - -The adaptor argument must correspond to a module contained within the -Bio::DB::GFF::Adaptor namespace. For example, the -Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying -'dbi::mysql'. By Perl convention, the adaptors names are lower case -because they are loaded at run time. - -The aggregator array may contain a list of aggregator names, a list of -initialized aggregator objects, or a string in the form -"aggregator_name{subpart1,subpart2,subpart3/main_method}" (the -"/main_method" part is optional, but if present a feature with the -main_method must be present in order for aggregation to occur). 
For -example, if you wish to change the components aggregated by the -transcript aggregator, you could pass it to the GFF constructor this -way: - - my $transcript = - Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr - polyA spliced_leader)]); - - my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment], - -adaptor => 'dbi::mysql', - -dsn => 'dbi:mysql:elegans42'); - -Alternatively, you could create an entirely new transcript aggregator -this way: - - my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}'; - my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment], - -adaptor => 'dbi::mysql', - -dsn => 'dbi:mysql:elegans42'); - -See L for more details. - -The B<-preferred_groups> argument is used to change the default -processing of the 9th column of GFF version 2 files. By default, the -first tag/value pair is used to establish the group class and name. -If you pass -preferred_groups a scalar, the parser will look for a tag -of the indicated type and use it as the group even if it is not first -in the file. If you pass this argument a list of group classes as an -array ref, then the list will establish the precedence for searching. - -The commonly used 'dbi::mysql' adaptor recognizes the following -adaptor-specific arguments: - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' - If a partial name is given, such as "ens0040", the - "dbi:mysql:" prefix will be added automatically. - - -user username for authentication - - -pass the password for authentication - - -refclass landmark Class; defaults to "Sequence" - - -The commonly used 'dbi::mysqlopt' adaptor also recognizes the following -arguments. - - Argument Description - -------- ----------- - - -fasta path to a directory containing FASTA files for the DNA - contained in this database (e.g. "/usr/local/share/fasta") - - -acedb an acedb URL to use when converting features into ACEDB - objects (e.g. sace://localhost:2005) - -=cut - -#' - -sub new { - my $package = shift; - my ($adaptor,$aggregators,$args,$refclass,$preferred_groups); - - if (@_ == 1) { # special case, default to dbi::mysqlopt - $adaptor = 'dbi::mysqlopt'; - $args = {DSN => shift}; - } else { - ($adaptor,$aggregators,$refclass,$preferred_groups,$args) = rearrange([ - [qw(ADAPTOR FACTORY)], - [qw(AGGREGATOR AGGREGATORS)], - 'REFCLASS', - 'PREFERRED_GROUPS' - ],@_); - } - - $adaptor ||= 'dbi::mysqlopt'; - my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E"; - unless ($class->can('new')) { - eval "require $class;1;" or $package->throw("Unable to load $adaptor adaptor: $@"); - } - - # this hack saves the memory adaptor, which loads the GFF file in new() - $args->{PREFERRED_GROUPS} = $preferred_groups if defined $preferred_groups; - - my $self = $class->new($args); - - # handle preferred groups - $self->preferred_groups($preferred_groups) if defined $preferred_groups; - $self->default_class($refclass || 'Sequence'); - - # handle the aggregators. - # aggregators are responsible for creating complex multi-part features - # from the GFF "group" field. If none are provided, then we provide a - # list of the two used in WormBase. - # Each aggregator can be a scalar or a ref. In the former case - # it is treated as a class name to call new() on. In the latter - # the aggreator is treated as a ready made object. - $aggregators = $self->default_aggregators unless defined $aggregators; - my @a = ref($aggregators) eq 'ARRAY' ? 
@$aggregators : $aggregators; - for my $a (@a) { - $self->add_aggregator($a); - } - - # default settings go here..... - $self->automerge(1); # set automerge to true - - $self; -} - - -=head2 types - - Title : types - Usage : $db->types(@args) - Function: return list of feature types in range or database - Returns : a list of Bio::DB::GFF::Typename objects - Args : see below - Status : public - -This routine returns a list of feature types known to the database. -The list can be database-wide or restricted to a region. It is also -possible to find out how many times each feature occurs. - -For range queries, it is usually more convenient to create a -Bio::DB::GFF::Segment object, and then invoke it's types() method. - -Arguments are as follows: - - -ref ID of reference sequence - -class class of reference sequence - -start start of segment - -stop stop of segment - -enumerate if true, count the features - -The returned value will be a list of Bio::DB::GFF::Typename objects, -which if evaluated in a string context will return the feature type in -"method:source" format. This object class also has method() and -source() methods for retrieving the like-named fields. - -If -enumerate is true, then the function returns a hash (not a hash -reference) in which the keys are type names in "method:source" format -and the values are the number of times each feature appears in the -database or segment. - -The argument -end is a synonum for -stop, and -count is a synonym for --enumerate. - -=cut - -sub types { - my $self = shift; - my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([ - [qw(REF REFSEQ)], - qw(START), - [qw(STOP END)], - [qw(ENUMERATE COUNT)], - [qw(CLASS SEQCLASS)], - [qw(TYPE TYPES)], - ],@_); - $types = $self->parse_types($types) if defined $types; - $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types); -} - -=head2 classes - - Title : classes - Usage : $db->classes - Function: return list of landmark classes in database - Returns : a list of classes - Args : none - Status : public - -This routine returns the list of reference classes known to the -database, or empty if classes are not used by the database. Classes -are distinct from types, being essentially qualifiers on the reference -namespaces. - -=cut - -sub classes { - my $self = shift; - return (); -} - -=head2 segment - - Title : segment - Usage : $db->segment(@args); - Function: create a segment object - Returns : segment object(s) - Args : numerous, see below - Status : public - -This method generates a segment object, which is a Perl object -subclassed from Bio::DB::GFF::Segment. The segment can be used to -find overlapping features and the raw DNA. - -When making the segment() call, you specify the ID of a sequence -landmark (e.g. an accession number, a clone or contig), and a -positional range relative to the landmark. If no range is specified, -then the entire extent of the landmark is used to generate the -segment. - -You may also provide the ID of a "reference" sequence, which will set -the coordinate system and orientation used for all features contained -within the segment. The reference sequence can be changed later. If -no reference sequence is provided, then the coordinate system is based -on the landmark. - -Arguments: - - -name ID of the landmark sequence. - - -class Database object class for the landmark sequence. - "Sequence" assumed if not specified. This is - irrelevant for databases which do not recognize - object classes. - - -start Start of the segment relative to landmark. 
Positions - follow standard 1-based sequence rules. If not specified, - defaults to the beginning of the landmark. - - -end Stop of the segment relative to the landmark. If not specified, - defaults to the end of the landmark. - - -stop Same as -end. - - -offset For those who prefer 0-based indexing, the offset specifies the - position of the new segment relative to the start of the landmark. - - -length For those who prefer 0-based indexing, the length specifies the - length of the new segment. - - -refseq Specifies the ID of the reference landmark used to establish the - coordinate system for the newly-created segment. - - -refclass Specifies the class of the reference landmark, for those databases - that distinguish different object classes. Defaults to "Sequence". - - -absolute - Return features in absolute coordinates rather than relative to the - parent segment. - - -nocheck Don't check the database for the coordinates and length of this - feature. Construct a segment using the indicated name as the - reference, a start coordinate of 1, an undefined end coordinate, - and a strand of +1. - - -force Same as -nocheck. - - -seq,-sequence,-sourceseq Aliases for -name. - - -begin,-end Aliases for -start and -stop - - -off,-len Aliases for -offset and -length - - -seqclass Alias for -class - -Here's an example to explain how this works: - - my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql'); - -If successful, $db will now hold the database accessor object. We now -try to fetch the fragment of sequence whose ID is A0000182 and class -is "Accession." - - my $segment = $db->segment(-name=>'A0000182',-class=>'Accession'); - -If successful, $segment now holds the entire segment corresponding to -this accession number. By default, the sequence is used as its own -reference sequence, so its first base will be 1 and its last base will -be the length of the accession. - -Assuming that this sequence belongs to a longer stretch of DNA, say a -contig, we can fetch this information like so: - - my $sourceseq = $segment->sourceseq; - -and find the start and stop on the source like this: - - my $start = $segment->abs_start; - my $stop = $segment->abs_stop; - -If we had another segment, say $s2, which is on the same contiguous -piece of DNA, we can pass that to the refseq() method in order to -establish it as the coordinate reference point: - - $segment->refseq($s2); - -Now calling start() will return the start of the segment relative to -the beginning of $s2, accounting for differences in strandedness: - - my $rel_start = $segment->start; - -IMPORTANT NOTE: This method can be used to return the segment spanned -by an arbitrary named annotation. However, if the annotation appears -at multiple locations on the genome, for example an EST that maps to -multiple locations, then, provided that all locations reside on the -same physical segment, the method will return a segment that spans the -minimum and maximum positions. If the reference sequence occupies -ranges on different physical segments, then it returns them all in an -array context, and raises a "multiple segment exception" exception in -a scalar context. 
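To illustrate the note above (a sketch; the feature name used here is hypothetical), calling segment() in list context on a multiply-mapped feature returns one segment per placement, each reporting its own source sequence and absolute range:

  my @placements = $db->segment(-class => 'Sequence', -name => 'yk1234.5');
  for my $s (@placements) {
      print $s->sourceseq, ': ', $s->abs_start, '..', $s->abs_stop, "\n";
  }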
- -=cut - -#' - -sub segment { - my $self = shift; - my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self, - $self->setup_segment_args(@_)); - foreach (@segments) { - $_->absolute(1) if $self->absolute; - } - - $self->_multiple_return_args(@segments); -} - -sub _multiple_return_args { - my $self = shift; - my @args = @_; - if (@args == 0) { - return; - } elsif (@args == 1) { - return $args[0]; - } elsif (wantarray) { # more than one reference sequence - return @args; - } else { - $self->error($args[0]->name, - " has more than one reference sequence in database. Please call in a list context to retrieve them all."); - $self->throw('multiple segment exception'); - return; - } - -} - -# backward compatibility -- don't use! -# (deliberately undocumented too) -sub abs_segment { - my $self = shift; - return $self->segment($self->setup_segment_args(@_),-absolute=>1); -} - -sub setup_segment_args { - my $self = shift; - return @_ if defined $_[0] && $_[0] =~ /^-/; - return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3; - return (-class=>$_[0],-name=>$_[1]) if @_ == 2; - return (-name=>$_[0]) if @_ == 1; -} - -=head2 features - - Title : features - Usage : $db->features(@args) - Function: get all features, possibly filtered by type - Returns : a list of Bio::DB::GFF::Feature objects - Args : see below - Status : public - -This routine will retrieve features in the database regardless of -position. It can be used to return all features, or a subset based on -their method and source. - -Arguments are as follows: - - -types List of feature types to return. Argument is an array - reference containing strings of the format "method:source" - - -merge Whether to apply aggregators to the generated features. - - -rare Turn on optimizations suitable for a relatively rare feature type, - where it makes more sense to filter by feature type first, - and then by position. - - -attributes A hash reference containing attributes to match. - - -iterator Whether to return an iterator across the features. - - -binsize A true value will create a set of artificial features whose - start and stop positions indicate bins of the given size, and - whose scores are the number of features in the bin. The - class and method of the feature will be set to "bin", - its source to "method:source", and its group to "bin:method:source". - This is a handy way of generating histograms of feature density. - -If -iterator is true, then the method returns a single scalar value -consisting of a Bio::SeqIO object. You can call next_seq() repeatedly -on this object to fetch each of the features in turn. If iterator is -false or absent, then all the features are returned as a list. - -Currently aggregation is disabled when iterating over a series of -features. - -Types are indicated using the nomenclature "method:source". Either of -these fields can be omitted, in which case a wildcard is used for the -missing field. Type names without the colon (e.g. "exon") are -interpreted as the method name and a source wild card. Regular -expressions are allowed in either field, as in: "similarity:BLAST.*". - -The -attributes argument is a hashref containing one or more attributes -to match against: - - -attributes => { Gene => 'abc-1', - Note => 'confirmed' } - -Attribute matching is simple string matching, and multiple attributes -are ANDed together. 
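Combining the arguments described above into a single call (a sketch; the attribute value is only an illustration), curated exons annotated with a matching Note can be streamed rather than returned as a list:

  my $iterator = $db->features(-types      => ['exon:curated'],
                               -attributes => { Note => 'confirmed' },
                               -iterator   => 1);
  while (my $f = $iterator->next_seq) {
      print $f->gff_string, "\n";
  }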
- -=cut - -sub features { - my $self = shift; - my ($types,$automerge,$sparse,$iterator,$refseq,$start,$end,$other); - if (defined $_[0] && - $_[0] =~ /^-/) { - ($types,$automerge,$sparse,$iterator, - $refseq,$start,$end, - $other) = rearrange([ - [qw(TYPE TYPES)], - [qw(MERGE AUTOMERGE)], - [qw(RARE SPARSE)], - 'ITERATOR', - [qw(REFSEQ SEQ_ID)], - 'START', - [qw(STOP END)], - ],@_); - } else { - $types = \@_; - } - - # for whole database retrievals, we probably don't want to automerge! - $automerge = $self->automerge unless defined $automerge; - $other ||= {}; - $self->_features({ - rangetype => $refseq ? 'overlaps' : 'contains', - types => $types, - refseq => $refseq, - start => $start, - stop => $end, - }, - { sparse => $sparse, - automerge => $automerge, - iterator =>$iterator, - %$other, - } - ); -} - -=head2 get_seq_stream - - Title : get_seq_stream - Usage : my $seqio = $self->get_seq_sream(@args) - Function: Performs a query and returns an iterator over it - Returns : a Bio::SeqIO stream capable of producing sequence - Args : As in features() - Status : public - -This routine takes the same arguments as features(), but returns a -Bio::SeqIO::Stream-compliant object. Use it like this: - - $stream = $db->get_seq_stream('exon'); - while (my $exon = $stream->next_seq) { - print $exon,"\n"; - } - -NOTE: This is also called get_feature_stream(), since that's what it -really does. - -=cut - -sub get_seq_stream { - my $self = shift; - my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1) - : (-types=>\@_,-iterator=>1); - $self->features(@args); -} - -*get_feature_stream = \&get_seq_stream; - -=head2 get_feature_by_name - - Title : get_feature_by_name - Usage : $db->get_feature_by_name($class => $name) - Function: fetch features by their name - Returns : a list of Bio::DB::GFF::Feature objects - Args : the class and name of the desired feature - Status : public - -This method can be used to fetch a named feature from the database. -GFF annotations are named using the group class and name fields, so -for features that belong to a group of size one, this method can be -used to retrieve that group (and is equivalent to the segment() -method). Any Alias attributes are also searched for matching names. - -An alternative syntax allows you to search for features by name within -a circumscribed region: - - @f = $db->get_feature_by_name(-class => $class,-name=>$name, - -ref => $sequence_name, - -start => $start, - -end => $end); - -This method may return zero, one, or several Bio::DB::GFF::Feature -objects. - -Aggregation is performed on features as usual. - -NOTE: At various times, this function was called fetch_group(), -fetch_feature(), fetch_feature_by_name() and segments(). These names -are preserved for backward compatibility. - -=cut - -sub get_feature_by_name { - my $self = shift; - my ($gclass,$gname,$automerge,$ref,$start,$end); - if (@_ == 1) { - $gclass = $self->default_class; - $gname = shift; - } else { - ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE', - ['REF','REFSEQ'], - 'START',['STOP','END'] - ],@_); - $gclass ||= $self->default_class; - } - $automerge = $self->automerge unless defined $automerge; - - # we need to refactor this... It's repeated code (see below)... 
- my @aggregators; - if ($automerge) { - for my $a ($self->aggregators) { - push @aggregators,$a if $a->disaggregate([],$self); - } - } - - my %groups; # cache the groups we create to avoid consuming too much unecessary memory - my $features = []; - my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; - my $location = [$ref,$start,$end] if defined $ref; - $self->_feature_by_name($gclass,$gname,$location,$callback); - - warn "aggregating...\n" if $self->debug; - foreach my $a (@aggregators) { # last aggregator gets first shot - $a->aggregate($features,$self) or next; - } - - @$features; -} - -# horrible indecision regarding proper names! -*fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name; -*segments = \&segment; - -=head2 get_feature_by_target - - Title : get_feature_by_target - Usage : $db->get_feature_by_target($class => $name) - Function: fetch features by their similarity target - Returns : a list of Bio::DB::GFF::Feature objects - Args : the class and name of the desired feature - Status : public - -This method can be used to fetch a named feature from the database -based on its similarity hit. - -=cut - -sub get_feature_by_target { - shift->get_feature_by_name(@_); -} - -=head2 get_feature_by_attribute - - Title : get_feature_by_attribute - Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) - Function: fetch segments by combinations of attribute values - Returns : a list of Bio::DB::GFF::Feature objects - Args : the class and name of the desired feature - Status : public - -This method can be used to fetch a set of features from the database. -Attributes are a list of name=Evalue pairs. They will be logically -ANDED together. - -=cut - -sub get_feature_by_attribute { - my $self = shift; - my %attributes = ref($_[0]) ? %{$_[0]} : @_; - - # we need to refactor this... It's repeated code (see above)... - my @aggregators; - if ($self->automerge) { - for my $a ($self->aggregators) { - unshift @aggregators,$a if $a->disaggregate([],$self); - } - } - - my %groups; # cache the groups we create to avoid consuming too much unecessary memory - my $features = []; - my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; - $self->_feature_by_attribute(\%attributes,$callback); - - warn "aggregating...\n" if $self->debug; - foreach my $a (@aggregators) { # last aggregator gets first shot - $a->aggregate($features,$self) or next; - } - - @$features; -} - -# more indecision... -*fetch_feature_by_attribute = \&get_feature_by_attribute; - -=head2 get_feature_by_id - - Title : get_feature_by_id - Usage : $db->get_feature_by_id($id) - Function: fetch segments by feature ID - Returns : a Bio::DB::GFF::Feature object - Args : the feature ID - Status : public - -This method can be used to fetch a feature from the database using its -ID. Not all GFF databases support IDs, so be careful with this. - -=cut - -sub get_feature_by_id { - my $self = shift; - my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; - my %groups; # cache the groups we create to avoid consuming too much unecessary memory - my $features = []; - my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; - $self->_feature_by_id($id,'feature',$callback); - return wantarray ? 
@$features : $features->[0]; -} -*fetch_feature_by_id = \&get_feature_by_id; - -=head2 get_feature_by_gid - - Title : get_feature_by_gid - Usage : $db->get_feature_by_gid($id) - Function: fetch segments by feature ID - Returns : a Bio::DB::GFF::Feature object - Args : the feature ID - Status : public - -This method can be used to fetch a feature from the database using its -group ID. Not all GFF databases support IDs, so be careful with this. - -The group ID is often more interesting than the feature ID, since -groups can be complex objects containing subobjects. - -=cut - -sub get_feature_by_gid { - my $self = shift; - my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; - my %groups; # cache the groups we create to avoid consuming too much unecessary memory - my $features = []; - my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; - $self->_feature_by_id($id,'group',$callback); - return wantarray ? @$features : $features->[0]; -} -*fetch_feature_by_gid = \&get_feature_by_gid; - -=head2 delete_fattribute_to_features - - Title : delete_fattribute_to_features - Usage : $db->delete_fattribute_to_features(@ids_or_features) - Function: delete one or more fattribute_to_features - Returns : count of fattribute_to_features deleted - Args : list of features or feature ids - Status : public - -Pass this method a list of numeric feature ids or a set of features. -It will attempt to remove the fattribute_to_features rows of those features -from the database and return a count of the rows removed. - -NOTE: This method is also called delete_fattribute_to_feature(). Also see -delete_groups() and delete_features(). - -=cut - -*delete_fattribute_to_feature = \&delete_fattribute_to_features; - -sub delete_fattribute_to_features { - my $self = shift; - my @features_or_ids = @_; - my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; - return unless @ids; - $self->_delete_fattribute_to_features(@ids); -} - -=head2 delete_features - - Title : delete_features - Usage : $db->delete_features(@ids_or_features) - Function: delete one or more features - Returns : count of features deleted - Args : list of features or feature ids - Status : public - -Pass this method a list of numeric feature ids or a set of features. -It will attempt to remove the features from the database and return a -count of the features removed. - -NOTE: This method is also called delete_feature(). Also see -delete_groups(). - -=cut - -*delete_feature = \&delete_features; - -sub delete_features { - my $self = shift; - my @features_or_ids = @_; - my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; - return unless @ids; - $self->_delete_features(@ids); -} - -=head2 delete_groups - - Title : delete_groups - Usage : $db->delete_groups(@ids_or_features) - Function: delete one or more feature groups - Returns : count of features deleted - Args : list of features or feature group ids - Status : public - -Pass this method a list of numeric group ids or a set of features. It -will attempt to recursively remove the features and ALL members of -their group from the database. It returns a count of the number of -features (not groups) returned. - -NOTE: This method is also called delete_group(). Also see -delete_features(). - -=cut - -*delete_group = \&delete_groupss; - -sub delete_groups { - my $self = shift; - my @features_or_ids = @_; - my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? 
$_->group_id : $_} @features_or_ids; - return unless @ids; - $self->_delete_groups(@ids); -} - -=head2 delete - - Title : delete - Usage : $db->delete(@args) - Function: delete features - Returns : count of features deleted -- if available - Args : numerous, see below - Status : public - -This method deletes all features that overlap the specified region or -are of a particular type. If no arguments are provided and the -force -argument is true, then deletes ALL features. - -Arguments: - - -name ID of the landmark sequence. - - -ref ID of the landmark sequence (synonym for -name). - - -class Database object class for the landmark sequence. - "Sequence" assumed if not specified. This is - irrelevant for databases which do not recognize - object classes. - - -start Start of the segment relative to landmark. Positions - follow standard 1-based sequence rules. If not specified, - defaults to the beginning of the landmark. - - -end Stop of the segment relative to the landmark. If not specified, - defaults to the end of the landmark. - - -offset Zero-based addressing - - -length Length of region - - -type,-types Either a single scalar type to be deleted, or an - reference to an array of types. - - -force Force operation to be performed even if it would delete - entire feature table. - - -range_type Control the range type of the deletion. One of "overlaps" (default) - "contains" or "contained_in" - -Examples: - - $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats - $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000 - $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3 - -The short form of this call, as described in segment() is also allowed: - - $db->delete("chr3",1=>1000); - $db->delete("chr3"); - -IMPORTANT NOTE: This method only deletes features. It does *NOT* -delete the names of groups that contain the deleted features. Group -IDs will be reused if you later load a feature with the same group -name as one that was previously deleted. - -NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the -result code from the SQL DELETE operation. Some dbd drivers return the -count of rows deleted, while others return 0E0. Caveat emptor. - -=cut - -sub delete { - my $self = shift; - my @args = $self->setup_segment_args(@_); - my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) = - rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET', - 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args); - $offset = 0 unless defined $offset; - $start = $offset+1 unless defined $start; - $end = $start+$length-1 if !defined $end and $length; - $class ||= $self->default_class; - - my $types = $self->parse_types($type); # parse out list of types - - $range_type ||= 'overlaps'; - $self->throw("range type must be one of {". - join(',',keys %valid_range_types). 
- "}\n") - unless $valid_range_types{lc $range_type}; - - - my @segments; - if (defined $name && $name ne '') { - my @args = (-name=>$name,-class=>$class); - push @args,(-start=>$start) if defined $start; - push @args,(-end =>$end) if defined $end; - @segments = $self->segment(@args); - return unless @segments; - } - $self->_delete({segments => \@segments, - types => $types, - range_type => $range_type, - force => $force} - ); -} - -=head2 absolute - - Title : absolute - Usage : $abs = $db->absolute([$abs]); - Function: gets/sets absolute mode - Returns : current setting of absolute mode boolean - Args : new setting for absolute mode boolean - Status : public - -$db-Eabsolute(1) will turn on absolute mode for the entire database. -All segments retrieved will use absolute coordinates by default, -rather than relative coordinates. You can still set them to use -relative coordinates by calling $segment-Eabsolute(0). - -Note that this is not the same as calling abs_segment(); it continues -to allow you to look up groups that are not used directly as reference -sequences. - -=cut - -sub absolute { - my $self = shift; - my $d = $self->{absolute}; - $self->{absolute} = shift if @_; - $d; -} - -=head2 strict_bounds_checking - - Title : strict_bounds_checking - Usage : $flag = $db->strict_bounds_checking([$flag]) - Function: gets/sets strict bounds checking - Returns : current setting of bounds checking flag - Args : new setting for bounds checking flag - Status : public - -This flag enables extra checks for segment requests that go beyond the -ends of their reference sequences. If bounds checking is enabled, -then retrieved segments will be truncated to their physical length, -and their truncated() methods will return true. - -If the flag is off (the default), then the module will return segments -that appear to extend beyond their physical boundaries. Requests for -features beyond the end of the segment will, however, return empty. 
- -=cut - -sub strict_bounds_checking { - my $self = shift; - my $d = $self->{strict}; - $self->{strict} = shift if @_; - $d; -} - -=head2 get_Seq_by_id - - Title : get_Seq_by_id - Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') - Function: Gets a Bio::Seq object by its name - Returns : a Bio::Seq object - Args : the id (as a string) of a sequence - Throws : "id does not exist" exception - -NOTE: Bio::DB::RandomAccessI compliant method - -=cut - -sub get_Seq_by_id { - my $self = shift; - $self->get_feature_by_name(@_); -} - - -=head2 get_Seq_by_accession - - Title : get_Seq_by_accession - Usage : $seq = $db->get_Seq_by_accession('AL12234') - Function: Gets a Bio::Seq object by its accession - Returns : a Bio::Seq object - Args : the id (as a string) of a sequence - Throws : "id does not exist" exception - -NOTE: Bio::DB::RandomAccessI compliant method - -=cut - -sub get_Seq_by_accession { - my $self = shift; - $self->get_feature_by_name(@_); -} - -=head2 get_Seq_by_acc - - Title : get_Seq_by_acc - Usage : $seq = $db->get_Seq_by_acc('X77802'); - Function: Gets a Bio::Seq object by accession number - Returns : A Bio::Seq object - Args : accession number (as a string) - Throws : "acc does not exist" exception - -NOTE: Bio::DB::RandomAccessI compliant method - -=cut - -sub get_Seq_by_acc { - my $self = shift; - $self->get_feature_by_name(@_); -} - -=head2 get_Stream_by_name - - Title : get_Stream_by_name - Usage : $seq = $db->get_Stream_by_name(@ids); - Function: Retrieves a stream of Seq objects given their names - Returns : a Bio::SeqIO stream object - Args : an array of unique ids/accession numbers, or - an array reference - -NOTE: This is also called get_Stream_by_batch() - -=cut - -sub get_Stream_by_name { - my $self = shift; - my @ids = @_; - my $id = ref($ids[0]) ? $ids[0] : \@ids; - Bio::DB::GFF::ID_Iterator->new($self,$id,'name'); -} - -=head2 get_Stream_by_id - - Title : get_Stream_by_id - Usage : $seq = $db->get_Stream_by_id(@ids); - Function: Retrieves a stream of Seq objects given their ids - Returns : a Bio::SeqIO stream object - Args : an array of unique ids/accession numbers, or - an array reference - -NOTE: This is also called get_Stream_by_batch() - -=cut - -sub get_Stream_by_id { - my $self = shift; - my @ids = @_; - my $id = ref($ids[0]) ? $ids[0] : \@ids; - Bio::DB::GFF::ID_Iterator->new($self,$id,'feature'); -} - -=head2 get_Stream_by_batch () - - Title : get_Stream_by_batch - Usage : $seq = $db->get_Stream_by_batch(@ids); - Function: Retrieves a stream of Seq objects given their ids - Returns : a Bio::SeqIO stream object - Args : an array of unique ids/accession numbers, or - an array reference - -NOTE: This is the same as get_Stream_by_id(). - -=cut - -*get_Stream_by_batch = \&get_Stream_by_id; - - -=head2 get_Stream_by_group () - -Bioperl compatibility. - -=cut - -sub get_Stream_by_group { - my $self = shift; - my @ids = @_; - my $id = ref($ids[0]) ? $ids[0] : \@ids; - Bio::DB::GFF::ID_Iterator->new($self,$id,'group'); -} - -=head2 all_seqfeatures - - Title : all_seqfeatures - Usage : @features = $db->all_seqfeatures(@args) - Function: fetch all the features in the database - Returns : an array of features, or an iterator - Args : See below - Status : public - -This is equivalent to calling $db-Efeatures() without any types, and -will return all the features in the database. The -merge and --iterator arguments are recognized, and behave the same as described -for features(). 
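A hedged sketch of the call (assuming $db is an already-opened handle as in the earlier sketch): with -iterator the return value behaves like a Bio::SeqIO stream, so the whole database can be walked without holding every feature in memory at once.

  # assumes $db is an open Bio::DB::GFF handle
  my $iterator = $db->all_seqfeatures(-merge => 0, -iterator => 1);
  while (my $feature = $iterator->next_seq) {
      printf "%s\t%d..%d\n", $feature->type, $feature->start, $feature->stop;
  }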
- -=cut - -sub all_seqfeatures { - my $self = shift; - my ($automerge,$iterator)= rearrange([ - [qw(MERGE AUTOMERGE)], - 'ITERATOR' - ],@_); - my @args; - push @args,(-merge=>$automerge) if defined $automerge; - push @args,(-iterator=>$iterator) if defined $iterator; - $self->features(@args); -} - -=head1 Creating and Loading GFF Databases - -=head2 initialize - - Title : initialize - Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2); - Function: initialize a GFF database - Returns : true if initialization successful - Args : a set of named parameters - Status : Public - -This method can be used to initialize an empty database. It takes the following -named arguments: - - -erase A boolean value. If true the database will be wiped clean if it - already contains data. - -Other named arguments may be recognized by subclasses. They become database -meta values that control various settable options. - -As a shortcut (and for backward compatibility) a single true argument -is the same as initialize(-erase=E1). - -=cut - -sub initialize { - my $self = shift; - - my ($erase,$meta) = rearrange(['ERASE'],@_); - $meta ||= {}; - - # initialize (possibly erasing) - return unless $self->do_initialize($erase); - my @default = $self->default_meta_values; - - # this is an awkward way of uppercasing the - # even-numbered values (necessary for case-insensitive SQL databases) - for (my $i=0; $i<@default; $i++) { - $default[$i] = uc $default[$i] if !($i % 2); - } - - my %values = (@default,%$meta); - foreach (keys %values) { - $self->meta($_ => $values{$_}); - } - 1; -} - - -=head2 load_gff - - Title : load_gff - Usage : $db->load_gff($file|$directory|$filehandle [,$verbose]); - Function: load GFF data into database - Returns : count of records loaded - Args : a directory, a file, a list of files, - or a filehandle - Status : Public - -This method takes a single overloaded argument, which can be any of: - -=over 4 - -=item * - -a scalar corresponding to a GFF file on the system - -A pathname to a local GFF file. Any files ending with the .gz, .Z, or -.bz2 suffixes will be transparently decompressed with the appropriate -command-line utility. - -=item * - -an array reference containing a list of GFF files on the system - -For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] - -=item * - -directory path - -The indicated directory will be searched for all files ending in the -suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. - -=item * - -filehandle - -An open filehandle from which to read the GFF data. Tied filehandles -now work as well. - -=item * - -a pipe expression - -A pipe expression will also work. For example, a GFF file on a remote -web server can be loaded with an expression like this: - - $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |"); - -=back - -The optional second argument, if true, will turn on verbose status -reports that indicate the progress. - -If successful, the method will return the number of GFF lines -successfully loaded. - -NOTE:this method used to be called load(), but has been changed. The -old method name is also recognized. 
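Putting initialize() and load_gff() together, a minimal end-to-end sketch; the adaptor name and the file name are placeholders, not prescriptions from this module.

  use strict;
  use warnings;
  use Bio::DB::GFF;

  my $db = Bio::DB::GFF->new(-adaptor => 'memory');    # placeholder adaptor

  $db->initialize(-erase => 1);                        # wipe any existing data first
  my $lines = $db->load_gff('annotations.gff.gz', 1);  # verbose; .gz handled transparently
  print "loaded $lines GFF lines\n";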
- -=cut - -sub load_gff { - my $self = shift; - my $file_or_directory = shift || '.'; - my $verbose = shift; - - local $self->{__verbose__} = $verbose; - return $self->do_load_gff($file_or_directory) if ref($file_or_directory) - && tied *$file_or_directory; - - my $tied_stdin = tied(*STDIN); - open my $SAVEIN, "<&STDIN" unless $tied_stdin; - local @ARGV = $self->setup_argv($file_or_directory,'gff','gff3') or return; # to play tricks with reader - my $result = $self->do_load_gff('ARGV'); - open STDIN, '<', $SAVEIN unless $tied_stdin; # restore STDIN - return $result; -} - -*load = \&load_gff; - -=head2 load_gff_file - - Title : load_gff_file - Usage : $db->load_gff_file($file [,$verbose]); - Function: load GFF data into database - Returns : count of records loaded - Args : a path to a file - Status : Public - -This is provided as an alternative to load_gff_file. It doesn't munge -STDIN or play tricks with ARGV. - -=cut - -sub load_gff_file { - my $self = shift; - my $file = shift; - my $verbose = shift; - my $fh = IO::File->new($file) or return; - return $self->do_load_gff($fh); -} - -=head2 load_fasta - - Title : load_fasta - Usage : $db->load_fasta($file|$directory|$filehandle); - Function: load FASTA data into database - Returns : count of records loaded - Args : a directory, a file, a list of files, - or a filehandle - Status : Public - -This method takes a single overloaded argument, which can be any of: - -=over 4 - -=item * - -scalar corresponding to a FASTA file on the system - -A pathname to a local FASTA file. Any files ending with the .gz, .Z, or -.bz2 suffixes will be transparently decompressed with the appropriate -command-line utility. - -=item * - -array reference containing a list of FASTA files on the -system - -For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz'] - -=item * - -path to a directory - -The indicated directory will be searched for all files ending in the -suffixes .fa, .fa.gz, .fa.Z or .fa.bz2. - -=item * - -filehandle - -An open filehandle from which to read the FASTA data. - -=item * - -pipe expression - -A pipe expression will also work. For example, a FASTA file on a remote -web server can be loaded with an expression like this: - - $db->load_gff("lynx -dump -source http://stein.cshl.org/fasta_test.fa |"); - -=back - -=cut - -sub load_fasta { - my $self = shift; - my $file_or_directory = shift || '.'; - my $verbose = shift; - - local $self->{__verbose__} = $verbose; - return $self->load_sequence($file_or_directory) if ref($file_or_directory) - && tied *$file_or_directory; - - my $tied = tied(*STDIN); - open my $SAVEIN, "<&STDIN" unless $tied; - local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader - my $result = $self->load_sequence('ARGV'); - open STDIN, '<', $SAVEIN unless $tied; # restore STDIN - return $result; -} - - -=head2 load_fasta_file - - Title : load_fasta_file - Usage : $db->load_fasta_file($file [,$verbose]); - Function: load FASTA data into database - Returns : count of records loaded - Args : a path to a file - Status : Public - -This is provided as an alternative to load_fasta. It doesn't munge -STDIN or play tricks with ARGV. 
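For completeness, the file- and directory-oriented loaders documented above, as a short sketch; the paths are placeholders and $db is assumed to be an initialized handle.

  # assumes $db is an open, initialized Bio::DB::GFF handle
  my $gff_lines = $db->load_gff_file('annotations.gff');  # plain file, no ARGV/STDIN games
  my $seqs      = $db->load_fasta('sequences/');          # picks up *.fa, *.fa.gz, *.fa.Z, *.fa.bz2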
- -=cut - -sub load_fasta_file { - my $self = shift; - my $file = shift; - my $verbose = shift; - my $fh = IO::File->new($file) or return; - return $self->do_load_fasta($fh); -} - - -=head2 load_sequence_string - - Title : load_sequence_string - Usage : $db->load_sequence_string($id,$dna) - Function: load a single DNA entry - Returns : true if successfully loaded - Args : a raw sequence string (DNA, RNA, protein) - Status : Public - -=cut - -sub load_sequence_string { - my $self = shift; - my ($acc,$seq) = @_; - my $offset = 0; - $self->insert_sequence_chunk($acc,\$offset,\$seq) or return; - $self->insert_sequence($acc,$offset,$seq) or return; - 1; -} - -sub setup_argv { - my $self = shift; - my $file_or_directory = shift; - my @suffixes = @_; - no strict 'refs'; # so that we can call fileno() on the argument - - my @argv; - - if (-d $file_or_directory) { - # Because glob() is broken with long file names that contain spaces - $file_or_directory = Win32::GetShortPathName($file_or_directory) - if $^O =~ /^MSWin/i && eval 'use Win32; 1'; - @argv = map { bsd_glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes; - }elsif (my $fd = fileno($file_or_directory)) { - open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN"); - @argv = '-'; - } elsif (ref $file_or_directory) { - @argv = @$file_or_directory; - } else { - @argv = $file_or_directory; - } - - foreach (@argv) { - if (/\.gz$/) { - $_ = "gunzip -c $_ |"; - } elsif (/\.Z$/) { - $_ = "uncompress -c $_ |"; - } elsif (/\.bz2$/) { - $_ = "bunzip2 -c $_ |"; - } - } - @argv; -} - -=head2 lock_on_load - - Title : lock_on_load - Usage : $lock = $db->lock_on_load([$lock]) - Function: set write locking during load - Returns : current value of lock-on-load flag - Args : new value of lock-on-load-flag - Status : Public - -This method is honored by some of the adaptors. If the value is true, -the tables used by the GFF modules will be locked for writing during -loads and inaccessible to other processes. - -=cut - -sub lock_on_load { - my $self = shift; - my $d = $self->{lock}; - $self->{lock} = shift if @_; - $d; -} - -=head2 meta - - Title : meta - Usage : $value = $db->meta($name [,$newval]) - Function: get or set a meta variable - Returns : a string - Args : meta variable name and optionally value - Status : abstract - -Get or set a named metavalues for the database. Metavalues can be -used for database-specific settings. - -By default, this method does nothing! - -=cut - -sub meta { - my $self = shift; - my ($name,$value) = @_; - return; -} - -=head2 default_meta_values - - Title : default_meta_values - Usage : %values = $db->default_meta_values - Function: empty the database - Returns : a list of tag=>value pairs - Args : none - Status : protected - -This method returns a list of tag=Evalue pairs that contain default -meta information about the database. It is invoked by initialize() to -write out the default meta values. The base class version returns an -empty list. - -For things to work properly, meta value names must be UPPERCASE. - -=cut - -sub default_meta_values { - my $self = shift; - return (); -} - - -=head2 error - - Title : error - Usage : $db->error( [$new error] ); - Function: read or set error message - Returns : error message - Args : an optional argument to set the error message - Status : Public - -This method can be used to retrieve the last error message. Errors -are not reset to empty by successful calls, so contents are only valid -immediately after an error condition has been detected. 
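A small sketch tying lock_on_load() and error() together; whether a failed load actually records a message is adaptor-dependent, so the fallback string below is purely defensive, and the file name is a placeholder.

  # assumes $db is an open Bio::DB::GFF handle
  $db->lock_on_load(1);                       # honored only by some adaptors
  my $count = $db->load_gff_file('big_annotations.gff');
  warn 'load failed: ', ($db->error || 'no error recorded'), "\n"
      unless $count;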
- -=cut - -sub error { - my $self = shift; - my $g = $self->{error}; - $self->{error} = join '',@_ if @_; - $g; -} - -=head2 debug - - Title : debug - Usage : $db->debug( [$flag] ); - Function: read or set debug flag - Returns : current value of debug flag - Args : new debug flag (optional) - Status : Public - -This method can be used to turn on debug messages. The exact nature -of those messages depends on the adaptor in use. - -=cut - -sub debug { - my $self = shift; - my $g = $self->{debug}; - $self->{debug} = shift if @_; - $g; -} - - -=head2 automerge - - Title : automerge - Usage : $db->automerge( [$new automerge] ); - Function: get or set automerge value - Returns : current value (boolean) - Args : an optional argument to set the automerge value - Status : Public - -By default, this module will use the aggregators to merge groups into -single composite objects. This default can be changed to false by -calling automerge(0). - -=cut - -sub automerge { - my $self = shift; - my $g = $self->{automerge}; - $self->{automerge} = shift if @_; - $g; -} - -=head2 attributes - - Title : attributes - Usage : @attributes = $db->attributes($id,$name) - Function: get the "attributes" on a particular feature - Returns : an array of string - Args : feature ID - Status : public - -Some GFF version 2 files use the groups column to store a series of -attribute/value pairs. In this interpretation of GFF, the first such -pair is treated as the primary group for the feature; subsequent pairs -are treated as attributes. Two attributes have special meaning: -"Note" is for backward compatibility and is used for unstructured text -remarks. "Alias" is considered as a synonym for the feature name. - -If no name is provided, then attributes() returns a flattened hash, of -attribute=Evalue pairs. This lets you do: - - %attributes = $db->attributes($id); - -If no arguments are provided, attributes() will return the list of -all attribute names: - - @attribute_names = $db->attributes(); - -Normally, however, attributes() will be called by the feature: - - @notes = $feature->attributes('Note'); - -In a scalar context, attributes() returns the first value of the -attribute if a tag is present, otherwise a hash reference in which the -keys are attribute names and the values are anonymous arrays -containing the values. - -=cut - -sub attributes { - my $self = shift; - my ($id,$tag) = @_; - my @result = $self->do_attributes(@_) or return; - return @result if wantarray; - - # what to do in an array context - return $result[0] if $tag; - my %result; - while (my($key,$value) = splice(@result,0,2)) { - push @{$result{$key}},$value; - } - return \%result; -} - -=head2 fast_queries - - Title : fast_queries - Usage : $flag = $db->fast_queries([$flag]) - Function: turn on and off the "fast queries" option - Returns : a boolean - Args : a boolean flag (optional) - Status : public - -The mysql database driver (and possibly others) support a "fast" query -mode that caches results on the server side. This makes queries come -back faster, particularly when creating iterators. The downside is -that while iterating, new queries will die with a "command synch" -error. This method turns the feature on and off. - -For databases that do not support a fast query, this method has no -effect. - -=cut - -# override this method in order to set the mysql_use_result attribute, which is an obscure -# but extremely powerful optimization for both performance and memory. 
-sub fast_queries { - my $self = shift; - my $d = $self->{fast_queries}; - $self->{fast_queries} = shift if @_; - $d; -} - -=head2 add_aggregator - - Title : add_aggregator - Usage : $db->add_aggregator($aggregator) - Function: add an aggregator to the list - Returns : nothing - Args : an aggregator - Status : public - -This method will append an aggregator to the end of the list of -registered aggregators. Three different argument types are accepted: - - 1) a Bio::DB::GFF::Aggregator object -- will be added - 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}" - -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method - part is optional). - 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator - subclass, where the token corresponds to the subclass name. - -=cut - -sub add_aggregator { - my $self = shift; - my $aggregator = shift; - my $list = $self->{aggregators} ||= []; - if (ref $aggregator) { # an object - @$list = grep {$_->get_method ne $aggregator->get_method} @$list; - push @$list,$aggregator; - } - - elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) { - my($agg_name,$subparts,$mainpart) = ($1,$2,$3); - my @subparts = split /,\s*/,$subparts; - my @args = (-method => $agg_name, - -sub_parts => \@subparts); - if ($mainpart) { - push @args,(-main_method => $mainpart, - -whole_object => 1); - } - warn "making an aggregator with (@args), subparts = @subparts" if $self->debug; - push @$list,Bio::DB::GFF::Aggregator->new(@args); - } - - else { - my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E"; - eval "require $class; 1" or $self->throw("Unable to load $aggregator aggregator: $@"); - push @$list,$class->new(); - } -} - -=head2 aggregators - - Title : aggregators - Usage : $db->aggregators([@new_aggregators]); - Function: retrieve list of aggregators - Returns : list of aggregators - Args : a list of aggregators to set (optional) - Status : public - -This method will get or set the list of aggregators assigned to -the database. If 1 or more arguments are passed, the existing -set will be cleared. - -=cut - -sub aggregators { - my $self = shift; - my $d = $self->{aggregators}; - if (@_) { - $self->clear_aggregators; - $self->add_aggregator($_) foreach @_; - } - return unless $d; - return @$d; -} - -=head2 clear_aggregators - - Title : clear_aggregators - Usage : $db->clear_aggregators - Function: clears list of aggregators - Returns : nothing - Args : none - Status : public - -This method will clear the aggregators stored in the database object. -Use aggregators() or add_aggregator() to add some back. - -=cut - -sub clear_aggregators { shift->{aggregators} = [] } - -=head2 preferred_groups - - Title : preferred_groups - Usage : $db->preferred_groups([$group_name_or_arrayref]) - Function: get/set list of groups for altering GFF2 parsing - Returns : a list of classes - Args : new list (scalar or array ref) - Status : public - -=cut - -sub preferred_groups { - my $self = shift; - my $d = $self->{preferred_groups}; - if (@_) { - my @v = map {ref($_) eq 'ARRAY' ? @$_ : $_} @_; - $self->{preferred_groups} = \@v; - delete $self->{preferred_groups_hash}; - } - return unless $d; - return @$d; -} - -sub _preferred_groups_hash { - my $self = shift; - my $gff3 = shift; - return $self->{preferred_groups_hash} if exists $self->{preferred_groups_hash}; - my $count = 0; - - my @preferred = $self->preferred_groups; - - # defaults - if (!@preferred) { - @preferred = $gff3 || $self->{load_data}{gff3_flag} ? 
qw(Target Parent ID) : qw(Target Sequence Transcript); - } - - my %preferred = map {lc($_) => @preferred-$count++} @preferred; - return $self->{preferred_groups_hash} = \%preferred; -} - -=head1 Methods for use by Subclasses - -The following methods are chiefly of interest to subclasses and are -not intended for use by end programmers. - -=head2 abscoords - - Title : abscoords - Usage : $db->abscoords($name,$class,$refseq) - Function: finds position of a landmark in reference coordinates - Returns : ($ref,$class,$start,$stop,$strand) - Args : name and class of landmark - Status : public - -This method is called by Bio::DB::GFF::RelSegment to obtain the -absolute coordinates of a sequence landmark. The arguments are the -name and class of the landmark. If successful, abscoords() returns -the ID of the reference sequence, its class, its start and stop -positions, and the orientation of the reference sequence's coordinate -system ("+" for forward strand, "-" for reverse strand). - -If $refseq is present in the argument list, it forces the query to -search for the landmark in a particular reference sequence. - -=cut - -sub abscoords { - my $self = shift; - my ($name,$class,$refseq) = @_; - $class ||= $self->{default_class}; - $self->get_abscoords($name,$class,$refseq); -} - -=head1 Protected API - -The following methods are not intended for public consumption, but are -intended to be overridden/implemented by adaptors. - -=head2 default_aggregators - - Title : default_aggregators - Usage : $db->default_aggregators; - Function: retrieve list of aggregators - Returns : array reference containing list of aggregator names - Args : none - Status : protected - -This method (which is intended to be overridden by adaptors) returns a -list of standard aggregators to be applied when no aggregators are -specified in the constructor. - -=cut - -sub default_aggregators { - my $self = shift; - return ['processed_transcript','alignment']; -} - -=head2 do_load_gff - - Title : do_load_gff - Usage : $db->do_load_gff($handle) - Function: load a GFF input stream - Returns : number of features loaded - Args : A filehandle. - Status : protected - -This method is called to load a GFF data stream. The method will read -GFF features from EE and load them into the database. On exit the -method must return the number of features loaded. - -Note that the method is responsible for parsing the GFF lines. This -is to allow for differences in the interpretation of the "group" -field, which are legion. - -You probably want to use load_gff() instead. It is more flexible -about the arguments it accepts. - -=cut - -sub do_load_gff { - my $self = shift; - my $io_handle = shift; - - local $self->{load_data} = { - lineend => (-t STDERR && !$ENV{EMACS} ? 
"\r" : "\n"), - count => 0 - }; - - $self->setup_load(); - my $mode = 'gff'; - - while (<$io_handle>) { - chomp; - if ($mode eq 'gff') { - if (/^>/) { # Sequence coming - $mode = 'fasta'; - $self->_load_sequence_start; - $self->_load_sequence_line($_); - } else { - $self->_load_gff_line($_); - } - } - elsif ($mode eq 'fasta') { - if (/^##|\t/) { # Back to GFF mode - $self->_load_sequence_finish; - $mode = 'gff'; - $self->_load_gff_line($_); - } else { - $self->_load_sequence_line($_); - } - } - } - $self->finish_load(); - $self->_load_sequence_finish; - - return $self->{load_data}{count}; -} - -sub _load_gff_line { - my $self = shift; - my $line = shift; - my $lineend = $self->{load_data}{lineend}; - - $self->{load_data}{gff3_flag}++ if $line =~ /^\#\#\s*gff-version\s+3/; - - if (defined $self->{load_data}{gff3_flag} and !defined $self->{load_data}{gff3_warning}) { - $self->print_gff3_warning(); - $self->{load_data}{gff3_warning}=1; - } - - $self->preferred_groups(split(/\s+/,$1)) if $line =~ /^\#\#\s*group-tags?\s+(.+)/; - - if ($line =~ /^\#\#\s*sequence-region\s+(\S+)\s+(-?\d+)\s+(-?\d+)/i) { # header line - $self->load_gff_line( - { - ref => $1, - class => 'Sequence', - source => 'reference', - method => 'Component', - start => $2, - stop => $3, - score => undef, - strand => undef, - phase => undef, - gclass => 'Sequence', - gname => $1, - tstart => undef, - tstop => undef, - attributes => [], - } - ); - return $self->{load_data}{count}++; - } - - return if /^#/; - - my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t",$line; - return unless defined($ref) && defined($method) && defined($start) && defined($stop); - foreach (\$score,\$strand,\$phase) { - undef $$_ if $$_ eq '.'; - } - - my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{load_data}{gff3_flag}); - - # no standard way in the GFF file to denote the class of the reference sequence -- drat! 
- # so we invoke the factory to do it - my $class = $self->refclass($ref); - - # call subclass to do the dirty work - if ($start > $stop) { - ($start,$stop) = ($stop,$start); - if ($strand eq '+') { - $strand = '-'; - } elsif ($strand eq '-') { - $strand = '+'; - } - } - # GFF2/3 transition stuff - $gclass = [$gclass] unless ref $gclass; - $gname = [$gname] unless ref $gname; - for (my $i=0; $i<@$gname;$i++) { - $self->load_gff_line({ref => $ref, - class => $class, - source => $source, - method => $method, - start => $start, - stop => $stop, - score => $score, - strand => $strand, - phase => $phase, - gclass => $gclass->[$i], - gname => $gname->[$i], - tstart => $tstart, - tstop => $tstop, - attributes => $attributes} - ); - $self->{load_data}{count}++; - } -} - -sub _load_sequence_start { - my $self = shift; - my $ld = $self->{load_data}; - undef $ld->{id}; - $ld->{offset} = 0; - $ld->{seq} = ''; -} -sub _load_sequence_finish { - my $self = shift; - my $ld = $self->{load_data}; - $self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id}; -} - -sub _load_sequence_line { - my $self = shift; - my $line = shift; - my $ld = $self->{load_data}; - my $lineend = $ld->{lineend}; - - if (/^>(\S+)/) { - $self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id}; - $ld->{id} = $1; - $ld->{offset} = 0; - $ld->{seq} = ''; - $ld->{count}++; - print STDERR $ld->{count}," sequences loaded$lineend" if $self->{__verbose__} && $ld->{count} % 1000 == 0; - } else { - $ld->{seq} .= $_; - $self->insert_sequence_chunk($ld->{id},\$ld->{offset},\$ld->{seq}); - } - -} - -=head2 load_sequence - - Title : load_sequence - Usage : $db->load_sequence($handle) - Function: load a FASTA data stream - Returns : number of sequences - Args : a filehandle to the FASTA file - Status : protected - -You probably want to use load_fasta() instead. - -=cut - -# note - there is some repeated code here -sub load_sequence { - my $self = shift; - my $io_handle = shift; - - local $self->{load_data} = { - lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"), - count => 0 - }; - - $self->_load_sequence_start; - while (<$io_handle>) { - chomp; - $self->_load_sequence_line($_); - } - $self->_load_sequence_finish; - return $self->{load_data}{count}; -} - -sub insert_sequence_chunk { - my $self = shift; - my ($id,$offsetp,$seqp) = @_; - if (my $cs = $self->dna_chunk_size) { - while (length($$seqp) >= $cs) { - my $chunk = substr($$seqp,0,$cs); - $self->insert_sequence($id,$$offsetp,$chunk); - $$offsetp += length($chunk); - substr($$seqp,0,$cs) = ''; - } - } - return 1; # the calling routine may expect success or failure -} - -# used to store big pieces of DNA in itty bitty pieces -sub dna_chunk_size { - return 0; -} - -sub insert_sequence { - my $self = shift; - my($id,$offset,$seq) = @_; - $self->throw('insert_sequence(): must be defined in subclass'); -} - -# This is the default class for reference points. Defaults to Sequence. 
-sub default_class { - my $self = shift; - return 'Sequence' unless ref $self; - my $d = $self->{default_class}; - $self->{default_class} = shift if @_; - $d; -} - -# gets name of the reference sequence, and returns its class -# currently just calls default_class -sub refclass { - my $self = shift; - my $name = shift; - return $self->default_class; -} - -=head2 setup_load - - Title : setup_load - Usage : $db->setup_load - Function: called before load_gff_line() - Returns : void - Args : none - Status : abstract - -This abstract method gives subclasses a chance to do any -schema-specific initialization prior to loading a set of GFF records. -It must be implemented by a subclass. - -=cut - -sub setup_load { - # default, do nothing -} - -=head2 finish_load - - Title : finish_load - Usage : $db->finish_load - Function: called after load_gff_line() - Returns : number of records loaded - Args : none - Status :abstract - -This method gives subclasses a chance to do any schema-specific -cleanup after loading a set of GFF records. - -=cut - -sub finish_load { - # default, do nothing -} - -=head2 load_gff_line - - Title : load_gff_line - Usage : $db->load_gff_line(@args) - Function: called to load one parsed line of GFF - Returns : true if successfully inserted - Args : see below - Status : abstract - -This abstract method is called once per line of the GFF and passed a -hashref containing parsed GFF fields. The fields are: - - {ref => $ref, - class => $class, - source => $source, - method => $method, - start => $start, - stop => $stop, - score => $score, - strand => $strand, - phase => $phase, - gclass => $gclass, - gname => $gname, - tstart => $tstart, - tstop => $tstop, - attributes => $attributes} - -=cut - -sub load_gff_line { - shift->throw("load_gff_line(): must be implemented by an adaptor"); -} - - -=head2 do_initialize - - Title : do_initialize - Usage : $db->do_initialize([$erase]) - Function: initialize and possibly erase database - Returns : true if successful - Args : optional erase flag - Status : protected - -This method implements the initialize() method described above, and -takes the same arguments. - -=cut - -sub do_initialize { - shift->throw('do_initialize(): must be implemented by an adaptor'); -} - -=head2 dna - - Title : dna - Usage : $db->dna($id,$start,$stop,$class) - Function: return the raw DNA string for a segment - Returns : a raw DNA string - Args : id of the sequence, its class, start and stop positions - Status : public - -This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA -sequence. - -Arguments: -name sequence name - -start start position - -stop stop position - -class sequence class - -If start and stop are both undef, then the entire DNA is retrieved. -So to fetch the whole dna, call like this: - - $db->dna($name_of_sequence); - -or like this: - - $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence); - -NOTE: you will probably prefer to create a Segment and then invoke its -dna() method. 
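As a brief illustration of the calling conventions above (sequence name and coordinates are placeholders):

  # assumes $db is an open Bio::DB::GFF handle with DNA loaded
  my $whole = $db->dna('chr3');                                       # entire reference sequence
  my $piece = $db->dna(-name => 'chr3', -start => 101, -end => 200);  # 100 bp slice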
- -=cut - -# call to return the DNA string for the indicated region -# real work is done by get_dna() -sub dna { - my $self = shift; - my ($id,$start,$stop,$class) = rearrange([ - [qw(NAME ID REF REFSEQ)], - qw(START), - [qw(STOP END)], - 'CLASS', - ],@_); -# return unless defined $start && defined $stop; - $self->get_dna($id,$start,$stop,$class); -} - -sub fetch_sequence { shift->dna(@_) } - -sub features_in_range { - my $self = shift; - my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) = - rearrange([ - [qw(RANGE_TYPE)], - [qw(REF REFSEQ)], - qw(CLASS), - qw(START), - [qw(STOP END)], - [qw(TYPE TYPES)], - qw(PARENT), - [qw(RARE SPARSE)], - [qw(MERGE AUTOMERGE)], - 'ITERATOR' - ],@_); - $other ||= {}; - # $automerge = $types && $self->automerge unless defined $automerge; - $automerge = $self->automerge unless defined $automerge; - $self->throw("range type must be one of {". - join(',',keys %valid_range_types). - "}\n") - unless $valid_range_types{lc $range_type}; - $self->_features({ - rangetype => lc $range_type, - refseq => $refseq, - refclass => $class, - start => $start, - stop => $stop, - types => $types }, - { - sparse => $sparse, - automerge => $automerge, - iterator => $iterator, - %$other, - }, - $parent); -} - -=head2 get_dna - - Title : get_dna - Usage : $db->get_dna($id,$start,$stop,$class) - Function: get DNA for indicated segment - Returns : the dna string - Args : sequence ID, start, stop and class - Status : protected - -If start E stop and the sequence is nucleotide, then this method -should return the reverse complement. The sequence class may be -ignored by those databases that do not recognize different object -types. - -=cut - -sub get_dna { - my $self = shift; - my ($id,$start,$stop,$class,) = @_; - $self->throw("get_dna() must be implemented by an adaptor"); -} - -=head2 get_features - - Title : get_features - Usage : $db->get_features($search,$options,$callback) - Function: get list of features for a region - Returns : count of number of features retrieved - Args : see below - Status : protected - -The first argument is a hash reference containing search criteria for -retrieving features. It contains the following keys: - - rangetype One of "overlaps", "contains" or "contained_in". Indicates - the type of range query requested. - - refseq ID of the landmark that establishes the absolute - coordinate system. - - refclass Class of this landmark. Can be ignored by implementations - that don't recognize such distinctions. - - start Start of the range, inclusive. - - stop Stop of the range, inclusive. - - types Array reference containing the list of annotation types - to fetch from the database. Each annotation type is an - array reference consisting of [source,method]. - -The second argument is a hash reference containing certain options -that affect the way information is retrieved: - - sort_by_group - A flag. If true, means that the returned features should be - sorted by the group that they're in. - - sparse A flag. If true, means that the expected density of the - features is such that it will be more efficient to search - by type rather than by range. If it is taking a long - time to fetch features, give this a try. - - binsize A true value will create a set of artificial features whose - start and stop positions indicate bins of the given size, and - whose scores are the number of features in the bin. The - class of the feature will be set to "bin", and its name to - "method:source". 
This is a handy way of generating histograms - of feature density. - -The third argument, the $callback, is a code reference to which -retrieved features are passed. It is described in more detail below. - -This routine is responsible for getting arrays of GFF data out of the -database and passing them to the callback subroutine. The callback -does the work of constructing a Bio::DB::GFF::Feature object out of -that data. The callback expects a list of 13 fields: - - $refseq The reference sequence - $start feature start - $stop feature stop - $source feature source - $method feature method - $score feature score - $strand feature strand - $phase feature phase - $groupclass group class (may be undef) - $groupname group ID (may be undef) - $tstart target start for similarity hits (may be undef) - $tstop target stop for similarity hits (may be undef) - $feature_id A unique feature ID (may be undef) - -These fields are in the same order as the raw GFF file, with the -exception that the group column has been parsed into group class and -group name fields. - -The feature ID, if provided, is a unique identifier of the feature -line. The module does not depend on this ID in any way, but it is -available via Bio::DB::GFF-Eid() if wanted. In the dbi::mysql and -dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb -adaptor it is not used. - -=cut - -=head2 feature_summary(), coverage_array() - -The DBI adaptors provide methods for rapidly fetching coverage -statistics across a region of interest. Please see -L for more information about these -methods. - -=cut - -sub get_features{ - my $self = shift; - my ($search,$options,$callback) = @_; - $self->throw("get_features() must be implemented by an adaptor"); -} - - -=head2 _feature_by_name - - Title : _feature_by_name - Usage : $db->_feature_by_name($class,$name,$location,$callback) - Function: get a list of features by name and class - Returns : count of number of features retrieved - Args : name of feature, class of feature, and a callback - Status : abstract - -This method is used internally. The callback arguments are the same -as those used by make_feature(). This method must be overridden by -subclasses. - -=cut - -sub _feature_by_name { - my $self = shift; - my ($class,$name,$location,$callback) = @_; - $self->throw("_feature_by_name() must be implemented by an adaptor"); -} - -sub _feature_by_attribute { - my $self = shift; - my ($attributes,$callback) = @_; - $self->throw("_feature_by_name() must be implemented by an adaptor"); -} - -=head2 _feature_by_id - - Title : _feature_by_id - Usage : $db->_feature_by_id($ids,$type,$callback) - Function: get a feature based - Returns : count of number of features retrieved - Args : arrayref to feature IDs to fetch - Status : abstract - -This method is used internally to fetch features either by their ID or -their group ID. $ids is a arrayref containing a list of IDs, $type is -one of "feature" or "group", and $callback is a callback. The -callback arguments are the same as those used by make_feature(). This -method must be overridden by subclasses. 
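To make the 13-field callback contract above concrete, here is a purely illustrative adaptor sketch; the package name and the hard-coded @ROWS table are invented for the example and are not part of Bio::DB::GFF.

  package My::Toy::Adaptor;      # hypothetical adaptor, for illustration only
  use strict;
  use warnings;
  use base 'Bio::DB::GFF';

  # ref, start, stop, source, method, score, strand, phase,
  # gclass, gname, tstart, tstop, feature_id
  my @ROWS = (
      [ 'chr1', 100, 500, 'example', 'exon',
        undef, '+', undef, 'Transcript', 't1', undef, undef, 1 ],
  );

  sub get_features {
      my ($self, $search, $options, $callback) = @_;
      my $count = 0;
      for my $row (@ROWS) {
          $callback->(@$row);    # the callback builds the Bio::DB::GFF::Feature
          $count++;
      }
      return $count;             # number of features retrieved
  }

  1;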
- -=cut - -sub _feature_by_id { - my $self = shift; - my ($ids,$type,$callback) = @_; - $self->throw("_feature_by_id() must be implemented by an adaptor"); -} - -=head2 overlapping_features - - Title : overlapping_features - Usage : $db->overlapping_features(@args) - Function: get features that overlap the indicated range - Returns : a list of Bio::DB::GFF::Feature objects - Args : see below - Status : public - -This method is invoked by Bio::DB::GFF::Segment-Efeatures() to find -the list of features that overlap a given range. It is generally -preferable to create the Segment first, and then fetch the features. - -This method takes set of named arguments: - - -refseq ID of the reference sequence - -class Class of the reference sequence - -start Start of the desired range in refseq coordinates - -stop Stop of the desired range in refseq coordinates - -types List of feature types to return. Argument is an array - reference containing strings of the format "method:source" - -parent A parent Bio::DB::GFF::Segment object, used to create - relative coordinates in the generated features. - -rare Turn on an optimization suitable for a relatively rare feature type, - where it will be faster to filter by feature type first - and then by position, rather than vice versa. - -merge Whether to apply aggregators to the generated features. - -iterator Whether to return an iterator across the features. - -If -iterator is true, then the method returns a single scalar value -consisting of a Bio::SeqIO object. You can call next_seq() repeatedly -on this object to fetch each of the features in turn. If iterator is -false or absent, then all the features are returned as a list. - -Currently aggregation is disabled when iterating over a series of -features. - -Types are indicated using the nomenclature "method:source". Either of -these fields can be omitted, in which case a wildcard is used for the -missing field. Type names without the colon (e.g. "exon") are -interpreted as the method name and a source wild card. Regular -expressions are allowed in either field, as in: "similarity:BLAST.*". - -=cut - -# call to return the features that overlap the named region -# real work is done by get_features -sub overlapping_features { - my $self = shift; - $self->features_in_range(-range_type=>'overlaps',@_); -} - -=head2 contained_features - - Title : contained_features - Usage : $db->contained_features(@args) - Function: get features that are contained within the indicated range - Returns : a list of Bio::DB::GFF::Feature objects - Args : see overlapping_features() - Status : public - -This call is similar to overlapping_features(), except that it only -retrieves features whose end points are completely contained within -the specified range. - -Generally you will want to fetch a Bio::DB::GFF::Segment object and -call its contained_features() method rather than call this directly. - -=cut - -# The same, except that it only returns features that are completely contained within the -# range (much faster usually) -sub contained_features { - my $self = shift; - $self->features_in_range(-range_type=>'contains',@_); -} - -=head2 contained_in - - Title : contained_in - Usage : @features = $s->contained_in(@args) - Function: get features that contain this segment - Returns : a list of Bio::DB::GFF::Feature objects - Args : see features() - Status : Public - -This is identical in behavior to features() except that it returns -only those features that completely contain the segment. 
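Side by side, the three range flavours look like this (a sketch; the reference name, coordinates and types are placeholders):

  # assumes $db is an open Bio::DB::GFF handle
  my @overlapping = $db->overlapping_features(-refseq => 'chr3', -start => 1, -stop => 50_000,
                                              -types  => ['exon']);
  my @inside      = $db->contained_features(-refseq => 'chr3', -start => 1, -stop => 50_000);
  my @enclosing   = $db->contained_in(-refseq => 'chr3', -start => 1, -stop => 50_000);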
- -=cut - -sub contained_in { - my $self = shift; - $self->features_in_range(-range_type=>'contained_in',@_); -} - -=head2 get_abscoords - - Title : get_abscoords - Usage : $db->get_abscoords($name,$class,$refseq) - Function: get the absolute coordinates of sequence with name & class - Returns : ($absref,$absstart,$absstop,$absstrand) - Args : name and class of the landmark - Status : protected - -Given the name and class of a genomic landmark, this function returns -a four-element array consisting of: - - $absref the ID of the reference sequence that contains this landmark - $absstart the position at which the landmark starts - $absstop the position at which the landmark stops - $absstrand the strand of the landmark, relative to the reference sequence - -If $refseq is provided, the function searches only within the -specified reference sequence. - -=cut - -sub get_abscoords { - my $self = shift; - my ($name,$class,$refseq) = @_; - $self->throw("get_abscoords() must be implemented by an adaptor"); -} - -=head2 get_types - - Title : get_types - Usage : $db->get_types($absref,$class,$start,$stop,$count) - Function: get list of all feature types on the indicated segment - Returns : list or hash of Bio::DB::GFF::Typename objects - Args : see below - Status : protected - -Arguments are: - - $absref the ID of the reference sequence - $class the class of the reference sequence - $start the position to start counting - $stop the position to end counting - $count a boolean indicating whether to count the number - of occurrences of each feature type - -If $count is true, then a hash is returned. The keys of the hash are -feature type names in the format "method:source" and the values are -the number of times a feature of this type overlaps the indicated -segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename -objects. If $start or $stop are undef, then all features on the -indicated segment are enumerated. If $absref is undef, then the call -returns all feature types in the database. - -=cut - -sub get_types { - my $self = shift; - my ($refseq,$class,$start,$stop,$count,$types) = @_; - $self->throw("get_types() must be implemented by an adaptor"); -} - -=head2 make_feature - - Title : make_feature - Usage : $db->make_feature(@args) - Function: Create a Bio::DB::GFF::Feature object from string data - Returns : a Bio::DB::GFF::Feature object - Args : see below - Status : internal - - This takes 14 arguments (really!): - - $parent A Bio::DB::GFF::RelSegment object - $group_hash A hashref containing unique list of GFF groups - $refname The name of the reference sequence for this feature - $refclass The class of the reference sequence for this feature - $start Start of feature - $stop Stop of feature - $source Feature source field - $method Feature method field - $score Feature score field - $strand Feature strand - $phase Feature phase - $group_class Class of feature group - $group_name Name of feature group - $tstart For homologies, start of hit on target - $tstop Stop of hit on target - -The $parent argument, if present, is used to establish relative -coordinates in the resulting Bio::DB::Feature object. This allows one -feature to generate a list of other features that are relative to its -coordinate system (for example, finding the coordinates of the second -exon relative to the coordinates of the first). - -The $group_hash allows the group_class/group_name strings to be turned -into rich database objects via the make_obect() method (see above). 
-Because these objects may be expensive to create, $group_hash is used -to uniquefy them. The index of this hash is the composite key -{$group_class,$group_name,$tstart,$tstop}. Values are whatever object -is returned by the make_object() method. - -The remainder of the fields are taken from the GFF line, with the -exception that "Target" features, which contain information about the -target of a homology search, are parsed into their components. - -=cut - -# This call is responsible for turning a line of GFF into a -# feature object. -# The $parent argument is a Bio::DB::GFF::Segment object and is used -# to establish the coordinate system for the new feature. -# The $group_hash argument is an hash ref that holds previously- -# generated group objects. -# Other arguments are taken right out of the GFF table. -sub make_feature { - my $self = shift; - my ($parent,$group_hash, # these arguments provided by generic mechanisms - $srcseq, # the rest is provided by adaptor - $start,$stop, - $source,$method, - $score,$strand,$phase, - $group_class,$group_name, - $tstart,$tstop, - $db_id,$group_id) = @_; - - return unless $srcseq; # return undef if called with no arguments. This behavior is used for - # on-the-fly aggregation. - - my $group; # undefined - if (defined $group_class && defined $group_name) { - $tstart ||= ''; - $tstop ||= ''; - if ($group_hash) { - $group = $group_hash->{$group_class,$group_name,$tstart,$tstop} - ||= $self->make_object($group_class,$group_name,$tstart,$tstop); - } else { - $group = $self->make_object($group_class,$group_name,$tstart,$tstop); - } - } - -# fix for some broken GFF files -# unfortunately - has undesired side effects -# if (defined $tstart && defined $tstop && !defined $strand) { -# $strand = $tstart <= $tstop ? '+' : '-'; -# } - - if (ref $parent) { # note that the src sequence is ignored - return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop, - $method,$source, - $score,$strand,$phase, - $group,$db_id,$group_id, - $tstart,$tstop); - } else { - return Bio::DB::GFF::Feature->new($self,$srcseq, - $start,$stop, - $method,$source, - $score,$strand,$phase, - $group,$db_id,$group_id, - $tstart,$tstop); - } -} - -sub make_aggregated_feature { - my $self = shift; - my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3); - my $feature = $self->make_feature($parent,undef,@_); - return [$feature] if $feature && !$feature->group; - - # if we have accumulated features and either: - # (1) make_feature() returned undef, indicated very end or - # (2) the current group is different from the previous one - - local $^W = 0; # irritating uninitialized value warning in next statement - if (@$accumulated_features && - (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) { - foreach my $a (@$aggregators) { # last aggregator gets first shot - $a->aggregate($accumulated_features,$self) or next; - } - my @result = @$accumulated_features; - @$accumulated_features = $feature ? ($feature) : (); - return unless @result; - return \@result ; - } - push @$accumulated_features,$feature; - return; -} - -=head2 make_match_sub - - Title : make_match_sub - Usage : $db->make_match_sub($types) - Function: creates a subroutine used for filtering features - Returns : a code reference - Args : a list of parsed type names - Status : protected - -This method is used internally to generate a code subroutine that will -accept or reject a feature based on its method and source. 
It takes -an array of parsed type names in the format returned by parse_types(), -and generates an anonymous subroutine. The subroutine takes a single -Bio::DB::GFF::Feature object and returns true if the feature matches -one of the desired feature types, and false otherwise. - -=cut - -# a subroutine that matches features indicated by list of types -sub make_match_sub { - my $self = shift; - my $types = shift; - - return sub { 1 } unless ref $types && @$types; - - my @expr; - for my $type (@$types) { - my ($method,$source) = @$type; - $method = $method ? "\\Q$method\\E" : ".*"; - $source = $source ? ":\\Q$source\\E" : "(?::.+)?"; - push @expr,"${method}${source}"; - } - my $expr = join '|',@expr; - return $self->{match_subs}{$expr} if $self->{match_subs}{$expr}; - - my $sub =<type =~ /^($expr)\$/i; -} -END - warn "match sub: $sub\n" if $self->debug; - undef $@; - my $compiled_sub = eval $sub; - $self->throw($@) if $@; - return $self->{match_subs}{$expr} = $compiled_sub; -} - -=head2 make_object - - Title : make_object - Usage : $db->make_object($class,$name,$start,$stop) - Function: creates a feature object - Returns : a feature object - Args : see below - Status : protected - -This method is called to make an object from the GFF "group" field. -By default, all Target groups are turned into Bio::DB::GFF::Homol -objects, and everything else becomes a Bio::DB::GFF::Featname. -However, adaptors are free to override this method to generate more -interesting objects, such as true BioPerl objects, or Acedb objects. - -Arguments are: - - $name database ID for object - $class class of object - $start for similarities, start of match inside object - $stop for similarities, stop of match inside object - -=cut - -# abstract call to turn a feature into an object, given its class and name -sub make_object { - my $self = shift; - my ($class,$name,$start,$stop) = @_; - return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop) - if defined $start and length $start; - return Bio::DB::GFF::Featname->new($class,$name); -} - - -=head2 do_attributes - - Title : do_attributes - Usage : $db->do_attributes($id [,$tag]); - Function: internal method to retrieve attributes given an id and tag - Returns : a list of Bio::DB::GFF::Feature objects - Args : a feature id and a attribute tag (optional) - Status : protected - -This method is overridden by subclasses in order to return a list of -attributes. If called with a tag, returns the value of attributes of -that tag type. If called without a tag, returns a flattened array of -(tag=Evalue) pairs. A particular tag can be present multiple times. - -=cut - -sub do_attributes { - my $self = shift; - my ($id,$tag) = @_; - return (); -} - -=head2 clone - -The clone() method should be used when you want to pass the -Bio::DB::GFF object to a child process across a fork(). The child must -call clone() before making any queries. - -The default behavior is to do nothing, but adaptors that use the DBI -interface may need to implement this in order to avoid database handle -errors. See the dbi adaptor for an example. - -=cut - -sub clone { } - - -=head1 Internal Methods - -The following methods are internal to Bio::DB::GFF and are not -guaranteed to remain the same. 
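As a sketch of the type-matching machinery described in make_match_sub() above: parse_types() is referenced but not shown in this hunk, so the exact call form here is an assumption.

  # assumes $db is an open Bio::DB::GFF handle
  my $parsed  = $db->parse_types(['exon', 'similarity:BLAST.*']);  # [[method, source], ...]
  my $matcher = $db->make_match_sub($parsed);                      # code ref: true if a feature's type matches
  my @wanted  = grep { $matcher->($_) } $db->all_seqfeatures;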
- -=head2 _features - - Title : _features - Usage : $db->_features($search,$options,$parent) - Function: internal method - Returns : a list of Bio::DB::GFF::Feature objects - Args : see below - Status : internal - -This is an internal method that is called by overlapping_features(), -contained_features() and features() to create features based on a -parent segment's coordinate system. It takes three arguments, a -search options hashref, an options hashref, and a parent segment. - -The search hashref contains the following keys: - - rangetype One of "overlaps", "contains" or "contained_in". Indicates - the type of range query requested. - refseq reference sequence ID - refclass reference sequence class - start start of range - stop stop of range - types arrayref containing list of types in "method:source" form - -The options hashref contains zero or more of the following keys: - - sparse turn on optimizations for a rare feature - automerge if true, invoke aggregators to merge features - iterator if true, return an iterator - -The $parent argument is a scalar object containing a -Bio::DB::GFF::RelSegment object or descendent. - -=cut - -#' - -sub _features { - my $self = shift; - my ($search,$options,$parent) = @_; - (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)}) - if defined($search->{start}) && $search->{start} > $search->{stop}; - $search->{refseq} = $search->{seq_id} if exists $search->{seq_id}; - - my $types = $self->parse_types($search->{types}); # parse out list of types - my @aggregated_types = @$types; # keep a copy - - # allow the aggregators to operate on the original - my @aggregators; - if ($options->{automerge}) { - for my $a ($self->aggregators) { - $a = $a->clone if $options->{iterator}; - unshift @aggregators,$a - if $a->disaggregate(\@aggregated_types,$self); - } - } - - if ($options->{iterator}) { - my @accumulated_features; - my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) } - : sub { [$self->make_feature($parent,undef,@_)] }; - return $self->get_features_iterator({ %$search, - types => \@aggregated_types }, - { %$options, - sort_by_group => $options->{automerge} }, - $callback - ); - } - - my %groups; # cache the groups we create to avoid consuming too much unecessary memory - my $features = []; - - my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) }; - $self->get_features({ %$search, - types => \@aggregated_types }, - $options, - $callback); - - if ($options->{automerge}) { - warn "aggregating...\n" if $self->debug; - foreach my $a (@aggregators) { # last aggregator gets first shot - warn "Aggregator $a:\n" if $self->debug; - $a->aggregate($features,$self); - } - } - - @$features; -} - -=head2 get_features_iterator - - Title : get_features_iterator - Usage : $db->get_features_iterator($search,$options,$callback) - Function: get an iterator on a features query - Returns : a Bio::SeqIO object - Args : as per get_features() - Status : Public - -This method takes the same arguments as get_features(), but returns an -iterator that can be used to fetch features sequentially, as per -Bio::SeqIO. - -Internally, this method is simply a front end to range_query(). -The latter method constructs and executes the query, returning a -statement handle. This routine passes the statement handle to the -constructor for the iterator, along with the callback. 
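Strictly for orientation when reading the code, this is roughly how the two hashrefs documented above might be populated for an "overlaps" query; calling _features() directly is not part of the public interface, and every value shown is a placeholder.

  # internal-use sketch only
  my @features = $db->_features(
      { rangetype => 'overlaps',      # or 'contains' / 'contained_in'
        refseq    => 'chr3',
        refclass  => 'Sequence',
        start     => 1,
        stop      => 50_000,
        types     => ['exon'] },      # parsed internally via parse_types()
      { automerge => 1 },             # run the aggregators over the result
      undef,                          # no parent segment: absolute coordinates
  );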
- -=cut - -sub get_features_iterator { - my $self = shift; - my ($search,$options,$callback) = @_; - $self->throw('feature iteration is not implemented in this adaptor'); -} - -=head2 split_group - - Title : split_group - Usage : $db->split_group($group_field,$gff3_flag) - Function: parse GFF group field - Returns : ($gclass,$gname,$tstart,$tstop,$attributes) - Args : the gff group column and a flag indicating gff3 compatibility - Status : internal - -This is a method that is called by load_gff_line to parse out the -contents of one or more group fields. It returns the class of the -group, its name, the start and stop of the target, if any, and an -array reference containing any attributes that were stuck into the -group field, in [attribute_name,attribute_value] format. - -=cut - -sub split_group { - my $self = shift; - my ($group,$gff3) = @_; - if ($gff3) { - my @groups = split /[;&]/,$group; # so easy! - return $self->_split_gff3_group(@groups); - } else { - # handle group parsing - # protect embedded semicolons in the group; there must be faster/more elegant way - # to do this. - $group =~ s/\\;/$;/g; - while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 } - my @groups = split(/\s*;\s*/,$group); - foreach (@groups) { s/$;/;/g } - return $self->_split_gff2_group(@groups); - } -} - -=head2 _split_gff2_group - -This is an internal method called by split_group(). - -=cut - -# this has gotten quite nasty due to transition from GFF2 to GFF2.5 -# (artemis) to GFF3. - -sub _split_gff2_group { - my $self = shift; - my @groups = @_; - my $target_found; - - my ($gclass,$gname,$tstart,$tstop,@attributes,@notes); - - for (@groups) { - - my ($tag,$value) = /^(\S+)(?:\s+(.+))?/; - $value = '' unless defined $value; - if ($value =~ /^\"(.+)\"$/) { #remove quotes - $value = $1; - } - $value =~ s/\\t/\t/g; - $value =~ s/\\r/\r/g; - $value =~ s/\s+$//; - - # Any additional groups become part of the attributes hash - # For historical reasons, the tag "Note" is treated as an - # attribute, even if it is the only group. - $tag ||= ''; - if ($tag eq 'tstart' && $target_found) { - $tstart = $value; - } - - elsif ($tag eq 'tend' && $target_found) { - $tstop = $value; - } - - elsif (ucfirst $tag eq 'Note') { - push @notes, [$tag => $value]; - } - - elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { # major disagreement in implementors of GFF2 here - $target_found++; - ($gclass,$gname) = ($1,$2); - ($tstart,$tstop) = / (\d+) (\d+)/; - } - - elsif (!defined($value)) { - push @notes, [Note => $tag]; # e.g. "Confirmed_by_EST" - } - - else { - push @attributes, [$tag => $value]; - } - } - - # group assignment - if (@attributes && !($gclass && $gname) ) { - - my $preferred = ref($self) ? $self->_preferred_groups_hash : {}; - - for my $pair (@attributes) { - my ($c,$n) = @$pair; - ($gclass,$gname) = ($c,$n) - if !$gclass # pick up first one - || - ($preferred->{lc $gclass}||0) < ($preferred->{lc $c}||0); # pick up higher priority one - } - - @attributes = grep {$gclass ne $_->[0]} @attributes; - } - - push @attributes, @notes; - - return ($gclass,$gname,$tstart,$tstop,\@attributes); -} - - -=head2 gff3_name_munging - - Title : gff3_name_munging - Usage : $db->gff3_name_munging($boolean) - Function: get/set gff3_name_munging flag - Returns : $current value of flag - Args : new value of flag (optional) - Status : utility - -If this is set to true (default false), then features identified in -gff3 files with an ID in the format foo:bar will be parsed so that -"foo" is the class and "bar" is the name. 
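To make the munging concrete, here is a tiny stand-alone sketch of the class/name split performed when the flag is on; the helper name munge_gff3_name and the sample ID are hypothetical.

    # Split an ID of the form "class:name" into its two parts, falling back
    # to a default class when there is no prefix.
    sub munge_gff3_name {
        my ($id, $default_class) = @_;
        return ($2, $1) if $id =~ /^(\w+):(.+)/;   # returns (name, class)
        return ($id, $default_class);
    }

    my ($name, $class) = munge_gff3_name('Gene:WBGene00000001', 'Sequence');
    # $name is 'WBGene00000001', $class is 'Gene'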
This is mostly for backward -compatibility with GFF2. - -=cut - -sub gff3_name_munging { - my $self = shift; - my $d = $self->{gff3_name_munging}; - $self->{gff3_name_munging} = shift if @_; - $d; -} - -=head2 _split_gff3_group - -This is called internally from split_group(). - -=cut - -sub _split_gff3_group { - my $self = shift; - my @groups = @_; - my $dc = $self->default_class; - my (%id,@attributes); - - for my $group (@groups) { - my ($tag,$value) = split /=/,$group; - $tag = unescape($tag); - my @values = map {unescape($_)} split /,/,$value; - - # GFF2 traditionally did not distinguish between a feature's name - # and the group it belonged to. This code is a transition between - # gff2 and the new parent/ID dichotomy in gff3. - if ($tag eq 'Parent') { - my (@names,@classes); - for (@values) { - my ($name,$class) = $self->_gff3_name_munging($_,$dc); - push @names,$name; - push @classes,$class; - } - $id{$tag} = @names > 1 ? [\@names,\@classes] : [$names[0],$classes[0]]; - } - elsif ($tag eq 'ID' || $tag eq 'Name') { - $id{$tag} = [$self->_gff3_name_munging(shift(@values),$dc)]; - } - elsif ($tag eq 'Target') { - my ($gname,$tstart,$tstop) = split /\s+/,shift @values; - $id{$tag} = [$self->_gff3_name_munging($gname,$dc),$tstart,$tstop]; - } - elsif ($tag =~ /synonym/i) { - $tag = 'Alias'; - } - push @attributes,[$tag=>$_] foreach @values; - } - - my $priorities = $self->_preferred_groups_hash(1); - my ($gclass,$gname,$tstart,$tstop); - for my $preferred (sort {$priorities->{lc $b}<=>$priorities->{lc $a}} - keys %id) { - unless (defined $gname) { - ($gname,$gclass,$tstart,$tstop) = @{$id{$preferred}}; - } - } - - # set null gclass to empty string to preserve compatibility with - # programs that expect a defined gclass if no gname - $gclass ||= '' if defined $gname; - - return ($gclass,$gname,$tstart,$tstop,\@attributes); -} - -# accomodation for wormbase style of class:name naming -sub _gff3_name_munging { - my $self = shift; - my ($name,$default_class) = @_; - return ($name,$default_class) unless $self->gff3_name_munging; - - if ($name =~ /^(\w+):(.+)/) { - return ($2,$1); - } else { - return ($name,$default_class); - } -} - -=head2 _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features() - - Title : _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features() - Usage : $count = $db->_delete_features(@feature_ids) - $count = $db->_delete_groups(@group_ids) - $count = $db->_delete(\%delete_spec) - $count = $db->_delete_fattribute_to_features(@feature_ids) - Function: low-level feature/group deleter - Returns : count of groups removed - Args : list of feature or group ids removed - Status : for implementation by subclasses - -These methods need to be implemented in adaptors. For _delete_features, -_delete_groups and _delete_fattribute_to_features, the arguments are a list of -feature or group IDs to remove. For _delete(), the argument is a hashref with -the three keys 'segments', 'types' and 'force'. The first contains an arrayref -of Bio::DB::GFF::RelSegment objects to delete (all FEATURES within the segment -are deleted). The second contains an arrayref of [method,source] feature types -to delete. The two are ANDed together. If 'force' has a true value, this -forces the operation to continue even if it would delete all features. 
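As a sketch only (values are placeholders, and real segments would come from $db->segment()), the hashref handed to _delete() has this shape:

    my $delete_spec = {
        segments => [],    # e.g. [ $db->segment('Chr3') ], Bio::DB::GFF::RelSegment objects
        types    => [ [ 'exon', 'curated' ], [ 'gene', undef ] ],   # [method, source] pairs
        force    => 0,     # must be true before a database-wide delete is allowed
    };
    # my $count = $db->_delete($delete_spec);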
- -=cut - -sub _delete_features { - my $self = shift; - my @feature_ids = @_; - $self->throw('_delete_features is not implemented in this adaptor'); -} - -sub _delete_groups { - my $self = shift; - my @group_ids = @_; - $self->throw('_delete_groups is not implemented in this adaptor'); -} - -sub _delete { - my $self = shift; - my $delete_options = shift; - $self->throw('_delete is not implemented in this adaptor'); -} - -sub _delete_fattribute_to_features { - my $self = shift; - my @feature_ids = @_; - $self->throw('_delete_fattribute_to_features is not implemented in this adaptor'); -} - - -sub unescape { - my $v = shift; - $v =~ tr/+/ /; - $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; - return $v; -} - -sub print_gff3_warning { - my $self = shift; - print STDERR <$ids,db=>$db,type=>$type},$class; -} - -sub next_seq { - my $self = shift; - my $next = shift @{$self->{ids}}; - return unless $next; - my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next; - my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name) - : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name) - : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name) - : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier"); - $self->throw("id does not exist") unless $segment; - return $segment; -} - -package Bio::DB::GFF::FeatureIterator; - -sub new { - my $self = shift; - my @features = @_; - return bless \@features,ref $self || $self; -} -sub next_seq { - my $self = shift; - return unless @$self; - return shift @$self; -} - - -1; - -__END__ - -=head1 BUGS - -Features can only belong to a single group at a time. This must be -addressed soon. - -Start coordinate can be greater than stop coordinate for relative -addressing. This breaks strict BioPerl compatibility and must be -fixed. - -=head1 SEE ALSO - -L, -L, -L, -L, -L, -L -L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut diff --git a/lib/Bio/DB/GFF/Adaptor/berkeleydb.pm b/lib/Bio/DB/GFF/Adaptor/berkeleydb.pm deleted file mode 100644 index fdf86d8ac..000000000 --- a/lib/Bio/DB/GFF/Adaptor/berkeleydb.pm +++ /dev/null @@ -1,1129 +0,0 @@ -package Bio::DB::GFF::Adaptor::berkeleydb; - - -=head1 NAME - -Bio::DB::GFF::Adaptor::berkeleydb -- Bio::DB::GFF database adaptor for in-memory databases - -=head1 SYNOPSIS - - use Bio::DB::GFF; - my $db = Bio::DB::GFF->new(-adaptor=> 'berkeleydb', - -create => 1, # on initial build you need this - -dsn => '/usr/local/share/gff/dmel'); - - # initialize an empty database, then load GFF and FASTA files - $db->initialize(1); - $db->load_gff('/home/drosophila_R3.2.gff'); - $db->load_fasta('/home/drosophila_R3.2.fa'); - - # do queries - my $segment = $db->segment(Chromosome => '1R'); - my $subseg = $segment->subseq(5000,6000); - my @features = $subseg->features('gene'); - -See L for other methods. - -=head1 DESCRIPTION - -This adaptor implements a berkeleydb-indexed version of Bio::DB::GFF. -It requires the DB_File and Storable modules. It can be used to store -and retrieve short to medium-length GFF files of several million -features in length. - -=head1 CONSTRUCTOR - -Use Bio::DB::GFF-Enew() to construct new instances of this class. 
-Three named arguments are recommended: - - Argument Description - -------- ----------- - - -adaptor Set to "berkeleydb" to create an instance of this class. - - -dsn Path to directory where the database index files will be stored (alias -db) - - -autoindex Monitor the indicated directory path for FASTA and GFF files, and update the - indexes automatically if they change (alias -dir) - - -write Set to a true value in order to update the database. - - -create Set to a true value to create the database the first time - (implies -write) - - -tmp Location of temporary directory for storing intermediate files - during certain queries. - - -preferred_groups Specify the grouping tag. See L - -The -dsn argument selects the directory in which to store the database -index files. If the directory does not exist it will be created -automatically, provided that the current process has sufficient -privileges. If no -dsn argument is specified, a database named "test" -will be created in your system's temporary files directory. - -The -tmp argument specifies the temporary directory to use for storing -intermediate search results. If not specified, your system's temporary -files directory will be used. On Unix systems, the TMPDIR environment -variable is honored. Note that some queries can require a lot of -space. - -The -autoindex argument, if present, selects a directory to be -monitored for GFF and FASTA files (which can be compressed with the -gzip program if desired). Whenever any file in this directory is -changed, the index files will be updated. Note that the indexing can -take a long time to run: anywhere from 5 to 10 minutes for a million -features. An alias for this argument is -dir, which gives this adaptor -a similar flavor to the "memory" adaptor. - --dsn and -dir can point to the same directory. If -dir is given but --dsn is absent the index files will be stored into the directory -containing the source files. For autoindexing to work, you must -specify the same -dir path each time you open the database. - -If you do not choose autoindexing, then you will want to load the -database using the bp_load_gff.pl command-line tool. For example: - - bp_load_gff.pl -a berkeleydb -c -d /usr/local/share/gff/dmel dna1.fa dna2.fa features.gff - -=head1 METHODS - -See L for inherited methods - -=head1 BUGS - -The various get_Stream_* methods and the features() method with the --iterator argument only return an iterator after the query runs -completely and the module has been able to generate a temporary -results file on disk. This means that iteration is not as big a win as -it is for the relational-database adaptors. - -Like the dbi::mysqlopt adaptor, this module uses a binning scheme to -speed up range-based searches. The binning scheme used here imposes a -hard-coded 1 gigabase (1000 Mbase) limit on the size of the largest -chromosome or other reference sequence. - -=head1 SEE ALSO - -L, L - -=head1 AUTHORS - -Vsevolod (Simon) Ilyushchenko Esimonf@cshl.eduE -Lincoln Stein Elstein@cshl.eduE - -Copyright (c) 2005 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
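Putting the arguments above together, a typical autoindexing setup might look like the sketch below; the directory paths are placeholders, and -dsn and -autoindex deliberately point at the same directory, as permitted above.

    use strict;
    use warnings;
    use Bio::DB::GFF;

    my $dir = '/usr/local/share/gff/dmel';   # placeholder path
    my $db  = Bio::DB::GFF->new(
        -adaptor   => 'berkeleydb',
        -dsn       => $dir,     # where the index files are stored
        -autoindex => $dir,     # watch this directory for GFF/FASTA changes
        -tmp       => '/tmp',   # scratch space for large queries
        -create    => 1,        # first run only; implies -write
    );
    my @types = $db->types;     # the feature types now indexed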
- -=cut - -use strict; - -use DB_File; -use File::Path 'mkpath'; -use File::Spec; -use File::Temp 'tempfile'; - -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::GFF::Util::Binning; -use Bio::DB::Fasta; -use Bio::DB::GFF::Adaptor::berkeleydb::iterator; -use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map); - -# this is the smallest bin (1 K) -use constant MIN_BIN => 1000; -# this is the largest that any reference sequence can be (1000 megabases) -use constant MAX_BIN => 1_000_000_000; -use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get - -#We have to define a limit because Berkeleydb sorts in lexicografic order, -#so all the numbers have to have the same length. -use constant MAX_NUM_LENGTH => length(MAX_BIN); - -use base 'Bio::DB::GFF::Adaptor::memory'; - -sub new { - my $class = shift ; - my ($dbdir,$preferred_groups,$autoindex,$write,$create,$tmpdir) = rearrange([ - [qw(DSN DB)], - 'PREFERRED_GROUPS', - [qw(DIR AUTOINDEX)], - [qw(WRITE WRITABLE)], - 'CREATE', - 'TMP', - ],@_); - $tmpdir ||= File::Spec->tmpdir; - $dbdir ||= $autoindex; - $dbdir ||= "$tmpdir/test"; - $write ||= $create; - - my $self = bless {},$class; - $self->dsn($dbdir); - $self->tmpdir($tmpdir); - $self->preferred_groups($preferred_groups) if defined $preferred_groups; - $self->_autoindex($autoindex) if $autoindex; - $self->_open_databases($write,$create); - return $self; -} - -sub _autoindex { - my $self = shift; - my $autodir = shift; - - my $dir = $self->dsn; - my %ignore = map {$_=>1} ($self->_index_file,$self->_data_file, - $self->_fasta_file,$self->_temp_file, - $self->_notes_file, - $self->_timestamp_file); - - my $maxtime = 0; - my $maxfatime = 0; - - opendir (my $D,$autodir) or $self->throw("Couldn't open directory $autodir for reading: $!"); - - while (defined (my $node = readdir($D))) { - next if $node =~ /^\./; - my $path = "$dir/$node"; - next if $ignore{$path}; - next unless -f $path; - my $mtime = _mtime(\*_); # not a typo - $maxtime = $mtime if $mtime > $maxtime; - $maxfatime = $mtime if $mtime > $maxfatime && $node =~ /\.(?:fa|fasta|dna)(?:\.gz)?$/; - } - - close $D; - - my $timestamp_time = _mtime($self->_timestamp_file) || 0; - my $all_files_exist = -e $self->_index_file && -e $self->_data_file && (-e $self->_fasta_file || !$maxfatime); - - # to avoid rebuilding FASTA files if not changed - my $spare_fasta = $maxfatime > 0 && $maxfatime < $timestamp_time && -e $self->_fasta_file; - - if ($maxtime > $timestamp_time || !$all_files_exist) { - print STDERR __PACKAGE__,": Reindexing files in $dir. 
This may take a while....\n"; - $self->do_initialize(1,$spare_fasta); - $self->load_gff($autodir,1); - $self->load_fasta($autodir,1) unless $spare_fasta; - print STDERR __PACKAGE__,": Reindexing done\n"; - } - - else { - $self->_open_databases(); - } - -} - -sub _open_databases { - my $self = shift; - my ($write,$create) = @_; - - my $dsn = $self->dsn; - unless (-d $dsn) { # directory does not exist - $create or $self->throw("Directory $dsn does not exist and you did not specify the -create flag"); - mkpath($dsn) or $self->throw("Couldn't create database directory $dsn: $!"); - } - - my %db; - local $DB_BTREE->{flags} = R_DUP; - $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) }; - my $flags = O_RDONLY; - $flags |= O_RDWR if $write; - $flags |= O_CREAT if $create; - - tie(%db,'DB_File',$self->_index_file,$flags,0666,$DB_BTREE) - or $self->throw("Couldn't tie ".$self->_index_file.": $!"); - $self->{db} = \%db; - $self->{data} = FeatureStore->new($self->_data_file,$write,$create); - - if (-e $self->_fasta_file) { - my $dna_db = Bio::DB::Fasta->new($self->_fasta_file) or $self->throw("Can't reindex sequence file: $@"); - $self->dna_db($dna_db); - } - - my $mode = $write ? "+>>" - : $create ? "+>" - : "<"; - - my $notes_file = $self->_notes_file; - open my $F, $mode, $notes_file or $self->throw("Could not open file '$notes_file': $!"); - $self->{notes} = $F; -} - -sub _close_databases { - my $self = shift; - delete $self->{db}; - delete $self->{data}; - delete $self->{notes}; -} - -sub _delete_features { - my $self = shift; - my @feature_ids = @_; - my $removed = 0; - my $last_id = $self->{data}->last_id; - for my $id (@feature_ids) { - next unless $id >= 0 && $id < $last_id; - my $feat = $self->{data}->get($id) or next; - $self->{data}->remove($id); - $self->_bump_class_count($feat->{gclass},-1); - my @keys = $self->_secondary_keys($feat); - $self->db->del_dup($_,$id) foreach @keys; - $removed++; - } - $removed; -} - -sub _secondary_keys { - my $self = shift; - my $feat = shift; - return ( - "__name__".lc(join ":",$feat->{gclass},$feat->{gname}), - "__bin__".lc("$feat->{ref}$;$feat->{bin}"), - "__type__".join(':',$feat->{method},$feat->{source}), - map {"__attr__".lc(join(':',$_->[0],$_->[1]))} @{$feat->{attributes}} - ); -} - -sub _delete { - my $self = shift; - my $delete_spec = shift; - return $self->SUPER::_delete($delete_spec) if @{$delete_spec->{segments}} or @{$delete_spec->{types}}; - $self->throw("This operation would delete all feature data and -force not specified") - unless $delete_spec->{force}; - my $deleted = $self->{db}{__count__}; - $self->{data} = FeatureStore->new($self->_data_file,1,1); - %{$self->{db}} = (); - $deleted; -} - -# with duplicates enabled, we cannot simply do $db->{__index__}++; -sub _bump_feature_count { - my $self = shift; - my $db = $self->{db}; - if (@_) { - delete $db->{__count__}; - return $db->{__count__} = shift; - } else { - my $index = ${db}->{__count__}; - delete $db->{__count__}; - $db->{__count__} = ($index || 0) + 1; - return $index; - } -} - -sub _bump_class_count { - my $self = shift; - my ($class,$count) = @_; - $count ||= 1; - my $db = $self->{db}; - my $key = "__class__$class"; - my $newcount = ($db->{$key} || 0) + $count; - delete $db->{$key}; - $db->{$key} = $newcount; -} - -sub classes { - my $self = shift; - my $db = $self->db; - my ($key,$value) = ('__class__',undef); - my %classes; - for (my $status = $db->seq($key,$value,R_CURSOR); - $status == 0; - $status = $db->seq($key,$value,R_NEXT)) { - my ($class) = $key =~ 
/^__class__(.+)/ or last; - $classes{$class}++ if $value > 0; - } - my @classes = sort keys %classes; - return @classes; -} - -sub do_initialize { - my $self = shift; - my $erase = shift; - my $spare_fasta = shift; # used internally only! - if ($erase) { - $self->_close_databases; - unlink $self->_index_file; - unlink $self->_data_file; - unlink $self->_notes_file; - unless ($spare_fasta) { - unlink $self->_fasta_file; - unlink $self->_fasta_file.'.index'; - } - unlink $self->_timestamp_file; - $self->_open_databases(1,1); - } - 1; -} - -# load_sequence($fasta_filehandle,$first_sequence_id) -sub load_sequence { - my $self = shift; - my ($io_handle,$id) = @_; - my $file = $self->_fasta_file; - my $loaded = 0; - - open my $F, '>>', $file or $self->throw("Could not append file '$file': $!"); - - if (defined $id) { - print $F ">$id\n"; - $loaded++; - } - - while (<$io_handle>) { - $loaded++ if /^>/; - print $F $_; - } - close $F; - my $dna_db = Bio::DB::Fasta->new($file) or $self->throw("Can't reindex sequence file: $@"); - $self->dna_db($dna_db); - $self->_touch_timestamp; - return $loaded; -} - -sub _mtime { - my $file = shift; - my @stat = stat($file); - return $stat[9]; -} - -sub _index_file { - my $self = shift; - return $self->dsn . "/bdb_features.btree"; -} - -sub _data_file { - my $self = shift; - return $self->dsn . "/bdb_features.data"; -} - -sub _fasta_file { - my $self = shift; - return $self->dsn . "/bdb_sequence.fa"; -} - -sub _notes_file { - my $self = shift; - return $self->dsn . "/bdb_notes.idx"; -} - -sub _temp_file { - my $self = shift; - local $^W=0; - my (undef,$filename) = tempfile("bdb_temp_XXXXXX",DIR=>$self->tmpdir,OPEN=>0); - return $filename; -} - -sub _timestamp_file { - my $self = shift; - return $self->dsn ."/bdb_timestamp"; -} - -sub db { - my $db = shift()->{db} or return; - return tied(%$db); -} - -sub dsn { - my $self = shift; - my $d = $self->{dsn}; - $self->{dsn} = shift if @_; - $d; -} - -sub tmpdir { - my $self = shift; - my $d = $self->{tmpdir}; - $self->{tmpdir} = shift if @_; - $d; -} - -sub load_gff_line { - - my ($self, $feat) = @_; - - $feat->{strand} = '' if $feat->{strand} && $feat->{strand} eq '.'; - $feat->{phase} = '' if $feat->{phase} && $feat->{phase} eq '.'; - - my $start = $feat->{start}; - my $stop = $feat->{stop}; - my $type = join(':',$feat->{method},$feat->{source}); - - my $bin = bin($feat->{start},$feat->{stop},MIN_BIN); - $feat->{bin} = $bin; - - my $id = $self->{data}->put($feat); - $bin = $self->normalizeNumber($bin); - - my $db = $self->{db}; - for my $skey ($self->_secondary_keys($feat)) { - $db->{$skey} = $id; - } - - # save searchable notes to separate index - my $fh = $self->{notes}; - my @notes = map {$_->[1]} grep {lc $_->[0] eq 'note'} @{$feat->{attributes}}; - print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating indexes: $!") - foreach @notes; - - $self->{records_loaded}++; - $self->_bump_feature_count(); - $self->_bump_class_count($feat->{gclass}); - -} - -# do nothing! 
-sub setup_load { - my $self = shift; - $self->{records_loaded} = 0; - 1; -} - -sub finish_load { - my $self = shift; - $self->db->sync && $self->throw("An error occurred while updating indexes: $!"); - $self->_touch_timestamp; - $self->{records_loaded}; -} - -sub _touch_timestamp { - my $self = shift; - my $tsf = $self->_timestamp_file; - open my $F, '>', $tsf or $self->throw("Could not write file '$tsf': $!"); - print $F scalar(localtime); - close $F; -} - - -# given sequence name, return (reference,start,stop,strand) -sub get_abscoords { - my $self = shift; - my ($name,$class,$refseq) = @_; - my %refs; - my $regexp; - - if ($name =~ /[*?]/) { # uh oh regexp time - $name = quotemeta($name); - $name =~ s/\\\*/.*/g; - $name =~ s/\\\?/.?/g; - $regexp++; - } - # Find all features that have the requested name and class. - # Sort them by reference point. - my @features = @{$self->retrieve_features(-table => 'name', -key=>"$class:$name")}; - if (!@features) { # nothing matched exactly, so try aliases - @features = @{$self->retrieve_features(-table=>'attr',-key=>"Alias:$name")}; - } - - foreach my $feature (@features){ - push @{$refs{$feature->{ref}}},$feature; - } - - # find out how many reference points we recovered - if (! %refs) { - $self->error("$name not found in database"); - return; - } - - # compute min and max - my ($ref) = keys %refs; - my @found = @{$refs{$ref}}; - my ($strand,$start,$stop); - - my @found_segments; - foreach my $ref (keys %refs) { - next if defined($refseq) and $ref ne $refseq; - my @found = @{$refs{$ref}}; - my ($strand,$start,$stop,$name); - foreach (@found) { - $strand ||= $_->{strand}; - $strand = '+' if $strand && $strand eq '.'; - $start = $_->{start} if !defined($start) || $start > $_->{start}; - $stop = $_->{stop} if !defined($stop) || $stop < $_->{stop}; - $name ||= $_->{gname}; - } - push @found_segments,[$ref,$class,$start,$stop,$strand,$name]; - - } - - return \@found_segments; -} - -sub get_types { - my $self = shift; - my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_; - my (%obj,%result,$key,$value); - $key = "__type__"; - - if (!$srcseq) { # optimized full type list - my $db = $self->db; - my $status = $db->seq($key,$value,R_CURSOR); - - while ($status == 0 && $key =~ /^__type__(.+)/) { - my $type = $1; - my ($method,$source) = split ':',$type; - $obj{$type} = Bio::DB::GFF::Typename->new($method,$source); - $result{$type}++; - - if ($want_count) { - $status = $db->seq($key,$value,R_NEXT); - } else { # skip to next key set - $key .= "\0"; - $status = $db->seq($key,$value,R_CURSOR) - } - - } - } - - else { # range search - for my $feature (@{$self->_get_features_by_search_options( - {rangetype => 'overlaps', - refseq => $srcseq, - refclass => ($class || undef), - start => ($start || undef), - stop => ($stop || undef), - }, - {} - )} - ) { - my $type = Bio::DB::GFF::Typename->new($feature->{method},$feature->{source}); - $obj{$type} = $type; - $result{$type}++; - } - } - - return $want_count ? %result : values %obj; -} - - -# Low level implementation of fetching a named feature. -# GFF annotations are named using the group class and name fields. -# May return zero, one, or several Bio::DB::GFF::Feature objects. - -=head2 _feature_by_name - - Title : _feature_by_name - Usage : $db->get_features_by_name($class,$name,$callback) - Function: get a list of features by name and class - Returns : count of number of features retrieved - Args : name of feature, class of feature, and a callback - Status : protected - -This method is used internally. 
The callback arguments are those used -by make_feature(). - -=cut - -sub _feature_by_name { - my $self = shift; - my ($class,$name,$location,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - #use Devel::StackTrace; - #warn Devel::StackTrace->new->as_string; - - my $count = 0; - my $id = -1; - my ($use_regexp, $use_glob,$using_alias_search); - - if ($name =~ /[*?]/) { # uh oh regexp time - - #If there is only one trailing *, do a range search - if ($name =~ /^([^\*]+)\*$/) { - $name = $1; - $use_glob++; - } - - else { - $name = quotemeta($name); - $name =~ s/\\\*/.*/g; - $name =~ s/\\\?/.?/g; - $use_regexp++; - } - } - - my @features; - if ($use_glob) { - my $callback = sub {my $feat = shift; $feat->{gname} =~ /^$name/i}; - @features = @{$self->retrieve_features_range (-table => 'name', - -start => "$class:$name", - -do_while => $callback) - }; - } - elsif ($use_regexp) { - my $filter = sub {my $feat = shift; $feat->{gname} =~ /$name/i}; - @features = @{$self->filter_features(-table =>'name', -filter => $filter)}; - } - - else { - @features = @{$self->retrieve_features(-table=>'name', -key => "$class:$name")}; - } - - unless (@features) { - $using_alias_search++; - @features = @{$self->retrieve_features(-table=>'attr', -key=>"Alias:$name")}; - } - - foreach my $feature (@features){ - $id++; - next unless $using_alias_search || $feature->{gclass} eq $class; - - if ($location) { - next if $location->[0] ne $feature->{ref}; - next if $location->[1] && $location->[1] > $feature->{stop}; - next if $location->[2] && $location->[2] < $feature->{start}; - } - $count++; - - $callback->(@{$feature}{@hash2array_map},0); - } - return $count; -} - -#sub get_feature_by_attribute{ -sub _feature_by_attribute{ - my $self = shift; - my ($attributes,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - my $count = 0; - my $feature_group_id = undef; - - #there could be more than one set of attributes...... - while (my ($key, $value) = each %$attributes) { - - my @features = @{$self->retrieve_features - (-table => "attr", -key => "$key:$value")}; - - for my $feature (@features) { - $callback->(@{$feature}{@hash2array_map},$feature_group_id); - $count++; - } - } - -} - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - $search_string =~ tr/*?//d; - - my @results; - - my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g; - my $search = join '|',@words; - - my (%found,$found); - my $note_index = $self->{notes}; - seek($note_index,0,0); # back to start - while (<$note_index>) { - next unless /$search/; - chomp; - my ($note,$uu) = split "\t"; - $found{unpack("u*",$uu)}++; - last if $limit && ++$found >= $limit; - } - - my (@features, @matches); - for my $idx (keys %found) { - my $feature = $self->{data}->get($idx) or next; - my @attributes = @{$feature->{attributes}}; - my @values = map {lc $_->[0] eq 'note' ? 
$_->[1] : ()} @attributes; - my $value = "@values"; - - my $hits; - $hits++ while $value =~ /($search)/ig; # count the number of times we were hit - push @matches,$hits; - push @features,$feature; - } - - for (my $i=0; $i<@matches; $i++) { - my $feature = $features[$i]; - my $matches = $matches[$i]; - - my $relevance = 10 * $matches; - my $featname = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname}); - my $type = Bio::DB::GFF::Typename->new($feature->{method}=>$feature->{source}); - my $note; - $note = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @{$feature->{attributes}}; - push @results,[$featname,$note,$relevance,$type]; - } - - return @results; -} - -sub _get_features_by_search_options { - - #The $data argument is not used and is preserved for superclass compatibility - my ($self, $search,$options) = @_; - my $count = 0; - - my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$temp_file) = - (@{$search}{qw(rangetype refseq refclass start stop types)}, - @{$options}{qw(sparse sort_by_group ATTRIBUTES temp_file)}) ; - - $start = 0 unless defined($start); - $stop = MAX_BIN unless defined($stop); - - my $bin = bin($start,$stop,MIN_BIN); - $bin = $self->normalizeNumber($bin); - - my ($results,@features,%found,%results_table); - - if ($temp_file) { - local $DB_BTREE->{flags} = R_DUP; - # note: there is a race condition possible here, if someone reuses the - # same name between the time we get the tmpfile name and the time we - # ask DB_File to open it. - tie(%results_table,'DB_File',$temp_file,O_RDWR|O_CREAT,0666,$DB_BTREE) - or $self->throw("Couldn't tie temporary file ".$temp_file." for writing: $!"); - $results = \%results_table; - } else { - $results = \@features; - } - - my $filter = sub { - my $feature = shift; - - my $ref = $feature->{ref}; - my $feature_start = $feature->{start}; - my $feature_stop = $feature->{stop}; - my $feature_id = $feature->{feature_id}; - - return 0 if $found{$feature_id}++; - - if (defined $refseq) { - return 0 unless lc $refseq eq lc $ref; - $start = 0 unless defined($start); - $stop = MAX_SEGMENT unless defined($stop); - - if ($rangetype eq 'overlaps') { - return 0 unless $feature_stop >= $start && $feature_start <= $stop; - } elsif ($rangetype eq 'contains') { - return 0 unless $feature_start >= $start && $feature_stop <= $stop; - } elsif ($rangetype eq 'contained_in') { - return 0 unless $feature_start <= $start && $feature_stop >= $stop; - } else { - return 0 unless $feature_start == $start && $feature_stop == $stop; - } - } - - my $feature_source = $feature->{source}; - my $feature_method = $feature->{method}; - - if (defined $types && @$types){ - return 0 unless $self->_matching_typelist($feature_method,$feature_source,$types); - } - - my $feature_attributes = $feature->{attributes}; - if (defined $attributes){ - return 0 unless $self->_matching_attributes($feature_attributes,$attributes); - } - - return 1; - }; - - if (defined $refseq && !$sparse) { - my $tier = MAX_BIN; - while ($tier >= MIN_BIN) { - my ($tier_start,$tier_stop) = (bin_bot($tier,$start),bin_top($tier,$stop)); - # warn "Using $tier_start $tier_stop\n"; - if ($tier_start == $tier_stop) { - $self->retrieve_features(-table => "bin", - -key => "$refseq$;$tier_start", - -filter => $filter, - -result => $results); - } else { - my $callback = sub {my $feat = shift; $feat->{bin} <= $tier_stop}; - $self->retrieve_features_range(-table => "bin", - -start => "$refseq$;$tier_start", - -do_while => $callback, - -filter => $filter, - -result => 
$results); - } - $tier /= 10; - } - } - - elsif (@$types) { - foreach (@$types) { - my $type = join ':',@$_; - $self->retrieve_features_range(-table => 'type', - -start => $type, - -filter => $filter, - -do_while => sub { my $f = shift; - lc($f->{method}) eq lc($_->[0]) - && - lc($f->{source}||$_->[1]||'') eq lc($_->[1]||'') - }, - -result => $results); - } - } - - elsif (defined $attributes) { - my ($attribute_name,$attribute_value) = each %$attributes; # pick first one - $self->retrieve_features(-table => 'attr', - -key => "${attribute_name}:${attribute_value}", - -filter => $filter, - -result => $results); - } - - else { - $self->filter_features(-filter => $filter,-result=>$results); - } - - return $results; -} - -sub retrieve_features { - my $self = shift; - my ($table, $key, $filter, $result) = rearrange(['TABLE','KEY','FILTER', 'RESULT'],@_); - - my @result; - $result ||= \@result; - - my $frozen; - my @ids = $self->db->get_dup("__".lc($table)."__".lc($key)); - my $data = $self->{data}; - local $^W = 0; # because _hash_to_array() will generate lots of uninit values - - foreach my $id (@ids) { - my $feat = $data->get($id); - my $filter_result = $filter ? $filter->($feat) : 1; - next unless $filter_result; - if (ref $result eq 'HASH') { - $result->{"$feat->{gclass}:$feat->{gname}"} = join ($;,$self->_hash_to_array($feat)); - } else { - push @$result, $feat; - } - last if $filter_result == -1; - } - return $result; -} - -sub retrieve_features_range { - my ($self) = shift; - my ($table, $start, $do_while, $filter, $result) = rearrange(['TABLE','START','DO_WHILE', 'FILTER', 'RESULT'],@_); - local $^W = 0; # because _hash_to_array will generate lots of uninit warnings - - my @result; - $result ||= \@result; - my ($id, $key, $value); - - $key = "__".$table."__".$start; - my $db = $self->db; - - for (my $status = $db->seq($key,$value,R_CURSOR); - $status == 0; - $status = $db->seq($key,$value,R_NEXT)) { - - my $feat = $self->{data}->get($value); - last unless $do_while->($feat,$key); - - my $filter_result = $filter ? $filter->($feat) : 1; - next unless $filter_result; - - if (ref $result eq 'HASH') { - $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat)); - } else { - push @$result,$feat; - } - last if $filter_result == -1; - } - - return $result; -} - - -sub filter_features { - my ($self) = shift; - - my ($filter,$result) = rearrange(['FILTER','RESULT'],@_); - - my @result; - $result ||= \@result; - - my ($key, $frozen); - my $data = $self->{data}; - $data->reset; - while (my $feat = $data->next) { - - my $filter_result = $filter ? $filter->($feat) : 1; - next unless $filter_result; - - if (ref($result) eq 'HASH') { - $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat)); - } else { - push @$result,$feat; - } - last if $filter_result == -1; - } - - return $result; -} - - -sub _basic_features_by_id{ - my $self = shift; - my ($ids) = @_; - - $ids = [$ids] unless ref $ids =~ /ARRAY/; - - my @result; - my $data = $self->{data}; - for my $feature_id (@$ids){ - push @result, $data->get($feature_id); - } - - return wantarray() ? 
@result : $result[0]; -} - -sub normalizeNumber { - my ($self, $num) = @_; - while ((length $num) < MAX_NUM_LENGTH) - { - $num = "0".$num; - } - return $num; -} - -sub get_features_iterator { - my $self = shift; - - my ($search,$options,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - $options->{temp_file} = $self->_temp_file; - - my $results = $self->_get_features_by_search_options($search,$options); - return Bio::DB::GFF::Adaptor::berkeleydb::iterator->new($results,$callback,$options->{temp_file}); -} - -#--------------------------------------------------------------------------# - -package FeatureStore; - -# This is a very specialized package that stores serialized features onto a file-based -# array. The array is indexed by the physical offset to the beginning of each serialized -# feature. - -use strict; -use Fcntl qw(SEEK_SET SEEK_END); -use base 'Bio::Root::Root'; -use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map); - -sub new { - my $class = shift; - my $dbname = shift or $class->throw("must provide a filepath argument"); - my ($write,$create) = @_; - - my $mode = $create ? "+>" - : $write ? "+>>" - : "<"; - - open my $F, $mode, $dbname or $class->throw("Could not open file '$dbname': $!"); - my $self = bless { - fh => $F, - next_idx => 0, - last_id => 0, - },$class; - return $self; -} - -sub put { - my $self = shift; - my $feature = shift; - my $fh = $self->{fh}; - seek($fh,0,SEEK_END); - my $offset = tell($fh) || 0; - - $self->{last_id} = $offset; - - my $id = pack("L",$offset); - $feature->{feature_id} = $id; - my $value = feature2string($feature); - print $fh pack("n/a*",$value) or $self->throw("An error occurred while updating the data file: $!"); - - - return $id; -} - -sub last_id { - shift->{last_id}; -} - -sub get { - my $self = shift; - my $idx = shift; - my $offset = unpack("L",$idx); - my $fh = $self->{fh}; - - my ($value,$length); - $offset ||= 0; - seek($fh,$offset,SEEK_SET); - return unless read($fh,$length,2); - return unless read($fh,$value,unpack("n",$length)); - $self->{next_idx} = tell($fh); - return if substr($value,0,1) eq "\0"; - return string2feature($value); -} - -sub next { - my $self = shift; - my $fh = $self->{fh}; - my $result; - do { - $result = $self->get(pack("L",$self->{next_idx})); - } until $result || eof($fh); - $self->{next_idx} = 0 unless $result; - $result; -} - -sub remove { - my $self = shift; - my $id = shift; - my $offset = unpack("L",$id); - my $fh = $self->{fh}; - my ($value,$length); - seek($fh,$offset,SEEK_SET); - return unless read($fh,$length,2); - print $fh "\0"x$length; # null it out - 1; -} - -sub _seek { - my $self = shift; - my $idx = shift; - my $offset = unpack("L",$idx); - seek($self->{fh},$offset,SEEK_SET); - $self->{next_idx} = tell($self->{fh}); -} - -sub reset { - my $self = shift; - $self->_seek(pack("L",0)); -} - -sub _feature2string { - my $feature = shift; - my @a = @{$feature}{@hash2array_map}; - push @a,map {@$_} @{$feature->{attributes}} if $feature->{attributes}; - return join $;,@a; -} - -sub _string2feature { - my $string = shift; - my (%feature,@attributes); - - (@feature{@hash2array_map},@attributes) = split $;,$string; - while (@attributes) { - my ($key,$value) = splice(@attributes,0,2); - push @{$feature{attributes}},[$key,$value]; - } - $feature{group_id} = undef; - \%feature; -} - - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/berkeleydb/iterator.pm b/lib/Bio/DB/GFF/Adaptor/berkeleydb/iterator.pm deleted file mode 100644 
index 94b32a10c..000000000 --- a/lib/Bio/DB/GFF/Adaptor/berkeleydb/iterator.pm +++ /dev/null @@ -1,83 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Adaptor::berkeleydb::iterator - iterator for Bio::DB::GFF::Adaptor::berkeleydb - -=head1 SYNOPSIS - -For internal use only - -=head1 DESCRIPTION - -This is an internal module that is used by the Bio::DB::GFF in-memory -adaptor to return an iterator across a sequence feature query. The -object has a single method, next_feature(), that returns the next -feature from the query. The method next_seq() is an alias for -next_feature(). - -=head1 BUGS - -None known yet. - -=head1 SEE ALSO - -L, - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -package Bio::DB::GFF::Adaptor::berkeleydb::iterator; -use strict; -use DB_File qw(R_FIRST R_NEXT); - -# this module needs to be cleaned up and documented -use Bio::Root::Version; -*next_seq = \&next_feature; - -sub new { - my $class = shift; - my ($data,$callback,$tmpfile) = @_; - return bless {data => $data, - callback => $callback, - tmpfile => $tmpfile, - cache => []},$class; -} - -sub next_feature { - my $self = shift; - return shift @{$self->{cache}} if @{$self->{cache}}; - - my $data = $self->{data} or return; - my $callback = $self->{callback}; - - my $features; - my $db = tied(%$data); - my ($key,$value); - - for (my $status = $db->seq($key,$value,$self->{iter}++ ? R_NEXT : R_FIRST); - $status == 0; - $status = $db->seq($key,$value,R_NEXT)) { - my @feature = split ($;,$value); - $features = $callback->(@feature); - last if $features; - } - - unless ($features) { - $features = $callback->(); - undef $self->{data}; - undef $self->{cache}; - unlink $self->{tmpfile}; - } - - $self->{cache} = $features or return; - shift @{$self->{cache}}; -} - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/biofetch.pm b/lib/Bio/DB/GFF/Adaptor/biofetch.pm deleted file mode 100644 index b49ed2257..000000000 --- a/lib/Bio/DB/GFF/Adaptor/biofetch.pm +++ /dev/null @@ -1,355 +0,0 @@ -package Bio::DB::GFF::Adaptor::biofetch; -#$Id$ -=head1 NAME - -Bio::DB::GFF::Adaptor::biofetch -- Cache BioFetch objects in a Bio::DB::GFF database - -=head1 SYNOPSIS - -Proof of principle. Not for production use. - -=head1 DESCRIPTION - -This adaptor is a proof-of-principle. It is used to fetch BioFetch -sequences into a Bio::DB::GFF database (currently uses a hard-coded -EMBL database) as needed. This allows the Generic Genome Browser to -be used as a Genbank/EMBL browser. - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright 2002 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -use strict; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::BioFetch; -use Bio::SeqIO; - -use vars qw(%preferred_tags); - -# THIS IS WRONG: biofetch should delegate to an underlying -# database adaptor, and not inherit from one. 
-use base qw(Bio::DB::GFF::Adaptor::dbi::mysql); - -# priority for choosing names of CDS tags, higher is higher priority -%preferred_tags = ( - strain => 10, - organism => 20, - protein_id => 40, - locus_tag => 50, - locus => 60, - gene => 70, - standard_name => 80, - ); - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(-adaptor=>'biofetch',@args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : -adaptor : required. Which adaptor to use; biofetch for mysql, biofetch_oracle for Oracle - -preferred_tags : optional. A hash of {classname => weight,...} - used to determine the class and name of the feature - when a choice of possible feature classes is available - (e.g. a feature has both a 'gene' and a 'locus' tag). - Common defaults are provided that work well for eukaryotic - features (but not well for viral/prokaryotic) - see below for additional arguments. - Status : Public - -This is the constructor for the adaptor. It is called automatically -by Bio::DB::GFF-Enew. In addition to arguments that are common among -all adaptors, the following class-specific arguments are recgonized: - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' - - -user username for authentication - - -pass the password for authentication - - -proxy [['http','ftp'],'http://proxy:8080'] - - -source source to use for loaded features ('EMBL') - --dsn,-user and -pass indicate the local database to cache results in, -and as are per Bio::DB::GFF::Adaptor::dbi. The -proxy argument allows -you to set the biofetch web proxy, and uses the same syntax described -for the proxy() method of L, except that the -argument must be passed as an array reference. - -=cut - -sub new { - my $class = shift; - my $self = $class->SUPER::new(@_); - my ($preferred,$proxy,$source) = rearrange(['PREFERRED_TAGS','PROXY','SOURCE'],@_); - - # if the caller sent their own preferences, then use these, otherwise use defaults. - $self->_preferred_tags($preferred ? $preferred : \%preferred_tags); - $self->_source($source || 'EMBL'); - - if ($proxy) { - my @args = ref($proxy) ? 
@$proxy : eval $proxy; - $self->{_proxy} = \@args if @args; - } - $self; -} - -sub segment { - my $self = shift; - my @segments = $self->SUPER::segment(@_); - - if (!@segments) { - my $refclass = $self->refclass; - - my %args = $self->setup_segment_args(@_); - if ($args{-class} && $args{-class} =~ /$refclass/oi) { - return unless $self->load_from_embl('embl'=>$args{-name}); - @segments = $self->SUPER::segment(@_); - } elsif ($args{-class} && $args{-class} =~ /refseq|swall|embl/i) { #hack to get refseq names - return unless $self->load_from_embl(lc($args{-class})=>$args{-name}); - $args{-class} = $self->refclass; - @segments = $self->SUPER::segment(%args); - } - } - - $self->_multiple_return_args(@segments); -} - -# default is to return 'Sequence' as the class of all references -sub refclass { - my $self = shift; - my $refname = shift; - 'Sequence'; -} - -sub load_from_embl { - my $self = shift; - my $db = shift; - my $acc = shift or $self->throw('Must provide an accession ID'); - - my $biofetch; - if ($self->{_biofetch}{$db}) { - $biofetch = $self->{_biofetch}{$db}; - } else { - $biofetch = $self->{_biofetch}{$db} = Bio::DB::BioFetch->new(-db=>$db); - $biofetch->retrieval_type('tempfile'); - $biofetch->proxy(@{$self->{_proxy}}) if $self->{_proxy}; - } - - my $seq = eval {$biofetch->get_Seq_by_id($acc)} or return; - $self->_load_embl($acc,$seq); - 1; -} - -sub load_from_file { - my $self = shift; - my $file = shift; - - my $format = $file =~ /\.(gb|genbank|gbk)$/i ? 'genbank' : 'embl'; - - my $seqio = Bio::SeqIO->new( '-format' => $format, -file => $file); - my $seq = $seqio->next_seq; - - $self->_load_embl($seq->accession,$seq); - 1; -} - -sub _load_embl { - my $self = shift; - my $acc = shift; - my $seq = shift; - my $refclass = $self->refclass; - my $locus = $seq->id; - my $source = $self->_source; - - # begin loading - $self->setup_load(); - - # first synthesize the entry for the top-level feature - my @aliases; - foreach ($seq->accession,$seq->get_secondary_accessions) { - next if lc($_) eq lc($acc); - push @aliases,[Alias => $_]; - } - $self->load_gff_line( - { - ref => $acc, - class => $refclass, - source => $source, -# method => 'origin', - method => 'region', - start => 1, - stop => $seq->length, - score => undef, - strand => '.', - phase => '.', - gclass => $self->refclass, - gname => $acc, - tstart => undef, - tstop => undef, - attributes => [[Note => $seq->desc],@aliases], - } - ); - # now load each feature in turn - my ($transcript_version,$mRNA_version) = (0,0); - for my $feat ($seq->all_SeqFeatures) { - my $attributes = $self->get_attributes($feat); - my $name = $self->guess_name($attributes); - - my $location = $feat->location; - my @segments = map {[$_->start,$_->end,$_->seq_id]} - $location->can('sub_Location') ? $location->sub_Location : $location; - -# this changed CDS to coding, but that is the wrong thing to do, since -# CDS is in SOFA and coding is not -# my $type = $feat->primary_tag eq 'CDS' ? 
'coding' -# : $feat->primary_tag; - my $type= $feat->primary_tag; - next if (lc($type) eq 'contig'); -# next if (lc($type) eq 'variation'); - - if (lc($type) eq 'variation' and $feat->length == 1) { - $type = 'SNP'; - } elsif (lc($type) eq 'variation' ) { - $type = 'chromosome_variation'; - } - - if ($type eq 'source') { - $type = 'region'; - } - - if ($type =~ /misc.*RNA/i) { - $type = 'RNA'; - } - - if ($type eq 'misc_feature' and $name->[1] =~ /similar/i) { - $type = 'computed_feature_by_similarity'; - } elsif ($type eq 'misc_feature') { - warn "skipping a misc_feature\n"; - next; - } - - my $parttype = $feat->primary_tag eq 'mRNA' ? 'exon' : $feat->primary_tag; - - if ($type eq 'gene') { - $transcript_version = 0; - $mRNA_version = 0; - } elsif ($type eq 'mRNA') { - $name->[1] = sprintf("%s.t%02d",$name->[1],++$transcript_version); - } elsif ($type eq 'CDS') { - $name->[0] = 'mRNA'; - $name->[1] = sprintf("%s.t%02d",$name->[1],$transcript_version); - } - - my $strand = $feat->strand; - my $str = defined $strand ? - ($strand > 0 ? '+' : '-') - : '.'; - $self->load_gff_line( { - ref => $acc, - class => $refclass, - source => $source, - method => $type, - start => $location->start, - stop => $location->end, - score => $feat->score || undef, - strand => $str, - phase => $feat->frame || '.', - gclass => $name->[0], - gname => $name->[1], - tstart => undef, - tstop => undef, - attributes => $attributes, - } - ) if ($type && - ($type ne 'CDS'||($type eq 'CDS'&&@segments==1) ) ); - - @$attributes = (); - - next if @segments == 1; - for my $segment (@segments) { - - my $strand = $feat->strand; - my $str = defined $strand ? - ($strand > 0 ? '+' : '-') - : '.'; - $self->load_gff_line( { - ref => $segment->[2] eq $locus ? $acc : $segment->[2], - class => $refclass, - source => $source, - method => $parttype, - start => $segment->[0], - stop => $segment->[1], - score => $feat->score || undef, - strand => $str, - phase => $feat->frame || '.', - gclass => $name->[0], - gname => $name->[1], - tstart => undef, - tstop => undef, - attributes => $attributes, - } - ); - } - - } - - # finish loading - $self->finish_load(); - - # now load the DNA - $self->load_sequence_string($acc,$seq->seq); - - 1; -} - -sub get_attributes { - my $self = shift; - my $seq = shift; - - my @tags = $seq->all_tags or return; - my @result; - foreach my $tag (@tags) { - foreach my $value ($seq->each_tag_value($tag)) { - push @result,[$tag=>$value]; - } - } - \@result; -} - -sub guess_name { - my $self = shift; - my $attributes = shift; -# remove this fix when Lincoln fixes it properly - return ["Misc" => "Misc"] unless ($attributes); # these are arbitrary, and possibly destructive defaults - my @ordered_attributes = sort {($self->_preferred_tags->{$a->[0]} || 0) <=> ($self->_preferred_tags->{$b->[0]} || 0)} @$attributes; - my $best = pop @ordered_attributes; - @$attributes = @ordered_attributes; - return $best; -} - - -sub _preferred_tags { - my $self = shift; - $self->{preferred_tags} = shift if @_; - return $self->{preferred_tags}; -} - -sub _source { - my $self = shift; - $self->{source} = shift if @_; - $self->{source}; -} - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/biofetch_oracle.pm b/lib/Bio/DB/GFF/Adaptor/biofetch_oracle.pm deleted file mode 100644 index 7f8e0ef8a..000000000 --- a/lib/Bio/DB/GFF/Adaptor/biofetch_oracle.pm +++ /dev/null @@ -1,301 +0,0 @@ -package Bio::DB::GFF::Adaptor::biofetch_oracle; - -#$Id$ - -=head1 NAME - -Bio::DB::GFF::Adaptor::biofetch_oracle -- Cache BioFetch objects in a Bio::DB::GFF database - 
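Both biofetch adaptors (the one above and the Oracle variant whose documentation begins here) choose a feature's class and name by tag weight. As a rough sketch in the spirit of guess_name() (the weights, tags, and helper name best_name are illustrative), the highest-weighted tag wins:

    use strict;
    use warnings;

    my %weight = ( protein_id => 40, locus_tag => 50, gene => 70, standard_name => 80 );

    # Pick the [tag => value] pair with the highest weight.
    sub best_name {
        my @attributes = @_;
        my @sorted = sort { ( $weight{ $a->[0] } || 0 ) <=> ( $weight{ $b->[0] } || 0 ) } @attributes;
        return $sorted[-1];
    }

    my $best = best_name( [ locus_tag => 'b0001' ], [ gene => 'thrL' ] );
    # $best is [ 'gene', 'thrL' ], because gene (70) outweighs locus_tag (50)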
-=head1 SYNOPSIS - -Proof of principle. Not for production use. - -=head1 DESCRIPTION - -This adaptor is a proof-of-principle. It is used to fetch BioFetch -sequences into a Bio::DB::GFF database (currently uses a hard-coded -EMBL database) as needed. This allows the Generic Genome Browser to -be used as a Genbank/EMBL browser. - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright 2002 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -use strict; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::BioFetch; -use Bio::SeqIO; - -use vars qw(%default_preferred_tags); -use base qw(Bio::DB::GFF::Adaptor::dbi::oracle); - -# priority for choosing names of CDS tags, higher is higher priority -%default_preferred_tags = ( - strain => 10, - organism => 20, - protein_id => 40, - locus_tag => 50, - locus => 60, - gene => 70, - standard_name => 80, - ); - -sub _preferred_tags { - my ($self, $tags) = @_; - if ($tags && (ref($tags) =~ /HASH/)){ - $self->{preferred_tags} = $tags; - } - return $self->{preferred_tags}; -} - - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(-adaptor=>'biofetch_oracle', -preferred_tags => \%preferred, @args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : -adaptor : required. Which adaptor to use; biofetch for mysql, biofetch_oracle for Oracle - -preferred_tags : optional. A hash of {classname => weight,...} - used to determine the class and name of the feature - when a choice of possible feature classes is available - (e.g. a feature has both a 'gene' and a 'locus' tag). - Common defaults are provided that work well for eukaryotic - features (but not well for viral/prokaryotic) - see below for additional arguments. - Status : Public - -This is the constructor for the adaptor. It is called automatically -by Bio::DB::GFF-Enew. In addition to arguments that are common among -all adaptors, the following class-specific arguments are recgonized: - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' - - -user username for authentication - - -pass the password for authentication - - -proxy [['http','ftp'],'http://proxy:8080'] - - -create initialize the database - --dsn,-user and -pass indicate the local database to cache results in, -and as are per Bio::DB::GFF::Adaptor::dbi. The -proxy argument allows -you to set the biofetch web proxy, and uses the same syntax described -for the proxy() method of L, except that the -argument must be passed as an array reference. - -=cut - -sub new { - my $class = shift; - my $args = shift; - my $self = $class->SUPER::new($args); - my ($preferred) = rearrange(['PREFERRED_TAGS'],$args); - $self->_preferred_tags($preferred?$preferred:\%default_preferred_tags); # if the caller sent their own preferences, then use these, otherwise use defaults. - - my ($proxy) = rearrange(['PROXY'],$args); - if ($proxy) { - my @args = ref($proxy) ? 
@$proxy : eval $proxy; - $self->{_proxy} = \@args if @args; - } - $self; -} - -sub segment { - my $self = shift; - my @segments = $self->SUPER::segment(@_); - - if (!@segments) { - my $refclass = $self->refclass; - - my %args = $self->setup_segment_args(@_); - if ($args{-class} && $args{-class} =~ /$refclass/oi) { - return unless $self->load_from_embl('embl'=>$args{-name}); - @segments = $self->SUPER::segment(@_); - } elsif ($args{-class} && $args{-class} =~ /refseq|swall|embl/i) { #hack to get refseq names - return unless $self->load_from_embl(lc($args{-class})=>$args{-name}); - $args{-class} = $self->refclass; - @segments = $self->SUPER::segment(%args); - } - } - - $self->_multiple_return_args(@segments); -} - -# default is to return 'Sequence' as the class of all references -sub refclass { - my $self = shift; - my $refname = shift; - 'Accession'; -} - -sub load_from_embl { - my $self = shift; - my $db = shift; - my $acc = shift or $self->throw('Must provide an accession ID'); - - my $biofetch; - if ($self->{_biofetch}{$db}) { - $biofetch = $self->{_biofetch}{$db}; - } else { - $biofetch = $self->{_biofetch}{$db} = Bio::DB::BioFetch->new(-db=>$db); - $biofetch->retrieval_type('tempfile'); - $biofetch->proxy(@{$self->{_proxy}}) if $self->{_proxy}; - } - - my $seq = eval {$biofetch->get_Seq_by_id($acc)} or return; - $self->_load_embl($acc,$seq); - 1; -} - -sub load_from_file { - my $self = shift; - my $file = shift; - - my $format = $file =~ /\.(gb|genbank|gbk)$/i ? 'genbank' : 'embl'; - - my $seqio = Bio::SeqIO->new( '-format' => $format, -file => $file); - my $seq = $seqio->next_seq; - - $self->_load_embl($seq->accession,$seq); - 1; -} - -sub _load_embl { - my $self = shift; - my $acc = shift; - my $seq = shift; - my $refclass = $self->refclass; - my $locus = $seq->id; - - # begin loading - $self->setup_load(); - - # first synthesize the entry for the top-level feature - my @aliases; - foreach ($seq->accession,$seq->get_secondary_accessions) { - next if lc($_) eq lc($acc); - push @aliases,[Alias => $_]; - } - $self->load_gff_line( - { - ref => $acc, - class => $refclass, - source => 'EMBL', - method => 'origin', - start => 1, - stop => $seq->length, - score => undef, - strand => '.', - phase => '.', - gclass => $self->refclass, - gname => $acc, - tstart => undef, - tstop => undef, - attributes => [[Note => $seq->desc],@aliases], - } - ); - # now load each feature in turn - for my $feat ($seq->all_SeqFeatures) { - my $attributes = $self->get_attributes($feat); - my $name = $self->guess_name($attributes); - - my $location = $feat->location; - my @segments = map {[$_->start,$_->end,$_->seq_id]} - $location->can('sub_Location') ? $location->sub_Location : $location; - - my $type = $feat->primary_tag eq 'CDS' ? 'mRNA' : $feat->primary_tag; - my $parttype = $feat->primary_tag eq 'gene' ? 'exon' : $feat->primary_tag; - - if ($feat->primary_tag =~ /^(gene|CDS)$/) { - $self->load_gff_line( { - ref => $acc, - class => $refclass, - source => 'EMBL', - method => $type, - start => $location->start, - stop => $location->end, - score => $feat->score || undef, - strand => $feat->strand > 0 ? '+' : ($feat->strand < 0 ? '-' : '.'), - phase => $feat->frame || '.', - gclass => $name->[0], - gname => $name->[1], - tstart => undef, - tstop => undef, - attributes => $attributes, - } - ); - @$attributes = (); - } - - for my $segment (@segments) { - - $self->load_gff_line( { - ref => $segment->[2] eq $locus ? 
$acc : $segment->[2], - class => $refclass, - source => 'EMBL', - method => $parttype, - start => $segment->[0], - stop => $segment->[1], - score => $feat->score || undef, - strand => $feat->strand > 0 ? '+' : ($feat->strand < 0 ? '-' : '.'), - phase => $feat->frame || '.', - gclass => $name->[0], - gname => $name->[1], - tstart => undef, - tstop => undef, - attributes => $attributes, - } - ); - } - - } - - # finish loading - $self->finish_load(); - - # now load the DNA - $self->load_sequence_string($acc,$seq->seq); - - 1; -} - -sub get_attributes { - my $self = shift; - my $seq = shift; - - my @tags = $seq->all_tags or return; - my @result; - foreach my $tag (@tags) { - foreach my $value ($seq->each_tag_value($tag)) { - push @result,[$tag=>$value]; - } - } - \@result; -} - -sub guess_name { - my $self = shift; - my $attributes = shift; -# remove this fix when Lincoln fixes it properly - return ["Misc" => "Misc"] unless ($attributes); # these are arbitrary, and possibly destructive defaults - my @ordered_attributes = sort {($self->_preferred_tags->{$a->[0]} || 0) <=> ($self->_preferred_tags->{$b->[0]} || 0)} @$attributes; - my $best = pop @ordered_attributes; - @$attributes = @ordered_attributes; - return $best; -} - - - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/dbi.pm b/lib/Bio/DB/GFF/Adaptor/dbi.pm deleted file mode 100644 index 25ac3b12a..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi.pm +++ /dev/null @@ -1,2474 +0,0 @@ - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi -- Database adaptor for DBI (SQL) databases - -=head1 SYNOPSIS - -See L - -=head1 DESCRIPTION - -This is the base class for DBI-based adaptors. It does everything -except generating the text of the queries to be used. See the section -QUERIES TO IMPLEMENT for the list of methods that must be implemented. - -=cut - -package Bio::DB::GFF::Adaptor::dbi; - -# base class for dbi-based implementations -use strict; - -use DBI; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::GFF::Util::Binning; -use Bio::DB::GFF::Adaptor::dbi::iterator; -use Bio::DB::GFF::Adaptor::dbi::caching_handle; - -use base qw(Bio::DB::GFF); - -# constants for choosing - -use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get - -# this is the largest that any reference sequence can be (100 megabases) -use constant MAX_BIN => 1_000_000_000; - -# this is the smallest bin (1 K) -use constant MIN_BIN => 1000; - -# size of range over which it is faster to force the database to use the range for indexing -use constant STRAIGHT_JOIN_LIMIT => 200_000; - -# this is the size to which DNA should be shredded -use constant DNA_CHUNK_SIZE => 2000; - -# size of summary bins for interval coverage statistics -use constant SUMMARY_BIN_SIZE => 1000; - -# for debugging fbin optimization -use constant EPSILON => 1e-7; # set to zero if you trust mysql's floating point comparisons -use constant OPTIMIZE => 1; # set to zero to turn off optimization completely - -############################################################################## - - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(@args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : see below - Status : Public - -This is the constructor for the adaptor. It is called automatically -by Bio::DB::GFF-Enew. In addition to arguments that are common among -all adaptors, the following class-specific arguments are recgonized: - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 
'dbi:mysql:ens0040' - - -user username for authentication - - -pass the password for authentication - -=cut - -# Create a new Bio::DB::GFF::Adaptor::dbi object -sub new { - my $class = shift; - my ($features_db,$username,$auth,$other) = rearrange([ - [qw(FEATUREDB DB DSN)], - [qw(USERNAME USER)], - [qw(PASSWORD PASSWD PASS)], - ],@_); - - $features_db || $class->throw("new(): Provide a data source or DBI database"); - - if (!ref($features_db)) { - my $dsn = $features_db; - my @args; - push @args,$username if defined $username; - push @args,$auth if defined $auth; - $features_db = Bio::DB::GFF::Adaptor::dbi::caching_handle->new($dsn,@args) - || $class->throw("new(): Failed to connect to $dsn: " - . Bio::DB::GFF::Adaptor::dbi::caching_handle->errstr); - } else { - $features_db->isa('DBI::db') - || $class->throw("new(): $features_db is not a DBI handle"); - } - - # fill in object - return bless { - features_db => $features_db - },$class; -} - -sub debug { - my $self = shift; - $self->features_db->debug(@_); - $self->SUPER::debug(@_); -} - -=head2 features_db - - Title : features_db - Usage : $dbh = $db->features_db - Function: get database handle - Returns : a DBI handle - Args : none - Status : Public - - Note: what is returned is not really a DBI::db handle, but a - subclass of one. This means that you cannot manipulate the - handle's attributes directly. Instead call the attribute - method: - - my $dbh = $db->features_db; - $dbh->attribute(AutoCommit=>0); - -=cut - -sub features_db { shift->{features_db} } -sub dbh { shift->{features_db} } - -=head2 get_dna - - Title : get_dna - Usage : $string = $db->get_dna($name,$start,$stop,$class) - Function: get DNA string - Returns : a string - Args : name, class, start and stop of desired segment - Status : Public - -This method performs the low-level fetch of a DNA substring given its -name, class and the desired range. It is actually a front end to the -abstract method make_dna_query(), which it calls after some argument -consistency checking. - -=cut - -sub get_dna { - my $self = shift; - my ($ref,$start,$stop,$class) = @_; - - my ($offset_start,$offset_stop); - - my $has_start = defined $start; - my $has_stop = defined $stop; - - my $reversed; - if ($has_start && $has_stop && $start > $stop) { - $reversed++; - ($start,$stop) = ($stop,$start); - } - - # turn start and stop into 0-based offsets - my $cs = $self->dna_chunk_size; - $start -= 1; $stop -= 1; - $offset_start = int($start/$cs)*$cs; - $offset_stop = int($stop/$cs)*$cs; - - my $sth; - # special case, get it all - if (!($has_start || $has_stop)) { - $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? order by foffset',$ref); - } - - elsif (!$has_stop) { - $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? and foffset>=? order by foffset', - $ref,$offset_start); - } - - else { # both start and stop defined - $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? and foffset>=? and foffset<=? 
order by foffset', - $ref,$offset_start,$offset_stop); - } - - my $dna = ''; - while (my($frag,$offset) = $sth->fetchrow_array) { - substr($frag,0,$start-$offset) = '' if $has_start && $start > $offset; - $dna .= $frag; - } - substr($dna,$stop-$start+1) = '' if $has_stop && $stop-$start+1 < length($dna); - if ($reversed) { - $dna = reverse $dna; - $dna =~ tr/gatcGATC/ctagCTAG/; - } - - $sth->finish; - $dna; -} - - -=head2 get_abscoords - - Title : get_abscoords - Usage : ($refseq,$refclass,$start,$stop,$strand) = $db->get_abscoords($name,$class) - Function: get absolute coordinates for landmark - Returns : an array ref -- see below - Args : name and class of desired landmark - Status : Public - -This method performs the low-level resolution of a landmark into a -reference sequence and position. - -The result is an array ref, each element of which is a five-element -list containing reference sequence name, class, start, stop and strand. - -=cut - -sub get_abscoords { - my $self = shift; - my ($name,$class,$refseq) = @_; - - my $sth = $self->make_abscoord_query($name,$class,$refseq); - - my @result; - while (my @row = $sth->fetchrow_array) { - push @result,\@row - } - $sth->finish; - - if (@result == 0) { - #$self->error("$name not found in database"); - my $sth2 = $self->make_aliasabscoord_query($name,$class); - - while (my @row2 = $sth2->fetchrow_array) { - push @result,\@row2 - } - $sth->finish; - - if (@result == 0){ - $self->error("$name not found in database"); - return; - } - } - return \@result; -} - - -=head2 get_features - - Title : get_features - Usage : $db->get_features($search,$options,$callback) - Function: retrieve features from the database - Returns : number of features retrieved - Args : see below - Status : Public - -This is the low-level method that is called to retrieve GFF lines from -the database. It is responsible for retrieving features that satisfy -range and feature type criteria, and passing the GFF fields to a -callback subroutine. - -See the manual page for Bio::DB::GFF for the interpretation of the -arguments and how the information retrieved by get_features is passed -to the callback for processing. - -Internally, get_features() is a front end for range_query(). The -latter method constructs the query and executes it. get_features() -calls fetchrow_array() to recover the fields and passes them to the -callback. - -=cut - -# Given sequence name, range, and optional filter, retrieve list of -# all features. Passes features through callback. -sub get_features { - my $self = shift; - my ($search,$options,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my $sth = $self->range_query(@{$search}{qw(rangetype - refseq - refclass - start - stop - types) }, - @{$options}{qw( - sparse - sort_by_group - ATTRIBUTES - BINSIZE)}) or return; - - my $count = 0; - while (my @row = $sth->fetchrow_array) { - $callback->(@row); - $count++; - } - $sth->finish; - return $count; -} - -=head2 classes - - Title : classes - Usage : $db->classes - Function: return list of landmark classes in database - Returns : a list of classes - Args : none - Status : public - -This routine returns the list of reference classes known to the -database, or empty if classes are not used by the database. Classes -are distinct from types, being essentially qualifiers on the reference -namespaces. - -NOTE: In the current mysql-based schema, this query takes a while to -run due to the classes not being normalized. 
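A brief usage sketch (the adaptor name and DSN below are placeholders; any
of the DBI adaptors in this distribution can be substituted):

  use Bio::DB::GFF;

  # hypothetical connection settings
  my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysql',
                              -dsn     => 'dbi:mysql:test' );
  my @classes = $db->classes;
  print "Landmark classes: @classes\n";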
- -=cut - -sub classes { - my $self = shift; - my ($query,@args) = $self->make_classes_query or return; - my $sth = $self->dbh->do_query($query,@args); - my @classes; - while (my ($c) = $sth->fetchrow_array) { - push @classes,$c; - } - @classes; -} - -=head2 make_classes_query - - Title : make_classes_query - Usage : ($query,@args) = $db->make_classes_query - Function: return query fragment for generating list of reference classes - Returns : a query and args - Args : none - Status : public - -=cut - -sub make_classes_query { - my $self = shift; - return; -} - -=head2 _feature_by_name - - Title : _feature_by_name - Usage : $db->get_features_by_name($name,$class,$callback) - Function: get a list of features by name and class - Returns : count of number of features retrieved - Args : name of feature, class of feature, and a callback - Status : protected - -This method is used internally. The callback arguments are those used -by make_feature(). Internally, it invokes the following abstract procedures: - - make_features_select_part - make_features_from_part - make_features_by_name_where_part - make_features_by_alias_where_part (for aliases) - make_features_join_part - -=cut - -sub _feature_by_name { - my $self = shift; - my ($class,$name,$location,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my $select = $self->make_features_select_part; - my $from = $self->make_features_from_part(undef,{sparse_groups=>1}); - my ($where,@args) = $self->make_features_by_name_where_part($class,$name); - my $join = $self->make_features_join_part; - my $range = $self->make_features_by_range_where_part('overlaps', - {refseq=>$location->[0], - class =>'', - start=>$location->[1], - stop =>$location->[2]}) if $location; - # group query - my $query1 = "SELECT $select FROM $from WHERE $where AND $join"; - $query1 .= " AND $range" if $range; - - # alias query - $from = $self->make_features_from_part(undef,{attributes=>1}); - ($where,@args) = $self->make_features_by_alias_where_part($class,$name); # potential bug - @args1==@args2? - - my $query2 = "SELECT $select FROM $from WHERE $where AND $join"; - $query2 .= " AND $range" if $range; - - my $count = 0; - - for my $query ($query1,$query2) { - my $sth = $self->dbh->do_query($query,@args); - while (my @row = $sth->fetchrow_array) { - $callback->(@row); - $count++; - } - $sth->finish; - } - - return $count; -} - -=head2 _feature_by_id - - Title : _feature_by_id - Usage : $db->_feature_by_id($ids,$type,$callback) - Function: get a list of features by ID - Returns : count of number of features retrieved - Args : arrayref containing list of IDs to fetch and a callback - Status : protected - -This method is used internally. The $type selector is one of -"feature" or "group". The callback arguments are those used by -make_feature(). Internally, it invokes the following abstract -procedures: - - make_features_select_part - make_features_from_part - make_features_by_id_where_part - make_features_join_part - -=cut - -sub _feature_by_id { - my $self = shift; - my ($ids,$type,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my $select = $self->make_features_select_part; - my $from = $self->make_features_from_part; - my ($where,@args) = $type eq 'feature' ? 
$self->make_features_by_id_where_part($ids) - : $self->make_features_by_gid_where_part($ids); - my $join = $self->make_features_join_part; - my $query = "SELECT $select FROM $from WHERE $where AND $join"; - my $sth = $self->dbh->do_query($query,@args); - - my $count = 0; - while (my @row = $sth->fetchrow_array) { - $callback->(@row); - $count++; - } - $sth->finish; - return $count; -} - -sub _feature_by_attribute { - my $self = shift; - my ($attributes,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my $select = $self->make_features_select_part; - my $from = $self->make_features_from_part(undef,{attributes=>$attributes}); - my ($where,@args) = $self->make_features_by_range_where_part('',{attributes=>$attributes}); - my $join = $self->make_features_join_part({attributes=>$attributes}); - my $query = "SELECT $select FROM $from WHERE $where AND $join"; - my $sth = $self->dbh->do_query($query,@args); - - my $count = 0; - while (my @row = $sth->fetchrow_array) { - $callback->(@row); - $count++; - } - $sth->finish; - return $count; -} - -=head2 get_types - - Title : get_types - Usage : $db->get_types($refseq,$refclass,$start,$stop,$count) - Function: get list of types - Returns : a list of Bio::DB::GFF::Typename objects - Args : see below - Status : Public - -This method is responsible for fetching the list of feature type names -from the database. The query may be limited to a particular range, in -which case the range is indicated by a landmark sequence name and -class and its subrange, if any. These arguments may be undef if it is -desired to retrieve all feature types in the database (which may be a -slow operation in some implementations). - -If the $count flag is false, the method returns a simple list of -vBio::DB::GFF::Typename objects. If $count is true, the method returns -a list of $name=E$count pairs, where $count indicates the number of -times this feature occurs in the range. - -Internally, this method calls upon the following functions to generate -the SQL and its bind variables: - - ($q1,@args) = make_types_select_part(@args); - ($q2,@args) = make_types_from_part(@args); - ($q3,@args) = make_types_where_part(@args); - ($q4,@args) = make_types_join_part(@args); - ($q5,@args) = make_types_group_part(@args); - -The components are then combined as follows: - - $query = "SELECT $q1 FROM $q2 WHERE $q3 AND $q4 GROUP BY $q5"; - -If any of the query fragments contain the ? bind variable, then the -same number of bind arguments must be provided in @args. The -fragment-generating functions are described below. - -=cut - -sub get_types { - my $self = shift; - my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_; - my $straight = $self->do_straight_join($srcseq,$start,$stop,[]) ? 
'straight_join' : ''; - my ($select,@args1) = $self->make_types_select_part($srcseq,$start,$stop,$want_count,$typelist); - my ($from,@args2) = $self->make_types_from_part($srcseq,$start,$stop,$want_count,$typelist); - my ($join,@args3) = $self->make_types_join_part($srcseq,$start,$stop,$want_count,$typelist); - my ($where,@args4) = $self->make_types_where_part($srcseq,$start,$stop,$want_count,$typelist); - my ($group,@args5) = $self->make_types_group_part($srcseq,$start,$stop,$want_count,$typelist); - - my $query = "SELECT $straight $select FROM $from WHERE $join AND $where"; - $query .= " GROUP BY $group" if $group; - my @args = (@args1,@args2,@args3,@args4,@args5); - my $sth = $self->dbh->do_query($query,@args) or return; - - my (%result,%obj); - while (my ($method,$source,$count) = $sth->fetchrow_array) { - my $type = Bio::DB::GFF::Typename->new($method,$source); - $result{$type} = $count; - $obj{$type} = $type; - } - return $want_count ? %result : values %obj; -} - -=head2 range_query - - Title : range_query - Usage : $db->range_query($range_type,$refseq,$refclass,$start,$stop,$types,$order_by_group,$attributes,$binsize) - Function: create statement handle for range/overlap queries - Returns : a DBI statement handle - Args : see below - Status : Protected - -This method constructs the statement handle for this module's central -query: given a range and/or a list of feature types, fetch their GFF -records. - -The positional arguments are as follows: - - Argument Description - - $isrange A flag indicating that this is a range. - query. Otherwise an overlap query is - assumed. - - $refseq The reference sequence name (undef if no range). - - $refclass The reference sequence class (undef if no range). - - $start The start of the range (undef if none). - - $stop The stop of the range (undef if none). - - $types Array ref containing zero or feature types in the - format [method,source]. - - $order_by_group A flag indicating that statement handler should group - the features by group id (handy for iterative fetches) - - $attributes A hash containing select attributes. - - $binsize A bin size for generating tables of feature density. - -If successful, this method returns a statement handle. The handle is -expected to return the fields described for get_features(). - -Internally, range_query() makes calls to the following methods, -each of which is expected to be overridden in subclasses: - - $select = $self->make_features_select_part; - $from = $self->make_features_from_part; - $join = $self->make_features_join_part; - ($where,@args) = $self->make_features_by_range_where_part($isrange,$srcseq,$class, - $start,$stop,$types,$class); - -The query that is constructed looks like this: - - SELECT $select FROM $from WHERE $join AND $where - -The arguments that are returned from make_features_by_range_where_part() are -passed to the statement handler's execute() method. - -range_query() also calls a do_straight_join() method, described -below. If this method returns true, then the keyword "straight_join" -is inserted right after SELECT. - -=cut - -sub range_query { - my $self = shift; - my($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$bin) = @_; - - my $dbh = $self->features_db; - - # NOTE: straight_join is necessary in some database to force the right index to be used. - my %a = (refseq=>$refseq,class=>$class,start=>$start,stop=>$stop,types=>$types,attributes=>$attributes,bin_width=>$bin); - my $straight = $self->do_straight_join(\%a) ? 
'straight_join' : ''; - my $select = $self->make_features_select_part(\%a); - my $from = $self->make_features_from_part($sparse,\%a); - my $join = $self->make_features_join_part(\%a); - my ($where,@args) = $self->make_features_by_range_where_part($rangetype,\%a); - my ($group_by,@more_args) = $self->make_features_group_by_part(\%a); - my $order_by = $self->make_features_order_by_part(\%a) if $order_by_group; - - my $query = "SELECT $straight $select FROM $from WHERE $join"; - $query .= " AND $where" if $where; - if ($group_by) { - $query .= " GROUP BY $group_by"; - push @args,@more_args; - } - $query .= " ORDER BY $order_by" if $order_by; - - my $sth = $self->dbh->do_query($query,@args); - $sth; -} - -=head2 make_features_by_range_where_part - - Title : make_features_by_range_where_part - Usage : ($string,@args) = - $db->make_features_select_part($isrange,$refseq,$class,$start,$stop,$types) - Function: make where part of the features query - Returns : the list ($query,@bind_args) - Args : see below - Status : Protected - -This method creates the part of the features query that immediately -follows the WHERE keyword and is ANDed with the string returned by -make_features_join_part(). - -The six positional arguments are a flag indicating whether to perform -a range search or an overlap search, the reference sequence, class, -start and stop, all of which define an optional range to search in, -and an array reference containing a list [$method,$souce] pairs. - -The method result is a multi-element list containing the query string -and the list of runtime arguments to bind to it with the execute() -method. - -This method's job is to clean up arguments and perform consistency -checking. The real work is done by the following abstract methods: - - Method Description - - refseq_query() Return the query string needed to match the reference - sequence. - - range_query() Return the query string needed to find all features contained - within a range. - - overlap_query() Return the query string needed to find all features that overlap - a range. - -See Bio::DB::Adaptor::dbi::mysql for an example of how this works. - -=cut - -#' - -sub make_features_by_range_where_part { - my $self = shift; - my ($rangetype,$options) = @_; - $options ||= {}; - my ($refseq,$class,$start,$stop,$types,$attributes) = - @{$options}{qw(refseq class start stop types attributes)}; - - my (@query,@args); - - if ($refseq) { - my ($q,@a) = $self->refseq_query($refseq,$class); - push @query,$q; - push @args,@a; - } - - if (defined $start or defined $stop) { - $start = 0 unless defined($start); - $stop = MAX_SEGMENT unless defined($stop); - - my ($range_query,@range_args) = - $rangetype eq 'overlaps' ? $self->overlap_query($start,$stop) - : $rangetype eq 'contains' ? $self->contains_query($start,$stop) - : $rangetype eq 'contained_in' ? $self->contained_in_query($start,$stop) - : (); - - push @query,$range_query; - push @args,@range_args; - } - - if (defined $types && @$types) { - my ($type_query,@type_args) = $self->types_query($types); - push @query,$type_query; - push @args,@type_args; - } - - if ($attributes) { - my ($attribute_query,@attribute_args) = $self->make_features_by_attribute_where_part($attributes); - push @query,"($attribute_query)"; - push @args,@attribute_args; - } - - my $query = join "\n\tAND ",@query; - return wantarray ? 
($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -=head2 do_straight_join - - Title : do_straight_join - Usage : $boolean = $db->do_straight_join($refseq,$class,$start,$stop,$types) - Function: optimization flag - Returns : a flag - Args : see range_query() - Status : Protected - -This subroutine, called by range_query() returns a boolean flag. -If true, range_query() will perform a straight join, which can be -used to optimize certain SQL queries. The four arguments correspond -to similarly-named arguments passed to range_query(). - -=cut - -sub do_straight_join { 0 } # false by default - -=head2 string_match - - Title : string_match - Usage : $string = $db->string_match($field,$value) - Function: create a SQL fragment for performing exact or regexp string matching - Returns : query string - Args : the table field and match value - Status : public - -This method examines the passed value for meta characters. If so it -produces a SQL fragment that performs a regular expression match. -Otherwise, it produces a fragment that performs an exact string match. - -This method is not used in the module, but is available for use by -subclasses. - -=cut - -sub string_match { - my $self = shift; - my ($field,$value) = @_; - return qq($field = ?) if $value =~ /^[!@%&a-zA-Z0-9_\'\" ~-]+$/; - return qq($field REGEXP ?); -} - -=head2 exact_match - - Title : exact_match - Usage : $string = $db->exact_match($field,$value) - Function: create a SQL fragment for performing exact string matching - Returns : query string - Args : the table field and match value - Status : public - -This method produces the SQL fragment for matching a field name to a -constant string value. - -=cut - -sub exact_match { - my $self = shift; - my ($field,$value) = @_; - return qq($field = ?); -} - -=head2 search_notes - - Title : search_notes - Usage : @search_results = $db->search_notes("full text search string",$limit) - Function: Search the notes for a text string, using mysql full-text search - Returns : array of results - Args : full text search string, and an optional row limit - Status : public - -This is a mysql-specific method. Given a search string, it performs a -full-text search of the notes table and returns an array of results. -Each row of the returned array is a arrayref containing the following fields: - - column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() - column 2 The text of the note - column 3 A relevance score. 
- column 4 A Bio::DB::GFF::Typename object - -=cut - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - $search_string =~ tr/*?//d; - - my @words = $search_string =~ /(\w+)/g; - my $regex = join '|',@words; - my @searches = map {"fattribute_value LIKE '%${_}%'"} @words; - my $search = join(' OR ',@searches); - - my $query = <dbh->do_query($query); - my @results; - while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) { - next unless $class && $name; # sorry, ignore NULL objects - my @matches = $note =~ /($regex)/g; - my $relevance = 10*@matches; - my $featname = Bio::DB::GFF::Featname->new($class=>$name); - my $type = Bio::DB::GFF::Typename->new($method,$source); - push @results,[$featname,$note,$relevance,$type]; - last if $limit && @results >= $limit; - } - @results; -} - - -=head2 meta - - Title : meta - Usage : $value = $db->meta($name [,$newval]) - Function: get or set a meta variable - Returns : a string - Args : meta variable name and optionally value - Status : public - -Get or set a named metavariable for the database. Metavariables can -be used for database-specific settings. This method calls two -class-specific methods which must be implemented: - - make_meta_get_query() Returns a sql fragment which given a meta - parameter name, returns its value. One bind - variable. - make_meta_set_query() Returns a sql fragment which takes two bind - arguments, the parameter name and its value - - -Don't make changes unless you know what you're doing! It will affect the -persistent database. - -=cut - -sub meta { - my $self = shift; - my $param_name = uc shift; - - # getting - if (@_) { - my $value = shift; - my $sql = $self->make_meta_set_query() or return; - my $sth = $self->dbh->prepare_delayed($sql) - or $self->error("Can't prepare $sql: ",$self->dbh->errstr), return; - $sth->execute($param_name,$value) - or $self->error("Can't execute $sql: ",$self->dbh->errstr), return; - $sth->finish; - return $self->{meta}{$param_name} = $value; - } - - elsif (exists $self->{meta}{$param_name}) { - return $self->{meta}{$param_name}; - } - - else { - undef $self->{meta}{$param_name}; # so that we don't check again - my $sql = $self->make_meta_get_query() or return; - my $sth = $self->dbh->prepare_delayed($sql) - or $self->error("Can't prepare $sql: ",$self->dbh->errstr), return; - $sth->execute($param_name) - or $self->error("Can't execute $sql: ",$sth->errstr),return; - my ($value) = $sth->fetchrow_array; - $sth->finish; - return $self->{meta}{$param_name} = $value; - } - -} - -=head2 make_meta_get_query - - Title : make_meta_get_query - Usage : $sql = $db->make_meta_get_query - Function: return SQL fragment for getting a meta parameter - Returns : SQL fragment - Args : none - Status : public - -By default this does nothing; meta parameters are not stored or -retrieved. - -=cut - -sub make_meta_get_query { - return 'SELECT fvalue FROM fmeta WHERE fname=?'; -} - - -sub dna_chunk_size { - my $self = shift; - $self->meta('chunk_size') || DNA_CHUNK_SIZE; -} - -=head2 make_meta_set_query - - Title : make_meta_set_query - Usage : $sql = $db->make_meta_set_query - Function: return SQL fragment for setting a meta parameter - Returns : SQL fragment - Args : none - Status : public - -By default this does nothing; meta parameters are not stored or -retrieved. 
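A subclass that does persist meta values can return a statement taking the
two bind arguments described above (the parameter name, then its value).
The SQL below is only an illustrative sketch against the fmeta table
referenced by make_meta_get_query(); real adaptors may use a different
statement:

  sub make_meta_set_query {
      # meta() supplies the parameter name and value as bind arguments
      return 'INSERT INTO fmeta (fname,fvalue) VALUES (?,?)';
  }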
- -=cut - -sub make_meta_set_query { - return; -} - -=head2 default_meta_values - - Title : default_meta_values - Usage : %values = $db->default_meta_values - Function: empty the database - Returns : a list of tag=>value pairs - Args : none - Status : protected - -This method returns a list of tag=Evalue pairs that contain default -meta information about the database. It is invoked by initialize() to -write out the default meta values. The base class version returns an -empty list. - -For things to work properly, meta value names must be UPPERCASE. - -=cut - -sub default_meta_values { - my $self = shift; - my @values = $self->SUPER::default_meta_values; - return ( - @values, - max_bin => MAX_BIN, - min_bin => MIN_BIN, - straight_join_limit => STRAIGHT_JOIN_LIMIT, - chunk_size => DNA_CHUNK_SIZE, - ); -} - -sub min_bin { - my $self = shift; - return $self->meta('min_bin') || MIN_BIN; -} -sub max_bin { - my $self = shift; - return $self->meta('max_bin') || MAX_BIN; -} - -sub straight_join_limit { - my $self = shift; - return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT; -} - -=head2 get_features_iterator - - Title : get_features_iterator - Usage : $iterator = $db->get_features_iterator($search,$options,$callback) - Function: create an iterator on a features() query - Returns : A Bio::DB::GFF::Adaptor::dbi::iterator object - Args : see get_features() - Status : public - -This method is similar to get_features(), except that it returns an -iterator across the query. See -L. - -=cut - -sub get_features_iterator { - my $self = shift; - my ($search,$options,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - my $sth = $self->range_query(@{$search}{qw(rangetype - refseq - refclass - start - stop - types)}, - @{$options}{qw( - sparse - sort_by_group - ATTRIBUTES - BINSIZE)}) or return; - return Bio::DB::GFF::Adaptor::dbi::iterator->new($sth,$callback); -} - -########################## loading and initialization ##################### - -=head2 do_initialize - - Title : do_initialize - Usage : $success = $db->do_initialize($drop_all) - Function: initialize the database - Returns : a boolean indicating the success of the operation - Args : a boolean indicating whether to delete existing data - Status : protected - -This method will load the schema into the database. If $drop_all is -true, then any existing data in the tables known to the schema will be -deleted. - -Internally, this method calls schema() to get the schema data. - -=cut - -# Create the schema from scratch. -# You will need create privileges for this. -sub do_initialize { - #shift->throw("do_initialize(): must be implemented by subclass"); - my $self = shift; - my $erase = shift; - $self->drop_all if $erase; - - my $dbh = $self->features_db; - my $schema = $self->schema; - foreach my $table_name ($self->tables) { - my $create_table_stmt = $schema->{$table_name}{table} ; - $dbh->do($create_table_stmt) || warn $dbh->errstr; - $self->create_other_schema_objects(\%{$schema->{$table_name}}); - } - - 1; -} - -=head2 finish_load - - Title : finish_load - Usage : $db->finish_load - Function: called after load_gff_line() - Returns : number of records loaded - Args : none - Status : protected - -This method performs schema-specific cleanup after loading a set of -GFF records. It finishes each of the statement handlers prepared by -setup_load(). 
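The load cycle, as it appears inside an adaptor or loader (see for instance
_load_embl() in the biofetch_oracle adaptor earlier in this patch), is
setup_load(), one load_gff_line() call per record, then finish_load(). A
minimal sketch with placeholder field values:

  $self->setup_load;
  $self->load_gff_line( {
      ref        => 'chr1',
      class      => 'Sequence',
      source     => 'demo',
      method     => 'exon',
      start      => 100,
      stop       => 500,
      score      => undef,
      strand     => '+',
      phase      => '.',
      gclass     => 'Transcript',
      gname      => 'demo.1',
      tstart     => undef,
      tstop      => undef,
      attributes => [ [ Note => 'sketch only' ] ],
  } );
  my $loaded = $self->finish_load;   # number of records loaded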
- -=cut - -sub finish_load { - my $self = shift; - - my $dbh = $self->features_db or return; - $dbh->do('UNLOCK TABLES') if $self->lock_on_load; - - foreach (keys %{$self->{load_stuff}{sth}}) { - $self->{load_stuff}{sth}{$_}->finish; - } - - my $counter = $self->{load_stuff}{counter}; - delete $self->{load_stuff}; - return $counter; -} - - -=head2 create_other_schema_objects - - Title : create_other_schema_objects - Usage : $self->create_other_schema_objects($table_name) - Function: create other schema objects like : indexes, sequences, triggers - Returns : - Args : - Status : Abstract - -=cut - -sub create_other_schema_objects{ - #shift->throw("create_other_schema_objects(): must be implemented by subclass"); - my $self = shift ; - my $table_schema = shift ; - my $dbh = $self->features_db; - foreach my $object_type(keys %$table_schema){ - if ($object_type !~ /table/) { - foreach my $object_name(keys %{$table_schema->{$object_type}}){ - my $create_object_stmt = $table_schema->{$object_type}{$object_name}; - $dbh->do($create_object_stmt) || warn $dbh->errstr; - } - } - } - 1; -} - -=head2 drop_all - - Title : drop_all - Usage : $db->drop_all - Function: empty the database - Returns : void - Args : none - Status : protected - -This method drops the tables known to this module. Internally it -calls the abstract tables() method. - -=cut - -# Drop all the GFF tables -- dangerous! -sub drop_all { - #shift->throw("drop_all(): must be implemented by subclass"); - my $self = shift; - my $dbh = $self->features_db; - my $schema = $self->schema; - - local $dbh->{PrintError} = 0; - foreach ($self->tables) { - $dbh->do("drop table $_") || warn $dbh->errstr; - - #when dropping a table - the indexes and triggers are being dropped automatically - # sequences needs to be dropped - if there are any (Oracle, PostgreSQL) - if ($schema->{$_}{sequence}){ - foreach my $sequence_name(keys %{$schema->{$_}{sequence}}) { - $dbh->do("drop sequence $sequence_name"); - } - } - - #$self->drop_other_schema_objects($_); - - } -} - -=head2 clone - -The clone() method should be used when you want to pass the -Bio::DB::GFF object to a child process across a fork(). The child must -call clone() before making any queries. - -This method does two things: (1) it sets the underlying database -handle's InactiveDestroy parameter to 1, thereby preventing the -database connection from being destroyed in the parent when the dbh's -destructor is called; (2) it replaces the dbh with the result of -dbh-Eclone(), so that we now have an independent handle. - -=cut - -sub clone { - my $self = shift; - $self->features_db->clone; -} - - -=head1 QUERIES TO IMPLEMENT - -The following astract methods either return DBI statement handles or -fragments of SQL. They must be implemented by subclasses of this -module. See Bio::DB::GFF::Adaptor::dbi::mysql for examples. 
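For orientation, a concrete adaptor is a subclass that supplies these
fragments. The skeleton below is only a sketch: the package name is
hypothetical, and the SELECT list merely mirrors the column set used by
make_features_group_by_part() further down.

  package Bio::DB::GFF::Adaptor::dbi::example;   # hypothetical name
  use strict;
  use base qw(Bio::DB::GFF::Adaptor::dbi);

  sub make_features_select_part {
      return 'fref,fstart,fstop,fsource,fmethod,fscore,fstrand,fphase,'
           . 'gclass,gname,ftarget_start,ftarget_stop,fdata.fid,fdata.gid';
  }

  # ...remaining abstract methods (schema, getseqcoords_query, etc.) go here

  1;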
- - - - -=head2 drop_other_schema_objects - - Title : drop_other_schema_objects - Usage : $self->create_other_schema_objects($table_name) - Function: create other schema objects like : indexes, sequences, triggers - Returns : - Args : - Status : Abstract - - -=cut - -sub drop_other_schema_objects{ - #shift->throw("drop_other_schema_objects(): must be implemented by subclass"); - -} - -=head2 make_features_select_part - - Title : make_features_select_part - Usage : $string = $db->make_features_select_part() - Function: make select part of the features query - Returns : a string - Args : none - Status : Abstract - -This abstract method creates the part of the features query that -immediately follows the SELECT keyword. - -=cut - -sub make_features_select_part { - shift->throw("make_features_select_part(): must be implemented by subclass"); -} - -=head2 tables - - Title : tables - Usage : @tables = $db->tables - Function: return list of tables that belong to this module - Returns : list of tables - Args : none - Status : protected - -This method lists the tables known to the module. - -=cut - -# return list of tables that "belong" to us. -sub tables { - my $schema = shift->schema; - return keys %$schema; -} - -=head2 schema - - Title : schema - Usage : $schema = $db->schema - Function: return the CREATE script for the schema - Returns : a hashref - Args : none - Status : abstract - -This method returns an array ref containing the various CREATE -statements needed to initialize the database tables. The keys are the -table names, and the values are strings containing the appropriate -CREATE statement. - -=cut - -sub schema { - shift->throw("The schema() method must be implemented by subclass"); -} - -=head2 DESTROY - - Title : DESTROY - Usage : $db->DESTROY - Function: disconnect database at destruct time - Returns : void - Args : none - Status : protected - -This is the destructor for the class. - -=cut - -sub DESTROY { - my $self = shift; - $self->features_db->disconnect if defined $self->features_db; -} - -################## query cache ################## - - -######################################### -## Moved from mysql.pm and mysqlopt.pm ## -######################################### - -=head2 make_features_by_name_where_part - - Title : make_features_by_name_where_part - Usage : $db->make_features_by_name_where_part - Function: create the SQL fragment needed to select a feature by its group name & class - Returns : a SQL fragment and bind arguments - Args : see below - Status : Protected - -=cut - -sub make_features_by_name_where_part { - my $self = shift; - my ($class,$name) = @_; - if ($name =~ /\*/) { - $name =~ s/%/\\%/g; - $name =~ s/_/\\_/g; - $name =~ tr/*/%/; - return ("fgroup.gclass=? AND fgroup.gname LIKE ?",$class,$name); - } else { - return ("fgroup.gclass=? AND fgroup.gname=?",$class,$name); - } -} - -sub make_features_by_alias_where_part { - my $self = shift; - my ($class,$name) = @_; - if ($name =~ /\*/) { - $name =~ tr/*/%/; - $name =~ s/_/\\_/g; - return ("fgroup.gclass=? AND fattribute_to_feature.fattribute_value LIKE ? AND fgroup.gid=fdata.gid AND fattribute.fattribute_name in ('Alias','Name') AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id AND fattribute_to_feature.fid=fdata.fid AND ftype.ftypeid=fdata.ftypeid",$class,$name) - } else { - return ("fgroup.gclass=? AND fattribute_to_feature.fattribute_value=? 
AND fgroup.gid=fdata.gid AND fattribute.fattribute_name in ('Alias','Name') AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id AND fattribute_to_feature.fid=fdata.fid AND ftype.ftypeid=fdata.ftypeid",$class,$name); - } - -} - -sub make_features_by_attribute_where_part { - my $self = shift; - my $attributes = shift; - my @args; - my @sql; - foreach (keys %$attributes) { - push @sql,"(fattribute.fattribute_name=? AND fattribute_to_feature.fattribute_value=?)"; - push @args,($_,$attributes->{$_}); - } - return (join(' OR ',@sql),@args); -} - -=head2 make_features_by_id_where_part - - Title : make_features_by_id_where_part - Usage : $db->make_features_by_id_where_part($ids) - Function: create the SQL fragment needed to select a set of features by their ids - Returns : a SQL fragment and bind arguments - Args : arrayref of IDs - Status : Protected - -=cut - -sub make_features_by_id_where_part { - my $self = shift; - my $ids = shift; - my $set = join ",",@$ids; - return ("fdata.fid IN ($set)"); -} - -=head2 make_features_by_gid_where_part - - Title : make_features_by_id_where_part - Usage : $db->make_features_by_gid_where_part($ids) - Function: create the SQL fragment needed to select a set of features by their ids - Returns : a SQL fragment and bind arguments - Args : arrayref of IDs - Status : Protected - -=cut - -sub make_features_by_gid_where_part { - my $self = shift; - my $ids = shift; - my $set = join ",",@$ids; - return ("fgroup.gid IN ($set)"); -} - - -=head2 make_features_from_part - - Title : make_features_from_part - Usage : $string = $db->make_features_from_part() - Function: make from part of the features query - Returns : a string - Args : none - Status : protected - -This method creates the part of the features query that immediately -follows the FROM keyword. - -=cut - -sub make_features_from_part { - my $self = shift; - my $sparse = shift; - my $options = shift || {}; - return $options->{attributes} ? "fdata,ftype,fgroup,fattribute,fattribute_to_feature\n" - : "fdata,ftype,fgroup\n"; -} - - -=head2 make_features_join_part - - Title : make_features_join_part - Usage : $string = $db->make_features_join_part() - Function: make join part of the features query - Returns : a string - Args : none - Status : protected - -This method creates the part of the features query that immediately -follows the WHERE keyword. - -=cut - -sub make_features_join_part { - my $self = shift; - my $options = shift || {}; - return !$options->{attributes} ? <make_features_order_by_part() - Function: make the ORDER BY part of the features() query - Returns : a SQL fragment and bind arguments, if any - Args : none - Status : protected - -This method creates the part of the features query that immediately -follows the ORDER BY part of the query issued by features() and -related methods. - -=cut - -sub make_features_order_by_part { - my $self = shift; - my $options = shift || {}; - return "fgroup.gname"; -} - -=head2 make_features_group_by_part - - Title : make_features_group_by_part - Usage : ($query,@args) = $db->make_features_group_by_part() - Function: make the GROUP BY part of the features() query - Returns : a SQL fragment and bind arguments, if any - Args : none - Status : protected - -This method creates the part of the features query that immediately -follows the GROUP BY part of the query issued by features() and -related methods. 
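When features() is called with more than one attribute filter, the
individual attribute conditions are ORed together in the WHERE clause (see
make_features_by_attribute_where_part() above) and the fragment built here
restores AND semantics by grouping on the feature id and requiring one
matching row per attribute. For a two-attribute query the tail of the
generated statement looks roughly like this, with the HAVING value supplied
as a bind argument:

  GROUP BY fdata.fid,fref,fstart,fstop,fsource,
           fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start,
           ftarget_stop,fdata.gid
  HAVING count(fdata.fid) > 1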
- -=cut - -sub make_features_group_by_part { - my $self = shift; - my $options = shift || {}; - if (my $att = $options->{attributes}) { - my $key_count = keys %$att; - return unless $key_count > 1; - return ("fdata.fid,fref,fstart,fstop,fsource, - fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start, - ftarget_stop,fdata.gid - HAVING count(fdata.fid) > ?",$key_count-1); - } - elsif (my $b = $options->{bin_width}) { - return "fref,fstart,fdata.ftypeid"; - } - -} - -=head2 refseq_query - - Title : refseq_query - Usage : ($query,@args) = $db->refseq_query($name,$class) - Function: create SQL fragment that selects the desired reference sequence - Returns : a list containing the query and bind arguments - Args : reference sequence name and class - Status : protected - -This method is called by make_features_by_range_where_part() to -construct the part of the select WHERE section that selects a -particular reference sequence. It returns a mult-element list in -which the first element is the SQL fragment and subsequent elements -are bind values. - -For example: - - sub refseq_query { - my ($name,$class) = @_; - return ('gff.refseq=? AND gff.refclass=?', - $name,$class); - } - -The current schema does not distinguish among different classes of -reference sequence. - -=cut - -# IMPORTANT NOTE: THE MYSQL SCHEMA IGNORES THE SEQUENCE CLASS -# THIS SHOULD BE FIXED -sub refseq_query { - my $self = shift; - my ($refseq,$refclass) = @_; - my $query = "fdata.fref=?"; - return wantarray ? ($query,$refseq) : $self->dbh->dbi_quote($query,$refseq); -} - -=head2 attributes - - Title : attributes - Usage : @attributes = $db->attributes($id,$name) - Function: get the attributes on a particular feature - Returns : an array of string - Args : feature ID - Status : public - -Some GFF version 2 files use the groups column to store a series of -attribute/value pairs. In this interpretation of GFF, the first such -pair is treated as the primary group for the feature; subsequent pairs -are treated as attributes. Two attributes have special meaning: -"Note" is for backward compatibility and is used for unstructured text -remarks. "Alias" is considered as a synonym for the feature name. - -If no name is provided, then attributes() returns a flattened hash, of -attribute=Evalue pairs. This lets you do: - - %attributes = $db->attributes($id); - -Normally, attributes() will be called by the feature: - - @notes = $feature->attributes('Note'); - -=cut - -sub do_attributes { - my $self = shift; - my ($id,$tag) = @_; - my $sth; - if ($id) { - my $from = 'fattribute_to_feature,fattribute'; - my $join = 'fattribute.fattribute_id=fattribute_to_feature.fattribute_id'; - my $where1 = 'fid=? AND fattribute_name=?'; - my $where2 = 'fid=?'; - $sth = defined($tag) ? 
$self->dbh->do_query("SELECT fattribute_value FROM $from WHERE $where1 AND $join",$id,$tag) - : $self->dbh->do_query("SELECT fattribute_name,fattribute_value FROM $from WHERE $where2 AND $join",$id); - } - else { - $sth = $self->dbh->do_query("SELECT fattribute_name FROM fattribute"); - } - my @result; - while (my @stuff = $sth->fetchrow_array) { - push @result,@stuff; - } - $sth->finish; - return @result; -} - - - -=head2 overlap_query_nobin - - Title : overlap_query - Usage : ($query,@args) = $db->overlap_query($start,$stop) - Function: create SQL fragment that selects the desired features by range - Returns : a list containing the query and bind arguments - Args : the start and stop of a range, inclusive - Status : protected - -This method is called by make_features_byrange_where_part() to construct the -part of the select WHERE section that selects a set of features that -overlap a range. It returns a multi-element list in which the first -element is the SQL fragment and subsequent elements are bind values. - - -sub overlap_query_nobin { - my ($start,$stop) = @_; - return ('gff.stopE=? AND gff.startE=?', - $start,$stop); - -=cut - -# find features that overlap a given range -sub overlap_query_nobin { - my $self = shift; - my ($start,$stop) = @_; - - my $query = qq(fdata.fstop>=? AND fdata.fstart<=?); - return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop); -} - -=head2 contains_query_nobin - - Title : contains_query - Usage : ($query,@args) = $db->contains_query_nobin($start,$stop) - Function: create SQL fragment that selects the desired features by range - Returns : a list containing the query and bind arguments - Args : the start and stop of a range, inclusive - Status : protected - -This method is called by make_features_byrange_where_part() to construct the -part of the select WHERE section that selects a set of features -entirely enclosed by a range. It returns a multi-element list in which -the first element is the SQL fragment and subsequent elements are bind -values. For example: - - sub contains_query_nobin { - my ($start,$stop) = @_; - return ('gff.start>=? AND gff.stop<=?', - $start,$stop); - -=cut - -# find features that are completely contained within a range -sub contains_query_nobin { - my $self = shift; - my ($start,$stop) = @_; - my $query = qq(fdata.fstart>=? AND fdata.fstop<=?); - return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop); -} - -=head2 contained_in_query_nobin - - Title : contained_in_query_nobin - Usage : ($query,@args) = $db->contained_in_query($start,$stop) - Function: create SQL fragment that selects the desired features by range - Returns : a list containing the query and bind arguments - Args : the start and stop of a range, inclusive - Status : protected - -This method is called by make_features_byrange_where_part() to construct the -part of the select WHERE section that selects a set of features -entirely enclosed by a range. It returns a multi-element list in which -the first element is the SQL fragment and subsequent elements are bind -values.For example: - - sub contained_in_query_nobin { - my ($start,$stop) = @_; - return ('gff.start<=? AND gff.stop>=?', - $start,$stop); - } - -=cut - -# find features that are completely contained within a range -sub contained_in_query_nobin { - my $self = shift; - my ($start,$stop) = @_; - my $query = qq(fdata.fstart<=? AND fdata.fstop>=?); - return wantarray ? 
($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop); -} - -=head2 types_query - - Title : types_query - Usage : ($query,@args) = $db->types_query($types) - Function: create SQL fragment that selects the desired features by type - Returns : a list containing the query and bind arguments - Args : an array reference containing the types - Status : protected - -This method is called by make_features_byrange_where_part() to construct the -part of the select WHERE section that selects a set of features based -on their type. It returns a multi-element list in which the first -element is the SQL fragment and subsequent elements are bind values. -The argument is an array reference containing zero or more -[$method,$source] pairs. - -=cut - -# generate the fragment of SQL responsible for searching for -# features with particular types and methods -sub types_query { - my $self = shift; - my $types = shift; - - my @method_queries; - my @args; - for my $type (@$types) { - my ($method,$source) = @$type; - my ($mlike, $slike) = (0, 0); - if ($method && $method =~ m/\.\*/) { - $method =~ s/%/\\%/g; - $method =~ s/_/\\_/g; - $method =~ s/\.\*\??/%/g; - $mlike++; - } - if ($source && $source =~ m/\.\*/) { - $source =~ s/%/\\%/g; - $source =~ s/_/\\_/g; - $source =~ s/\.\*\??/%/g; - $slike++; - } - my @pair; - if (defined $method && length $method) { - push @pair, $mlike ? qq(fmethod LIKE ?) : qq(fmethod = ?); - push @args, $method; - } - if (defined $source && length $source) { - push @pair, $slike ? qq(fsource LIKE ?) : qq(fsource = ?); - push @args, $source; - } - push @method_queries,"(" . join(' AND ',@pair) .")" if @pair; -} - my $query = " (".join(' OR ',@method_queries).")\n" if @method_queries; - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -=head2 make_types_select_part - - Title : make_types_select_part - Usage : ($string,@args) = $db->make_types_select_part(@args) - Function: create the select portion of the SQL for fetching features type list - Returns : query string and bind arguments - Args : see below - Status : protected - -This method is called by get_types() to generate the query fragment -and bind arguments for the SELECT part of the query that retrieves -lists of feature types. The four positional arguments are as follows: - - $refseq reference sequence name - $start start of region - $stop end of region - $want_count true to return the count of this feature type - -If $want_count is false, the SQL fragment returned must produce a list -of feature types in the format (method, source). - -If $want_count is true, the returned fragment must produce a list of -feature types in the format (method, source, count). - -=cut - -#------------------------- support for the types() query ------------------------ -sub make_types_select_part { - my $self = shift; - my ($srcseq,$start,$stop,$want_count) = @_; - my $query = $want_count ? 'ftype.fmethod,ftype.fsource,count(fdata.ftypeid)' - : 'fmethod,fsource'; - return $query; -} - -=head2 make_types_from_part - - Title : make_types_from_part - Usage : ($string,@args) = $db->make_types_from_part(@args) - Function: create the FROM portion of the SQL for fetching features type lists - Returns : query string and bind arguments - Args : see below - Status : protected - -This method is called by get_types() to generate the query fragment -and bind arguments for the FROM part of the query that retrieves lists -of feature types. 
The four positional arguments are as follows: - - $refseq reference sequence name - $start start of region - $stop end of region - $want_count true to return the count of this feature type - -If $want_count is false, the SQL fragment returned must produce a list -of feature types in the format (method, source). - -If $want_count is true, the returned fragment must produce a list of -feature types in the format (method, source, count). - -=cut - -sub make_types_from_part { - my $self = shift; - my ($srcseq,$start,$stop,$want_count) = @_; - my $query = defined($srcseq) || $want_count ? 'fdata,ftype' : 'ftype'; - return $query; -} - -=head2 make_types_join_part - - Title : make_types_join_part - Usage : ($string,@args) = $db->make_types_join_part(@args) - Function: create the JOIN portion of the SQL for fetching features type lists - Returns : query string and bind arguments - Args : see below - Status : protected - -This method is called by get_types() to generate the query fragment -and bind arguments for the JOIN part of the query that retrieves lists -of feature types. The four positional arguments are as follows: - - $refseq reference sequence name - $start start of region - $stop end of region - $want_count true to return the count of this feature type - -=cut - -sub make_types_join_part { - my $self = shift; - my ($srcseq,$start,$stop,$want_count) = @_; - my $query = defined($srcseq) || $want_count ? 'fdata.ftypeid=ftype.ftypeid' - : ''; - return $query || '1=1'; -} - -=head2 make_types_where_part - - Title : make_types_where_part - Usage : ($string,@args) = $db->make_types_where_part(@args) - Function: create the WHERE portion of the SQL for fetching features type lists - Returns : query string and bind arguments - Args : see below - Status : protected - -This method is called by get_types() to generate the query fragment -and bind arguments for the WHERE part of the query that retrieves -lists of feature types. The four positional arguments are as follows: - - $refseq reference sequence name - $start start of region - $stop end of region - $want_count true to return the count of this feature type - -=cut - -sub make_types_where_part { - my $self = shift; - my ($srcseq,$start,$stop,$want_count,$typelist) = @_; - my (@query,@args); - if (defined($srcseq)) { - push @query,'fdata.fref=?'; - push @args,$srcseq; - if (defined $start or defined $stop) { - $start = 1 unless defined $start; - $stop = MAX_SEGMENT unless defined $stop; - my ($q,@a) = $self->overlap_query($start,$stop); - push @query,"($q)"; - push @args,@a; - } - } - if (defined $typelist && @$typelist) { - my ($q,@a) = $self->types_query($typelist); - push @query,($q); - push @args,@a; - } - my $query = @query ? join(' AND ',@query) : '1=1'; - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -=head2 make_types_group_part - - Title : make_types_group_part - Usage : ($string,@args) = $db->make_types_group_part(@args) - Function: create the GROUP BY portion of the SQL for fetching features type lists - Returns : query string and bind arguments - Args : see below - Status : protected - -This method is called by get_types() to generate the query fragment -and bind arguments for the GROUP BY part of the query that retrieves -lists of feature types. 
The four positional arguments are as follows: - - $refseq reference sequence name - $start start of region - $stop end of region - $want_count true to return the count of this feature type - -=cut - -sub make_types_group_part { - my $self = shift; - my ($srcseq,$start,$stop,$want_count) = @_; - return unless $srcseq or $want_count; - return 'ftype.ftypeid,ftype.fmethod,ftype.fsource'; -} - - -=head2 get_feature_id - - Title : get_feature_id - Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid) - Function: get the ID of a feature - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature. It is ony needed when replacing a feature -with new information. - -=cut - -# this method is called when needed to look up a feature's ID -sub get_feature_id { - my $self = shift; - my ($ref,$start,$stop,$typeid,$groupid) = @_; - my $s = $self->{load_stuff}; - unless ($s->{get_feature_id}) { - my $dbh = $self->features_db; - $s->{get_feature_id} = - $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? AND gid=?'); - } - my $sth = $s->{get_feature_id} or return; - $sth->execute($ref,$start,$stop,$typeid,$groupid) or return; - my ($fid) = $sth->fetchrow_array; - return $fid; -} - - - -=head2 make_abscoord_query - - Title : make_abscoord_query - Usage : $sth = $db->make_abscoord_query($name,$class); - Function: create query that finds the reference sequence coordinates given a landmark & classa - Returns : a DBI statement handle - Args : name and class of landmark - Status : protected - -The statement handler should return rows containing five fields: - - 1. reference sequence name - 2. reference sequence class - 3. start position - 4. stop position - 5. strand ("+" or "-") - -This query always returns "Sequence" as the class of the reference -sequence. - -=cut - -# given sequence name, return (reference,start,stop,strand) -sub make_abscoord_query { - my $self = shift; - my ($name,$class,$refseq) = @_; - #my $query = GETSEQCOORDS; - my $query = $self->getseqcoords_query(); - my $getforcedseqcoords = $self->getforcedseqcoords_query() ; - if ($name =~ /\*/) { - $name =~ s/%/\\%/g; - $name =~ s/_/\\_/g; - $name =~ tr/*/%/; - $query =~ s/gname=\?/gname LIKE ?/; - } - defined $refseq ? $self->dbh->do_query($getforcedseqcoords,$name,$class,$refseq) - : $self->dbh->do_query($query,$name,$class); -} - -sub make_aliasabscoord_query { - my $self = shift; - my ($name,$class) = @_; - #my $query = GETALIASCOORDS; - my $query = $self->getaliascoords_query(); - if ($name =~ /\*/) { - $name =~ s/%/\\%/g; - $name =~ s/_/\\_/g; - $name =~ tr/*/%/; - $query =~ s/gname=\?/gname LIKE ?/; - } - $self->dbh->do_query($query,$name,$class); -} - -sub getseqcoords_query { - shift->throw("getseqcoords_query(): must be implemented by a subclass"); -} - -sub getaliascoords_query { - shift->throw("getaliascoords_query(): must be implemented by a subclass"); -} - -sub bin_query { - my $self = shift; - my ($start,$stop,$minbin,$maxbin) = @_; - if ($start && $start < 0 && $stop > 0) { # split the queries - my ($lower_query,@lower_args) = $self->_bin_query($start,0,$minbin,$maxbin); - my ($upper_query,@upper_args) = $self->_bin_query(0,$stop,$minbin,$maxbin); - my $query = "$lower_query\n\t OR $upper_query"; - my @args = (@lower_args,@upper_args); - return wantarray ? 
($query,@args) : $self->dbh->dbi_quote($query,@args); - } else { - return $self->_bin_query($start,$stop,$minbin,$maxbin); - } -} - -sub _bin_query { - my $self = shift; - my ($start,$stop,$minbin,$maxbin) = @_; - my ($query,@args); - - $start = 0 unless defined($start); - $stop = $self->meta('max_bin') unless defined($stop); - - my @bins; - $minbin = defined $minbin ? $minbin : $self->min_bin; - $maxbin = defined $maxbin ? $maxbin : $self->max_bin; - my $tier = $maxbin; - while ($tier >= $minbin) { - my ($tier_start,$tier_stop) = (bin_bot($tier,$start)-EPSILON(),bin_top($tier,$stop)+EPSILON()); - ($tier_start,$tier_stop) = ($tier_stop,$tier_start) if $tier_start > $tier_stop; # can happen when working with negative coordinates - if ($tier_start == $tier_stop) { - push @bins,'fbin=?'; - push @args,$tier_start; - } else { - push @bins,'fbin between ? and ?'; - push @args,($tier_start,$tier_stop); - } - $tier /= 10; - } - $query = join("\n\t OR ",@bins); - return wantarray ? ($query,@args) - : $self->dbh->dbi_quote($query,@args); -} - -# find features that overlap a given range -sub overlap_query { - my $self = shift; - my ($start,$stop) = @_; - - my ($query,@args); - my ($iq,@iargs) = $self->overlap_query_nobin($start,$stop); - if (OPTIMIZE) { - my ($bq,@bargs) = $self->bin_query($start,$stop); - $query = "($bq)\n\tAND $iq"; - @args = (@bargs,@iargs); - } - else { - $query = $iq; - @args = @iargs; - } - - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -# find features that are completely contained within a ranged -sub contains_query { - my $self = shift; - my ($start,$stop) = @_; - my ($bq,@bargs) = $self->bin_query($start,$stop,undef,bin($start,$stop,$self->min_bin)); - my ($iq,@iargs) = $self->contains_query_nobin($start,$stop); - my $query = "($bq)\n\tAND $iq"; - my @args = (@bargs,@iargs); - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -# find features that are completely contained within a range -sub contained_in_query { - my $self = shift; - my ($start,$stop) = @_; - my ($bq,@bargs) = $self->bin_query($start,$stop,abs($stop-$start)+1,undef); - my ($iq,@iargs) = $self->contained_in_query_nobin($start,$stop); - my $query = "($bq)\n\tAND $iq"; - my @args = (@bargs,@iargs); - return wantarray ? 
($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -# implement the _delete_fattribute_to_feature() method -sub _delete_fattribute_to_feature { - my $self = shift; - my @feature_ids = @_; - my $dbh = $self->features_db; - my $fields = join ',',map{$dbh->quote($_)} @feature_ids; - - my $query = "delete from fattribute_to_feature where fid in ($fields)"; - warn "$query\n" if $self->debug; - my $result = $dbh->do($query); - defined $result or $self->throw($dbh->errstr); - $result; -} - -# implement the _delete_features() method -sub _delete_features { - my $self = shift; - my @feature_ids = @_; - my $dbh = $self->features_db; - my $fields = join ',',map{$dbh->quote($_)} @feature_ids; - - # delete from fattribute_to_feature - $self->_delete_fattribute_to_feature(@feature_ids); - - my $query = "delete from fdata where fid in ($fields)"; - warn "$query\n" if $self->debug; - my $result = $dbh->do($query); - defined $result or $self->throw($dbh->errstr); - $result; -} - -# implement the _delete_groups() method -sub _delete_groups { - my $self = shift; - my @group_ids = @_; - my $dbh = $self->features_db; - my $fields = join ',',map{$dbh->quote($_)} @group_ids; - - foreach my $gid (@group_ids){ - my @features = $self->get_feature_by_gid($gid); - $self->delete_features(@features); - } - - my $query = "delete from fgroup where gid in ($fields)"; - warn "$query\n" if $self->debug; - my $result = $dbh->do($query); - defined $result or $self->throw($dbh->errstr); - $result; -} - -# implement the _delete() method -sub _delete { - my $self = shift; - my $delete_spec = shift; - my $ranges = $delete_spec->{segments} || []; - my $types = $delete_spec->{types} || []; - my $force = $delete_spec->{force}; - my $range_type = $delete_spec->{range_type}; - my $dbh = $self->features_db; - - my $query = 'delete from fdata'; - my @where; - - my @range_part; - for my $segment (@$ranges) { - my $ref = $dbh->quote($segment->abs_ref); - my $start = $segment->abs_start; - my $stop = $segment->abs_stop; - my $range = $range_type eq 'overlaps' ? $self->overlap_query($start,$stop) - : $range_type eq 'contains' ? $self->contains_query($start,$stop) - : $range_type eq 'contained_in' ? $self->contained_in_query($start,$stop) - : $self->throw("Invalid range type '$range_type'"); - push @range_part,"(fref=$ref AND $range)"; - } - push @where,'('. join(' OR ',@range_part).')' if @range_part; - - # get all the types - if (@$types) { - my $types_where = $self->types_query($types); - my $types_query = "select ftypeid from ftype where $types_where"; - my $result = $dbh->selectall_arrayref($types_query); - my @typeids = map {$_->[0]} @$result; - my $typelist = join ',',map{$dbh->quote($_)} @typeids; - $typelist ||= "0"; # don't cause DBI to die with invalid SQL when - # unknown feature types were requested. - push @where,"(ftypeid in ($typelist))"; - } - $self->throw("This operation would delete all feature data and -force not specified") - unless @where || $force; - $query .= " where ".join(' and ',@where) if @where; - warn "$query\n" if $self->debug; - my $result = $dbh->do($query); - - defined $result or $self->throw($dbh->errstr); - $result; -} - - -=head2 feature_summary - - Title : feature_summary - Usage : $summary = $db->feature_summary(@args) - Function: returns a coverage summary across indicated region/type - Returns : a Bio::SeqFeatureI object containing the "coverage" tag - Args : see below - Status : public - -This method is used to get coverage density information across a -region of interest. 
You provide it with a region of interest, optional -a list of feature types, and a count of the number of bins over which -you want to calculate the coverage density. An object is returned -corresponding to the requested region. It contains a tag called -"coverage" that will return an array ref of "bins" length. Each -element of the array describes the number of features that overlap the -bin at this position. - -Arguments: - - Argument Description - -------- ----------- - - -seq_id Sequence ID for the region - -start Start of region - -end End of region - -type/-types Feature type of interest or array ref of types - -bins Number of bins across region. Defaults to 1000. - -iterator Return an iterator across the region - -Note that this method uses an approximate algorithm that is only -accurate to 500 bp, so when dealing with bins that are smaller than -1000 bp, you may see some shifting of counts between adjacent bins. - -Although an -iterator option is provided, the method only ever returns -a single feature, so this is fairly useless. - -=cut - - -sub feature_summary { - my $self = shift; - my ($seq_name,$start,$end,$types,$bins,$iterator) = - rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'], - ['TYPES','TYPE','PRIMARY_TAG'], - 'BINS', - 'ITERATOR', - ],@_); - my ($coverage,$tag) = $self->coverage_array(-seqid=> $seq_name, - -start=> $start, - -end => $end, - -type => $types, - -bins => $bins) or return; - my $score = 0; - for (@$coverage) { $score += $_ } - $score /= @$coverage; - - my $feature = Bio::SeqFeature::Lite->new(-seq_id => $seq_name, - -start => $start, - -end => $end, - -type => $tag, - -score => $score, - -attributes => - { coverage => [$coverage] }); - return $iterator - ? Bio::DB::GFF::FeatureIterator->new($feature) - : $feature; -} - -=head2 coverage_array - - Title : coverage_array - Usage : $arrayref = $db->coverage_array(@args) - Function: returns a coverage summary across indicated region/type - Returns : an array reference - Args : see below - Status : public - -This method is used to get coverage density information across a -region of interest. The arguments are identical to feature_summary, -except that instead of returning a Bio::SeqFeatureI object, it returns -an array reference of the desired number of bins. The value of each -element corresponds to the number of features in the bin. - -Arguments: - - Argument Description - -------- ----------- - - -seq_id Sequence ID for the region - -start Start of region - -end End of region - -type/-types Feature type of interest or array ref of types - -bins Number of bins across region. Defaults to 1000. - -Note that this method uses an approximate algorithm that is only -accurate to 500 bp, so when dealing with bins that are smaller than -1000 bp, you may see some shifting of counts between adjacent bins. 
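As a hedged illustration (not part of the original documentation), a call
might look like the following; the adaptor, database name, region and
feature type are invented for the example:

  use Bio::DB::GFF;

  # connect through the DBI/mysql adaptor; the database name is hypothetical
  my $db = Bio::DB::GFF->new(-adaptor => 'dbi::mysql',
                             -dsn     => 'dbi:mysql:elegans');

  # 100 bins of coverage for one feature type across a 1 Mb region
  my ($bins,$tag) = $db->coverage_array(-seq_id => 'CHROMOSOME_II',
                                        -start  => 1,
                                        -end    => 1_000_000,
                                        -type   => 'PCR_product:GenePairs',
                                        -bins   => 100);

  # each element is the number of features overlapping that bin
  print "$tag bin $_: $bins->[$_]\n" for 0 .. $#$bins;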
- -=cut - -sub coverage_array { - my $self = shift; - my ($seq_name,$start,$end,$types,$bins) = - rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'], - ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_); - - $types = $self->parse_types($types); - my $dbh = $self->features_db; - - $bins ||= 1000; - $start ||= 1; - unless ($end) { - my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name"); - $end = $segment->end; - } - - my $binsize = ($end-$start+1)/$bins; - my $seqid = $seq_name; - - return [] unless $seqid; - - # where each bin starts - my @his_bin_array = map {$start + $binsize * $_} (0..$bins); - my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array; - - my $interval_stats_table = 'finterval_stats'; - - # pick up the type ids - my ($type_from,@a) = $self->types_query($types); - my $query = "select ftypeid,fmethod,fsource from ftype where $type_from"; - my $sth = $dbh->prepare_delayed($query); - my (@t,$report_tag); - $sth->execute(@a); - while (my ($t,$method,$source) = $sth->fetchrow_array) { - $report_tag ||= "$method:$source"; - push @t,$t; - } - - - my %bins; - my $sql = <= ? - LIMIT 1 -END -; - $sth = $dbh->prepare_delayed($sql) or warn $dbh->errstr; - eval { - for my $typeid (@t) { - - for (my $i=0;$i<@sum_bin_array;$i++) { - - my @args = ($typeid,$seqid,$sum_bin_array[$i]); - $self->_print_query($sql,@args) if $self->debug; - - $sth->execute(@args) or $self->throw($sth->errstr); - my ($bin,$cum_count) = $sth->fetchrow_array; - push @{$bins{$typeid}},[$bin,$cum_count]; - } - } - }; - return unless %bins; - - my @merged_bins; - my $firstbin = int(($start-1)/$binsize); - for my $type (keys %bins) { - my $arry = $bins{$type}; - my $last_count = $arry->[0][1]; - my $last_bin = -1; - my $i = 0; - my $delta; - for my $b (@$arry) { - my ($bin,$count) = @$b; - $delta = $count - $last_count if $bin > $last_bin; - $merged_bins[$i++] = $delta; - $last_count = $count; - $last_bin = $bin; - } - } - - return wantarray ? (\@merged_bins,$report_tag) : \@merged_bins; -} - - -=head2 build_summary_statistics - - Title : build_summary_statistics - Usage : $db->build_summary_statistics - Function: prepares the table needed to call feature_summary() - Returns : nothing - Args : none - Status : public - -This method is used to build the summary statistics table that is used -by the feature_summary() and coverage_array() methods. It needs to be -called whenever the database is updated. - -=cut - -sub build_summary_statistics { - my $self = shift; - my $interval_stats_table = 'finterval_stats'; - my $dbh = $self->dbh; - $dbh->begin_work; - - my $sbs = SUMMARY_BIN_SIZE; - - my $result = eval { - $self->_add_interval_stats_table; - $self->disable_keys($interval_stats_table); - $dbh->do("DELETE FROM $interval_stats_table"); - - my $insert = $dbh->prepare(<throw($dbh->errstr); -INSERT INTO $interval_stats_table - (ftypeid,fref,fbin,fcum_count) - VALUES (?,?,?,?) -END - -; - - my $sql = 'select ftypeid,fref,fstart,fstop from fdata order by ftypeid,fref,fstart'; - my $select = $dbh->prepare($sql) or $self->throw($dbh->errstr); - - my $current_bin = -1; - my ($current_type,$current_seqid,$count); - my $cum_count = 0; - my (%residuals,$last_bin); - - my $le = -t \*STDERR ? 
"\r" : "\n"; - - $select->execute; - - while (my($typeid,$seqid,$start,$end) = $select->fetchrow_array) { - print STDERR $count," features processed$le" if ++$count % 1000 == 0; - - my $bin = int($start/$sbs); - $current_type ||= $typeid; - $current_seqid ||= $seqid; - - # because the input is sorted by start, no more features will contribute to the - # current bin so we can dispose of it - if ($bin != $current_bin) { - if ($seqid != $current_seqid or $typeid != $current_type) { - # load all bins left over - $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid); - %residuals = () ; - $cum_count = 0; - } else { - # load all up to current one - $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin); - } - } - - $last_bin = $current_bin; - ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin); - - # summarize across entire spanned region - my $last_bin = int(($end-1)/$sbs); - for (my $b=$bin;$b<=$last_bin;$b++) { - $residuals{$b}++; - } - } - # handle tail case - # load all bins left over - $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid); - $self->enable_keys($interval_stats_table); - 1; - }; - - if ($result) { $dbh->commit } else { warn "Can't build summary statistics: $@"; $dbh->rollback }; - print STDERR "\n"; -} - -sub _load_bins { - my $self = shift; - my ($insert,$residuals,$cum_count,$type,$seqid,$stop_after) = @_; - for my $b (sort {$a<=>$b} keys %$residuals) { - last if defined $stop_after and $b > $stop_after; - $$cum_count += $residuals->{$b}; - my @args = ($type,$seqid,$b,$$cum_count); - $insert->execute(@args) or warn $insert->errstr; - delete $residuals->{$b}; # no longer needed - } -} - -sub _add_interval_stats_table { - my $self = shift; - my $schema = $self->schema; - my $create_table_stmt = $schema->{'finterval_stats'}{'table'}; - my $dbh = $self->features_db; - $dbh->do("drop table finterval_stats"); - $dbh->do($create_table_stmt) || warn $dbh->errstr; -} - -sub disable_keys { } # noop -sub enable_keys { } # noop - -1; - -__END__ - -=head1 BUGS - -Schemas need work to support multiple hierarchical groups. - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm b/lib/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm deleted file mode 100644 index 4a348d45f..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/caching_handle.pm +++ /dev/null @@ -1,273 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::caching_handle; - -use strict; -use DBI; -use vars '$AUTOLOAD'; -use base qw(Bio::Root::Root); - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::caching_handle -- Cache for database handles - -=head1 SYNOPSIS - - use Bio::DB::GFF::Adaptor::dbi::caching_handle; - $db = Bio::DB::GFF::Adaptor::dbi::caching_handle->new('dbi:mysql:test'); - $sth = $db->prepare('select * from foo'); - @h = $sth->fetch_rowarray; - $sth->finish - -=head1 DESCRIPTION - -This module handles a pool of database handles. It was motivated by -the MYSQL driver's {mysql_use_result} attribute, which dramatically -improves query speed and memory usage, but forbids additional query -statements from being evaluated while an existing one is in use. - -This module is a plug-in replacement for vanilla DBI. 
It -automatically activates the {mysql_use_result} attribute for the mysql -driver, but avoids problems with multiple active statement handlers by -creating new database handles as needed. - -=head1 USAGE - -The object constructor is -Bio::DB::GFF::Adaptor::dbi::caching_handle-Enew(). This is called -like DBI-Econnect() and takes the same arguments. The returned object -looks and acts like a conventional database handle. - -In addition to all the standard DBI handle methods, this package adds -the following: - -=head2 dbi_quote - - Title : dbi_quote - Usage : $string = $db->dbi_quote($sql,@args) - Function: perform bind variable substitution - Returns : query string - Args : the query string and bind arguments - Status : public - -This method replaces the bind variable "?" in a SQL statement with -appropriately quoted bind arguments. It is used internally to handle -drivers that don't support argument binding. - -=head2 do_query - - Title : do_query - Usage : $sth = $db->do_query($query,@args) - Function: perform a DBI query - Returns : a statement handler - Args : query string and list of bind arguments - Status : Public - -This method performs a DBI prepare() and execute(), returning a -statement handle. You will typically call fetch() of fetchrow_array() -on the statement handle. The parsed statement handle is cached for -later use. - -=head2 debug - - Title : debug - Usage : $debug = $db->debug([$debug]) - Function: activate debugging messages - Returns : current state of flag - Args : optional new setting of flag - Status : public - -=cut - -sub new { - my $class = shift; - my @dbi_args = @_; - my $self = bless { - dbh => [], - args => \@dbi_args, - debug => 0, - },$class; - $self->dbh || $self->throw("Can't connect to database: " . DBI->errstr); - $self; -} - -sub AUTOLOAD { - my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; - return if $func_name eq 'DESTROY'; - my $self = shift or return DBI->$func_name(@_); - $self->dbh->$func_name(@_); -} - -sub debug { - my $self = shift; - my $d = $self->{debug}; - $self->{debug} = shift if @_; - $d; -} - -sub prepare { - my $self = shift; - my $query = shift; - - # find a non-busy dbh - my $dbh = $self->dbh || $self->throw("Can't connect to database: " . DBI->errstr); - - warn "Using prepare_cache\n" if $self->debug; - my $sth = $dbh->prepare_cached($query, {}, 3) || $self->throw("Couldn't prepare query $query:\n ".DBI->errstr."\n"); - return $sth; -} - -sub do_query { - my $self = shift; - my ($query,@args) = @_; - warn $self->dbi_quote($query,@args),"\n" if $self->debug; - my $sth = $self->prepare($query); - $sth->execute(@args) || $self->throw("Couldn't execute query $query:\n ".DBI->errstr."\n"); - $sth; -} - -sub dbh { - my $self = shift; - foreach (@{$self->{dbh}}) { - return $_ if $_->inuse == 0; - } - # if we get here, we must create a new one - warn "(Re)connecting to database\n" if $self->debug; - my $dbh = DBI->connect(@{$self->{args}}) or return; - - $dbh->{PrintError} = 0; - - # for Oracle - to retrieve LOBs, need to define the length (Jul 15, 2002) - $dbh->{LongReadLen} = 100*65535; - $dbh->{LongTruncOk} = 0; - $dbh->{mysql_auto_reconnect} = 1; - - my $wrapper = Bio::DB::GFF::Adaptor::dbi::faux_dbh->new($dbh); - push @{$self->{dbh}},$wrapper; - $wrapper; -} - -# The clone method should only be called in child processes after a fork(). 
-# It does two things: (1) it sets the "real" dbh's InactiveDestroy to 1, -# thereby preventing the database connection from being destroyed in -# the parent when the dbh's destructor is called; (2) it replaces the -# "real" dbh with the result of dbh->clone(), so that we now have an -# independent handle. -sub clone { - my $self = shift; - foreach (@{$self->{dbh}}) { $_->clone }; -} - -=head2 attribute - - Title : attribute - Usage : $value = $db->attribute(AttributeName , [$newvalue]) - Function: get/set DBI::db handle attribute - Returns : current state of the attribute - Args : name of the attribute and optional new setting of attribute - Status : public - - Under Bio::DB::GFF::Adaptor::dbi::caching_handle the DBI::db - attributes that are usually set using hashref calls are unavailable. - Use attribute() instead. For example, instead of: - - $dbh->{AutoCommit} = 0; - - use - - $dbh->attribute(AutoCommit=>0); - -=cut - -sub attribute { - my $self = shift; - my $dbh = $self->dbh->{dbh}; - return $dbh->{$_[0]} = $_[1] if @_ == 2; - return $dbh->{$_[0]} if @_ == 1; - return; -} - -sub disconnect { - my $self = shift; - $_ && $_->disconnect foreach @{$self->{dbh}}; - $self->{dbh} = []; -} - -sub dbi_quote { - my $self = shift; - my ($query,@args) = @_; - my $dbh = $self->dbh; - $query =~ s/\?/$dbh->quote(shift @args)/eg; - $query; -} - -package Bio::DB::GFF::Adaptor::dbi::faux_dbh; -use vars '$AUTOLOAD'; - -sub new { - my $class = shift; - my $dbh = shift; - bless {dbh=>$dbh},$class; -} - -sub prepare { - my $self = shift; - my $sth = $self->{dbh}->prepare(@_) or return; - $sth->{mysql_use_result} = 1 if $self->{dbh}->{Driver}{Name} eq 'mysql'; - $sth; -} - -sub prepare_delayed { - my $self = shift; - my $sth = $self->{dbh}->prepare(@_) or return; - $sth; -} - -sub inuse { - shift->{dbh}->{ActiveKids}; -} - -# The clone method should only be called in child processes after a fork(). -# It does two things: (1) it sets the "real" dbh's InactiveDestroy to 1, -# thereby preventing the database connection from being destroyed in -# the parent when the dbh's destructor is called; (2) it replaces the -# "real" dbh with the result of dbh->clone(), so that we now have an -# independent handle. -sub clone { - my $self = shift; - $self->{dbh}{InactiveDestroy} = 1; - $self->{dbh} = $self->{dbh}->clone; -} - -sub DESTROY { } - -sub AUTOLOAD { - my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; - return if $func_name eq 'DESTROY'; - my $self = shift; - if( defined $self->{dbh} ) { - $self->{dbh}->$func_name(@_); - } -} - -1; - -__END__ - -=head1 BUGS - -Report to the author. - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/iterator.pm b/lib/Bio/DB/GFF/Adaptor/dbi/iterator.pm deleted file mode 100644 index 476378721..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/iterator.pm +++ /dev/null @@ -1,74 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::iterator - iterator for Bio::DB::GFF::Adaptor::dbi - -=head1 SYNOPSIS - -For internal use only - -=head1 DESCRIPTION - -This is an internal module that is used by the Bio::DB::GFF DBI -adaptor to return an iterator across a sequence feature query. The -object has a single method, next_feature(), that returns the next -feature from the query. The method next_seq() is an alias for -next_feature(). 
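A minimal sketch of how an adaptor might drive this iterator follows; the
DSN, SQL and callback are assumptions made up for the example and are not
taken from the adaptor code:

  use DBI;
  use Bio::DB::GFF::Adaptor::dbi::iterator;

  my $dbh = DBI->connect('dbi:mysql:elegans') or die DBI->errstr;  # hypothetical DSN
  my $sth = $dbh->prepare('SELECT fref,fstart,fstop FROM fdata');
  $sth->execute;

  # the callback receives one row at a time and returns an array ref of
  # "features" (here just the raw row), or nothing once the rows run out
  my $iterator = Bio::DB::GFF::Adaptor::dbi::iterator->new(
      $sth,
      sub { return @_ ? [ [ @_ ] ] : undef },
  );

  while (my $feature = $iterator->next_feature) {
      my ($ref,$start,$stop) = @$feature;
      # ... use one feature at a time without loading the whole result set ...
  }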
- -=head1 BUGS - -None known yet. - -=head1 SEE ALSO - -L, - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -package Bio::DB::GFF::Adaptor::dbi::iterator; -use strict; -use Bio::Root::Version; - -use constant STH => 0; -use constant CALLBACK => 1; -use constant CACHE => 2; - -*next_seq = \&next_feature; - -sub new { - my $class = shift; - my ($sth,$callback) = @_; - return bless [$sth,$callback,[]],$class; -} - -sub next_feature { - my $self = shift; - return shift @{$self->[CACHE]} if @{$self->[CACHE]}; - my $sth = $self->[STH] or return; - my $callback = $self->[CALLBACK]; - - my $features; - while (1) { - if (my @row = $sth->fetchrow_array) { - $features = $callback->(@row); - last if $features; - } else { - $sth->finish; - undef $self->[STH]; - $features = $callback->(); - last; - } - } - $self->[CACHE] = $features or return; - shift @{$self->[CACHE]}; -} - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/mysql.pm b/lib/Bio/DB/GFF/Adaptor/dbi/mysql.pm deleted file mode 100644 index cb43ceedd..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/mysql.pm +++ /dev/null @@ -1,909 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::mysql; - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::mysql -- Database adaptor for a specific mysql schema - -=head1 SYNOPSIS - -See L - -=cut - -# a simple mysql adaptor -use strict; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::GFF::Util::Binning; -use base qw(Bio::DB::GFF::Adaptor::dbi); - -use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get - -use constant GETSEQCOORDS =><<<< < select * from fgroup where gname='sjj_2L52.1'; - +-------+-------------+------------+ - | gid | gclass | gname | - +-------+-------------+------------+ - | 69736 | PCR_product | sjj_2L52.1 | - +-------+-------------+------------+ - 1 row in set (0.70 sec) - - mysql> select fref,fstart,fstop from fdata,fgroup - where gclass='PCR_product' and gname = 'sjj_2L52.1' - and fdata.gid=fgroup.gid; - +---------------+--------+-------+ - | fref | fstart | fstop | - +---------------+--------+-------+ - | CHROMOSOME_II | 1586 | 2355 | - +---------------+--------+-------+ - 1 row in set (0.03 sec) - -=item ftype - -This table contains the feature types, one per row. Columns are: - - ftypeid the feature type ID (integer) - fmethod the feature type method name (string) - fsource the feature type source name (string) - -The ftype.ftypeid field joins with the fdata.ftypeid field. Example: - - mysql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype - where gclass='PCR_product' - and gname = 'sjj_2L52.1' - and fdata.gid=fgroup.gid - and fdata.ftypeid=ftype.ftypeid; - +---------------+--------+-------+-------------+-----------+ - | fref | fstart | fstop | fmethod | fsource | - +---------------+--------+-------+-------------+-----------+ - | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs | - +---------------+--------+-------+-------------+-----------+ - 1 row in set (0.08 sec) - -=item fdna - -This table holds the raw DNA of the reference sequences. It has three -columns: - - fref reference sequence name (string) - foffset offset of this sequence - fdna the DNA sequence (longblob) - -To overcome problems loading large blobs, DNA is automatically -fragmented into multiple segments when loading, and the position of -each segment is stored in foffset. 
The fragment size is controlled by -the -clump_size argument during initialization. - -=item fattribute_to_feature - -This table holds "attributes", which are tag/value pairs stuffed into -the GFF line. The first tag/value pair is treated as the group, and -anything else is treated as an attribute (weird, huh?). - - CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B" - CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B" - -The columns of this table are: - - fid feature ID (integer) - fattribute_id ID of the attribute (integer) - fattribute_value text of the attribute (text) - -The fdata.fid column joins with fattribute_to_feature.fid. - -=item fattribute - -This table holds the normalized names of the attributes. Fields are: - - fattribute_id ID of the attribute (integer) - fattribute_name Name of the attribute (varchar) - -=back - -=head2 Data Loading Methods - -In addition to implementing the abstract SQL-generating methods of -Bio::DB::GFF::Adaptor::dbi, this module also implements the data -loading functionality of Bio::DB::GFF. - -=cut - - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(@args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : see below - Status : Public - -The new constructor is identical to the "dbi" adaptor's new() method, -except that the prefix "dbi:mysql" is added to the database DSN identifier -automatically if it is not there already. - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' or "ens0040" - - -user username for authentication - - -pass the password for authentication - -=cut - -#' - -sub new { - my $class = shift; - my ($dsn,$other) = rearrange([ - [qw(FEATUREDB DB DSN)], - ],@_); - $dsn = "dbi:mysql:$dsn" if !ref($dsn) && $dsn !~ /^(?:dbi|DBI):/; - my $self = $class->SUPER::new(-dsn=>$dsn,%$other); - $self; -} - -=head2 get_dna - - Title : get_dna - Usage : $string = $db->get_dna($name,$start,$stop,$class) - Function: get DNA string - Returns : a string - Args : name, class, start and stop of desired segment - Status : Public - -This method performs the low-level fetch of a DNA substring given its -name, class and the desired range. This should probably be moved to -the parent class. 
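A short, hypothetical usage sketch (the landmark and coordinates are
invented, not taken from the module):

  # $db is a Bio::DB::GFF database opened with the dbi::mysql adaptor
  my $dna = $db->get_dna('CHROMOSOME_II', 1_586, 2_355, 'Sequence');
  print length($dna), " bp, starting ", substr($dna, 0, 12), "...\n";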
- -=cut - -sub getseqcoords_query { - my $self = shift; - return GETSEQCOORDS ; -} - -sub getaliascoords_query{ - my $self = shift; - return GETALIASCOORDS ; -} - - -sub getforcedseqcoords_query{ - my $self = shift; - return GETFORCEDSEQCOORDS ; -} - - -sub getaliaslike_query{ - my $self = shift; - return GETALIASLIKE ; -} - - -# override parent -sub get_abscoords_bkup { - my $self = shift; - my ($name,$class,$refseq) = @_; - - my $result = $self->SUPER::get_abscoords(@_); - return $result if $result; - - my $sth; - if ($name =~ s/\*/%/g) { - $sth = $self->dbh->do_query(GETALIASLIKE,$name,$class); - } else { - $sth = $self->dbh->do_query(GETALIASCOORDS,$name,$class); - } - my @result; - while (my @row = $sth->fetchrow_array) { push @result,\@row } - $sth->finish; - - if (@result == 0) { - $self->error("$name not found in database"); - return; - } else { - return \@result; - } - -} - - - -sub make_features_select_part { - my $self = shift; - my $options = shift || {}; - my $s; - if (my $b = $options->{bin_width}) { - - $s = <{attributes} && keys %{$options->{attributes}}>1; - $s; -} - - -# IMPORTANT NOTE: -# WHETHER OR NOT THIS WORKS IS CRITICALLY DEPENDENT ON THE RELATIVE MAGNITUDE OF THE -sub make_features_from_part { - my $self = shift; - my $sparse_types = shift; - my $options = shift || {}; - my $sparse_groups = $options->{sparse_groups}; - my $index = $sparse_groups ? ' USE INDEX(gid)' - : $sparse_types ? ' USE INDEX(ftypeid)' - : ''; - return $options->{attributes} ? "fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n" - : "fdata${index},ftype,fgroup\n"; -} - -=head2 search_notes - - Title : search_notes - Usage : @search_results = $db->search_notes("full text search string",$limit) - Function: Search the notes for a text string, using mysql full-text search - Returns : array of results - Args : full text search string, and an optional row limit - Status : public - -This is a mysql-specific method. Given a search string, it performs a -full-text search of the notes table and returns an array of results. -Each row of the returned array is a arrayref containing the following fields: - - column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() - column 2 The text of the note - column 3 A relevance score. 
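For instance (a hedged sketch; the query string, limit and output format
are invented):

  my @hits = $db->search_notes('polymorphism', 20);
  for my $hit (@hits) {
      my ($featname, $note, $relevance) = @$hit;
      print join("\t", $featname, $relevance, $note), "\n";
      # $featname can be handed to $db->segment() to fetch the region
  }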
- -=cut - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - $search_string =~ tr/*?//d; - - my $query = FULLTEXTSEARCH; - $query .= " limit $limit" if defined $limit; - my $sth = $self->dbh->do_query($query,$search_string,$search_string); - my @results; - while (my ($class,$name,$note,$relevance,$method,$source) = $sth->fetchrow_array) { - next unless $class && $name; # sorry, ignore NULL objects - $relevance = sprintf("%.2f",$relevance); # trim long floats - my $featname = Bio::DB::GFF::Featname->new($class=>$name); - my $type = Bio::DB::GFF::Typename->new($method,$source); - push @results,[$featname,$note,$relevance,$type]; - } - - #added result filtering so that this method returns the expected results - #this section of code used to be in GBrowse's do_keyword_search method - - my $match_sub = 'sub {'; - foreach (split /\s+/,$search_string) { - $match_sub .= "return unless \$_[0] =~ /\Q$_\E/i; "; - } - $match_sub .= "};"; - my $match = eval $match_sub; - - my @matches = grep { $match->($_->[1]) } @results; - - return @matches; -} - - - -################################ loading and initialization ################################## - -=head2 schema - - Title : schema - Usage : $schema = $db->schema - Function: return the CREATE script for the schema - Returns : a list of CREATE statemetns - Args : none - Status : protected - -This method returns a list containing the various CREATE statements -needed to initialize the database tables. - -=cut - -sub schema { - my $self = shift; - my $dbh = $self->dbh; - my ($version) = $dbh->selectrow_array('select version()'); - my ($major, $minor) = split /\./, $version; - $version = "$major.$minor"; - my $engine = $version >= 4.1 ? 'ENGINE' : 'TYPE'; - my %schema = ( - fdata =>{ -table=> qq{ - create table fdata ( - fid int not null auto_increment, - fref varchar(100) not null, - fstart int not null, - fstop int not null, - fbin double precision, - ftypeid int not null, - fscore float, - fstrand enum('+','-'), - fphase enum('0','1','2'), - gid int not null, - ftarget_start int, - ftarget_stop int, - primary key(fid), - unique index(fref,fbin,fstart,fstop,ftypeid,gid), - index(ftypeid), - index(gid) - ) $engine=MyISAM -} # fdata table -}, # fdata - - fgroup =>{ -table=> qq{ -create table fgroup ( - gid int not null auto_increment, - gclass varchar(100), - gname varchar(100), - primary key(gid), - unique(gclass,gname) -) $engine=MyISAM -} -}, - - ftype => { -table=> qq{ -create table ftype ( - ftypeid int not null auto_increment, - fmethod varchar(100) not null, - fsource varchar(100), - primary key(ftypeid), - index(fmethod), - index(fsource), - unique ftype (fmethod,fsource) -) $engine=MyISAM -} #ftype table -}, #ftype - - fdna => { -table=> qq{ -create table fdna ( - fref varchar(100) not null, - foffset int(10) unsigned not null, - fdna longblob, - primary key(fref,foffset) -) $engine=MyISAM -} # fdna table -},#fdna - - fmeta => { -table=> qq{ -create table fmeta ( - fname varchar(255) not null, - fvalue varchar(255) not null, - primary key(fname) -) $engine=MyISAM -} # fmeta table -},#fmeta - - fattribute => { -table=> qq{ -create table fattribute ( - fattribute_id int(10) unsigned not null auto_increment, - fattribute_name varchar(255) not null, - primary key(fattribute_id) -) $engine=MyISAM -} #fattribute table -},#fattribute - - fattribute_to_feature => { -table=> qq{ -create table fattribute_to_feature ( - fid int(10) not null, - fattribute_id int(10) not null, - fattribute_value text, - key(fid,fattribute_id), - 
key(fattribute_value(48)), - fulltext(fattribute_value) -) $engine=MyISAM -} # fattribute_to_feature table -},# fattribute_to_feature - - finterval_stats => { -table=> qq{ -create table finterval_stats ( - ftypeid integer not null, - fref varchar(100) not null, - fbin integer not null, - fcum_count integer not null, - primary key(ftypeid,fref,fbin) -) $engine=MyISAM -} # finterval_stats table -},# finterval_stats - -); - return \%schema; -} - - - -=head2 make_classes_query - - Title : make_classes_query - Usage : ($query,@args) = $db->make_classes_query - Function: return query fragment for generating list of reference classes - Returns : a query and args - Args : none - Status : public - -=cut - -sub make_classes_query { - my $self = shift; - return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT ISNULL(gclass)'; -} - - -=head2 make_meta_set_query - - Title : make_meta_set_query - Usage : $sql = $db->make_meta_set_query - Function: return SQL fragment for setting a meta parameter - Returns : SQL fragment - Args : none - Status : public - -By default this does nothing; meta parameters are not stored or -retrieved. - -=cut - -sub make_meta_set_query { - return 'REPLACE INTO fmeta VALUES (?,?)'; -} - -=head2 setup_load - - Title : setup_load - Usage : $db->setup_load - Function: called before load_gff_line() - Returns : void - Args : none - Status : protected - -This method performs schema-specific initialization prior to loading a -set of GFF records. It prepares a set of DBI statement handlers to be -used in loading the data. - -=cut - -sub setup_load { - my $self = shift; - - my $dbh = $self->features_db; - - if ($self->lock_on_load) { - my @tables = map { "$_ WRITE"} $self->tables; - my $tables = join ', ',@tables; - $dbh->do("LOCK TABLES $tables"); - } -# for my $table (qw(fdata)) { -# $dbh->do("alter table $table disable keys"); -# } - - my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?'); - my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)'); - - my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE gname=? 
AND gclass=?'); - my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)'); - - my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?'); - my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)'); - my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)'); - - my $insert_data = $dbh->prepare_delayed(<{load_stuff}{sth}{lookup_ftype} = $lookup_type; - $self->{load_stuff}{sth}{insert_ftype} = $insert_type; - $self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group; - $self->{load_stuff}{sth}{insert_fgroup} = $insert_group; - $self->{load_stuff}{sth}{insert_fdata} = $insert_data; - $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute; - $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute; - $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value; - $self->{load_stuff}{types} = {}; - $self->{load_stuff}{groups} = {}; - $self->{load_stuff}{counter} = 0; -} - -=head2 load_gff_line - - Title : load_gff_line - Usage : $db->load_gff_line($fields) - Function: called to load one parsed line of GFF - Returns : true if successfully inserted - Args : hashref containing GFF fields - Status : protected - -This method is called once per line of the GFF and passed a series of -parsed data items that are stored into the hashref $fields. The keys are: - - ref reference sequence - source annotation source - method annotation method - start annotation start - stop annotation stop - score annotation score (may be undef) - strand annotation strand (may be undef) - phase annotation phase (may be undef) - group_class class of annotation's group (may be undef) - group_name ID of annotation's group (may be undef) - target_start start of target of a similarity hit - target_stop stop of target of a similarity hit - attributes array reference of attributes, each of which is a [tag=>value] array ref - -=cut - -sub load_gff_line { - my $self = shift; - my $gff = shift; - - my $s = $self->{load_stuff}; - my $dbh = $self->features_db; - local $dbh->{PrintError} = 0; - - defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return; - defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname} => $gff->{gclass})) or return; - - if ($gff->{stop}-$gff->{start}+1 > $self->max_bin) { - warn "$gff->{gclass}:$gff->{gname} is ",$gff->{stop}-$gff->{start}+1, - " bp long, but the maximum indexable feature is set to ",$self->max_bin," bp.\n"; - warn "Please set the maxbin value to a length at least as large as the largest feature you wish to store.\n"; - warn "\n* You will need to reinitialize the database from scratch.\n"; - warn "* With the Perl API you do this using the -max_bin argument to \$db->initialize().\n"; - warn "* With the command-line tools you do with this with --maxfeature option.\n"; - } - - my $bin = bin($gff->{start},$gff->{stop},$self->min_bin); - my $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, - $gff->{start},$gff->{stop},$bin, - $typeid, - $gff->{score},$gff->{strand},$gff->{phase}, - $groupid, - $gff->{tstart},$gff->{tstop}); - - warn $dbh->errstr,"\n" && return unless $result; - - my $fid = $dbh->{mysql_insertid} - || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid); - - - # insert attributes - foreach (@{$gff->{attributes}}) { - defined(my $attribute_id = 
$self->get_table_id('fattribute',$_->[0])) or return; - $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]); - } - - if ( (++$s->{counter} % 1000) == 0) { - print STDERR "$s->{counter} records loaded..."; - print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; - } - - $fid; -} - -sub finish_load { - my $self = shift; - my $dbh = $self->features_db; - local $dbh->{PrintError} = 0; -# for my $table (qw(fdata)) { -# $dbh->do("alter table $table enable keys"); -# } - $self->SUPER::finish_load; -} - - -sub insert_sequence { - my $self = shift; - my($id,$offset,$seq) = @_; - my $sth = $self->{_insert_sequence} - ||= $self->dbh->prepare_delayed('replace into fdna values (?,?,?)'); - $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr); -} - - -=head2 get_table_id - - Title : get_table_id - Usage : $integer = $db->get_table_id($table,@ids) - Function: get the ID of a group or type - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature type or group. The arguments are the name -of the table, and two string identifiers. For feature types, the -identifiers are the method and source. For groups, the identifiers -are group name and class. - -This method requires that a statement handler named I, -have been created previously by setup_load(). It is here to overcome -deficiencies in mysql's INSERT syntax. - -=cut - -#' -# get the object ID from a named table -sub get_table_id { - my $self = shift; - my $table = shift; - my @ids = @_; - - # irritating warning for null id - my $id_key; - { - local $^W=0; - $id_key = join ':',@ids; - } - - my $s = $self->{load_stuff}; - my $sth = $s->{sth}; - my $dbh = $self->features_db; - - unless (defined($s->{$table}{$id_key})) { - - ######################################### - # retrieval of the last inserted id is now located at the adaptor and not in caching_handle - ####################################### - if ( (my $result = $sth->{"lookup_$table"}->execute(@ids)) > 0) { - $s->{$table}{$id_key} = ($sth->{"lookup_$table"}->fetchrow_array)[0]; - } else { - $sth->{"insert_$table"}->execute(@ids) - && ($s->{$table}{$id_key} = $self->insertid($sth->{"insert_$table"})); - #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}{sth}{mysql_insertid}); - #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid); - } - } - - my $id = $s->{$table}{$id_key}; - unless (defined $id) { - warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n"; - return; - } - $id; -} - -sub insertid { - my $self = shift; - my $s = shift ; - $s->{mysql_insertid}; -} - - -=head2 get_feature_id - - Title : get_feature_id - Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid) - Function: get the ID of a feature - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature. It is ony needed when replacing a feature -with new information. - -=cut - -# this method is called when needed to look up a feature's ID -sub get_feature_id { - my $self = shift; - my ($ref,$start,$stop,$typeid,$groupid) = @_; - my $s = $self->{load_stuff}; - unless ($s->{get_feature_id}) { - my $dbh = $self->features_db; - $s->{get_feature_id} = - $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? 
AND gid=?'); - } - my $sth = $s->{get_feature_id} or return; - $sth->execute($ref,$start,$stop,$typeid,$groupid) or return; - my ($fid) = $sth->fetchrow_array; - return $fid; -} - -sub _add_interval_stats_table { - my $self = shift; - my $schema = $self->schema; - my $create_table_stmt = $schema->{'finterval_stats'}{'table'}; - $create_table_stmt =~ s/create table/create table if not exists/i; - my $dbh = $self->features_db; - $dbh->do($create_table_stmt) || warn $dbh->errstr; -} - -sub disable_keys { - my $self = shift; - my $table = shift; - my $dbh = $self->dbh; - $dbh->do("alter table $table disable keys"); -} -sub enable_keys { - my $self = shift; - my $table = shift; - my $dbh = $self->dbh; - $dbh->do("alter table $table enable keys"); -} - - -1; - - - -__END__ - -=head1 BUGS - -none ;-) - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2002 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/mysqlcmap.pm b/lib/Bio/DB/GFF/Adaptor/dbi/mysqlcmap.pm deleted file mode 100644 index a977b6d96..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/mysqlcmap.pm +++ /dev/null @@ -1,1176 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::mysqlcmap; - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::mysqlcmap -- Database adaptor for an integraded -CMap/GBrowse mysql schema - -=head1 SYNOPSIS - -See L - -=cut - -# a simple mysql adaptor -use strict; -use Data::Dumper; -use Bio::DB::GFF::Adaptor::dbi; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::GFF::Util::Binning; -use base qw(Bio::DB::GFF::Adaptor::dbi::mysql); -require Bio::DB::GFF::Adaptor::dbi::mysql; - -use constant GETSEQCOORDS =><<<< < select * from cmap_feature where feature_name='sjj_2L52.1'; - +--------------+-------------+--------------+ - | feature_id | gclass | feature_name | - +--------------+-------------+--------------+ - | 69736 | PCR_product | sjj_2L52.1 | - +--------------+-------------+--------------+ - 1 row in set (0.70 sec) - - mysql> select fref,fstart,fstop from fdata,cmap_feature - where gclass='PCR_product' and feature_name = 'sjj_2L52.1' - and fdata.feature_id=cmap_feature.feature_id; - +---------------+--------+-------+ - | fref | fstart | fstop | - +---------------+--------+-------+ - | CHROMOSOME_II | 1586 | 2355 | - +---------------+--------+-------+ - 1 row in set (0.03 sec) - -=item ftype - -This table contains the feature types, one per row. Columns are: - - ftypeid the feature type ID (integer) - fmethod the feature type method name (string) - fsource the feature type source name (string) - -The ftype.ftypeid field joins with the fdata.ftypeid field. Example: - - mysql> select fref,fstart,fstop,fmethod,fsource from fdata,cmap_feature,ftype - where gclass='PCR_product' - and feature_name = 'sjj_2L52.1' - and fdata.feature_id=cmap_feature.feature_id - and fdata.ftypeid=ftype.ftypeid; - +---------------+--------+-------+-------------+-----------+ - | fref | fstart | fstop | fmethod | fsource | - +---------------+--------+-------+-------------+-----------+ - | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs | - +---------------+--------+-------+-------------+-----------+ - 1 row in set (0.08 sec) - -=item fdna - -This table holds the raw DNA of the reference sequences. 
It has three -columns: - - fref reference sequence name (string) - foffset offset of this sequence - fdna the DNA sequence (longblob) - -To overcome problems loading large blobs, DNA is automatically -fragmented into multiple segments when loading, and the position of -each segment is stored in foffset. The fragment size is controlled by -the -clump_size argument during initialization. - -=item fattribute_to_feature - -This table holds "attributes", which are tag/value pairs stuffed into -the GFF line. The first tag/value pair is treated as the group, and -anything else is treated as an attribute (weird, huh?). - - CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B" - CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B" - -The columns of this table are: - - fid feature ID (integer) - fattribute_id ID of the attribute (integer) - fattribute_value text of the attribute (text) - -The fdata.fid column joins with fattribute_to_feature.fid. - -=item fattribute - -This table holds the normalized names of the attributes. Fields are: - - fattribute_id ID of the attribute (integer) - fattribute_name Name of the attribute (varchar) - -=back - -=head2 Data Loading Methods - -In addition to implementing the abstract SQL-generating methods of -Bio::DB::GFF::Adaptor::dbi, this module also implements the data -loading functionality of Bio::DB::GFF. - -=cut - - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(@args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : see below - Status : Public - -The new constructor is identical to the "dbi" adaptor's new() method, -except that the prefix "dbi:mysql" is added to the database DSN identifier -automatically if it is not there already. - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' or "ens0040" - - -user username for authentication - - -pass the password for authentication - -=cut - -#' - -#Defined in mysql.pm - -=head2 get_dna - - Title : get_dna - Usage : $string = $db->get_dna($name,$start,$stop,$class) - Function: get DNA string - Returns : a string - Args : name, class, start and stop of desired segment - Status : Public - -This method performs the low-level fetch of a DNA substring given its -name, class and the desired range. This should probably be moved to -the parent class. - -=cut - -sub make_features_select_part { - my $self = shift; - my $options = shift || {}; - my $s; - if (my $b = $options->{bin_width}) { - - $s = <{attributes} && keys %{$options->{attributes}}>1; - $s; -} - - -# IMPORTANT NOTE: -# WHETHER OR NOT THIS WORKS IS CRITICALLY DEPENDENT ON THE RELATIVE MAGNITUDE OF THE -sub make_features_from_part { - my $self = shift; - my $sparse_types = shift; - my $options = shift || {}; - my $sparse_groups = $options->{sparse_groups}; - my $index = $sparse_groups ? ' USE INDEX(feature_id)' - : $sparse_types ? ' USE INDEX(ftypeid)' - : ''; - return $options->{attributes} ? "fdata${index},ftype,cmap_feature,fattribute,fattribute_to_feature\n" - : "fdata${index},ftype,cmap_feature\n"; -} - -################################ loading and initialization ################################## - -=head2 schema - - Title : schema - Usage : $schema = $db->schema - Function: return the CREATE script for the schema - Returns : a list of CREATE statemetns - Args : none - Status : protected - -This method returns a list containing the various CREATE statements -needed to initialize the database tables. 
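As an illustrative, hedged example (the database handle and the idea of
dumping each statement are assumptions, not part of the original POD), the
returned hashref can be inspected table by table:

  my $schema = $db->schema;

  # each entry holds its CREATE statement under the 'table' key
  for my $table (sort keys %$schema) {
      print "-- $table\n", $schema->{$table}{table}, "\n";
  }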
- -=cut - -sub schema { - my %schema = ( - fdata =>{ -table=> q{ -#create table fdata ( -# fid int not null auto_increment, -# fref varchar(100) not null, -# fstart int unsigned not null, -# fstop int unsigned not null, -# ftypeid int not null, -# fscore float, -# fstrand enum('+','-'), -# fphase enum('0','1','2'), -# feature_id int not null, -# ftarget_start int unsigned, -# ftarget_stop int unsigned, -# primary key(fid), -# unique index(fref,fstart,fstop,ftypeid,feature_id), -# index(ftypeid), -# index(feature_id) -#) type=MyISAM - - - create table fdata ( - fid int not null auto_increment, - fref varchar(100) not null, - fstart int unsigned not null, - fstop int unsigned not null, - fbin double(20,6) not null, - ftypeid int not null, - fscore float, - fstrand enum('+','-'), - fphase enum('0','1','2'), - feature_id int not null, - ftarget_start int unsigned, - ftarget_stop int unsigned, - primary key(fid), - unique index(fref,fbin,fstart,fstop,ftypeid,feature_id), - index(ftypeid), - index(feature_id) - ) type=MyISAM -} # fdata table -}, # fdata - - ftype => { -table=> q{ -create table ftype ( - ftypeid int not null auto_increment, - fmethod varchar(100) not null, - fsource varchar(100), - primary key(ftypeid), - index(fmethod), - index(fsource), - unique ftype (fmethod,fsource) -)type=MyISAM -} #ftype table -}, #ftype - - fdna => { -table=> q{ -create table fdna ( - fref varchar(100) not null, - foffset int(10) unsigned not null, - fdna longblob, - primary key(fref,foffset) -)type=MyISAM -} # fdna table -},#fdna - - fmeta => { -table=> q{ -create table fmeta ( - fname varchar(255) not null, - fvalue varchar(255) not null, - primary key(fname) -)type=MyISAM -} # fmeta table -},#fmeta - - fattribute => { -table=> q{ -create table fattribute ( - fattribute_id int(10) unsigned not null auto_increment, - fattribute_name varchar(255) not null, - primary key(fattribute_id) -)type=MyISAM -} #fattribute table -},#fattribute - - fattribute_to_feature => { -table=> q{ -create table fattribute_to_feature ( - fid int(10) not null, - fattribute_id int(10) not null, - fattribute_value text, - key(fid,fattribute_id), - key(fattribute_value(48)), - fulltext(fattribute_value) -)type=MyISAM -} # fattribute_to_feature table - }, # fattribute_to_feature - - -cmap_attribute => { -table=>q{ -create table cmap_attribute ( - attribute_id int(11) NOT NULL default '0', - table_name varchar(30) NOT NULL default '', - object_id int(11) NOT NULL default '0', - display_order int(11) NOT NULL default '1', - is_public tinyint(4) NOT NULL default '1', - attribute_name varchar(200) NOT NULL default '', - attribute_value text NOT NULL, - PRIMARY KEY (attribute_id), - KEY table_name (table_name,object_id,display_order,attribute_name) -) TYPE=MyISAM; -} # table -}, - -cmap_correspondence_evidence => { -table=>q{ -create table cmap_correspondence_evidence ( - correspondence_evidence_id int(11) NOT NULL default '0', - accession_id varchar(20) NOT NULL default '', - feature_correspondence_id int(11) NOT NULL default '0', - evidence_type_accession varchar(20) NOT NULL default '0', - score double(8,2) default NULL, - rank int(11) NOT NULL default '0', - PRIMARY KEY (correspondence_evidence_id), - UNIQUE KEY accession_id (accession_id), - KEY feature_correspondence_id (feature_correspondence_id) -) TYPE=MyISAM; -} # table -}, - - -cmap_correspondence_lookup => { -table=>q{ -create table cmap_correspondence_lookup ( - feature_id1 int(11) default NULL, - feature_id2 int(11) default NULL, - feature_correspondence_id int(11) default 
NULL, - start_position1 double(11,2) default NULL, - start_position2 double(11,2) default NULL, - stop_position1 double(11,2) default NULL, - stop_position2 double(11,2) default NULL, - map_id1 int(11) default NULL, - map_id2 int(11) default NULL, - feature_type_accession1 varchar(20) default NULL, - feature_type_accession2 varchar(20) default NULL, - KEY feature_id1 (feature_id1), - KEY corr_id (feature_correspondence_id), - KEY cl_map_id1 (map_id1), - KEY cl_map_id2 (map_id2), - KEY cl_map_id1_map_id2 (map_id1,map_id2), - KEY cl_map_id2_map_id1 (map_id2,map_id1) -) TYPE=MyISAM; -} # table -}, - - -cmap_correspondence_matrix => { -table=>q{ -create table cmap_correspondence_matrix ( - reference_map_aid varchar(20) NOT NULL default '0', - reference_map_name varchar(32) NOT NULL default '', - reference_map_set_aid varchar(20) NOT NULL default '0', - reference_species_aid varchar(20) NOT NULL default '0', - link_map_aid varchar(20) default NULL, - link_map_name varchar(32) default NULL, - link_map_set_aid varchar(20) NOT NULL default '0', - link_species_aid varchar(20) NOT NULL default '0', - no_correspondences int(11) NOT NULL default '0' -) TYPE=MyISAM; -} # table -}, - - -cmap_feature => { -table=>q{ -create table cmap_feature ( - feature_id int(11) NOT NULL default '0', - accession_id varchar(20) NOT NULL default '', - map_id int(11) default NULL, - feature_type_accession varchar(20) NOT NULL default '0', - feature_name varchar(32) NOT NULL default '', - is_landmark tinyint(4) NOT NULL default '0', - start_position double(11,2) NOT NULL default '0.00', - stop_position double(11,2) default NULL, - default_rank int(11) NOT NULL default '1', - direction tinyint(4) NOT NULL default '1', - gclass varchar(100) default NULL, - PRIMARY KEY (feature_id), - UNIQUE KEY gclass (gclass,feature_name), - UNIQUE KEY accession_id (accession_id), - KEY feature_name (feature_name), - KEY feature_id_map_id (feature_id,map_id), - KEY feature_id_map_id_start (feature_id,map_id,start_position), - KEY map_id (map_id), - KEY map_id_feature_id (map_id,feature_id) -) TYPE=MyISAM; -} # table -}, - - -cmap_feature_alias => { -table=>q{ -create table cmap_feature_alias ( - feature_alias_id int(11) NOT NULL default '0', - feature_id int(11) NOT NULL default '0', - alias varchar(255) default NULL, - PRIMARY KEY (feature_alias_id), - UNIQUE KEY feature_id_2 (feature_id,alias), - KEY feature_id (feature_id), - KEY alias (alias) -) TYPE=MyISAM; -} # table -}, - - -cmap_feature_correspondence => { -table=>q{ -create table cmap_feature_correspondence ( - feature_correspondence_id int(11) NOT NULL default '0', - accession_id varchar(20) NOT NULL default '', - feature_id1 int(11) NOT NULL default '0', - feature_id2 int(11) NOT NULL default '0', - is_enabled tinyint(4) NOT NULL default '1', - PRIMARY KEY (feature_correspondence_id), - UNIQUE KEY accession_id (accession_id), - KEY feature_id1 (feature_id1), - KEY cmap_feature_corresp_idx (is_enabled,feature_correspondence_id) -) TYPE=MyISAM; -} # table -}, - - -cmap_map => { -table=>q{ -create table cmap_map ( - map_id int(11) NOT NULL default '0', - accession_id varchar(20) NOT NULL default '', - map_set_id int(11) NOT NULL default '0', - map_name varchar(32) NOT NULL default '', - display_order int(11) NOT NULL default '1', - start_position double(11,2) default NULL, - stop_position double(11,2) default NULL, - PRIMARY KEY (map_id), - UNIQUE KEY accession_id (accession_id), - UNIQUE KEY map_id (map_id,map_set_id,map_name,accession_id), - KEY map_set_id_index (map_set_id) -) 
TYPE=MyISAM; -} # table -}, - - -cmap_map_set => { -table=>q{ -create table cmap_map_set ( - map_set_id int(11) NOT NULL default '0', - accession_id varchar(20) NOT NULL default '', - map_set_name varchar(64) NOT NULL default '', - short_name varchar(30) NOT NULL default '', - map_type_accession varchar(20) NOT NULL default '0', - species_id int(11) NOT NULL default '0', - published_on date default NULL, - can_be_reference_map tinyint(4) NOT NULL default '1', - display_order int(11) NOT NULL default '1', - is_enabled tinyint(4) NOT NULL default '1', - shape varchar(12) default NULL, - color varchar(20) default NULL, - width int(11) default NULL, - map_units varchar(12) NOT NULL default '', - is_relational_map tinyint(11) NOT NULL default '0', - PRIMARY KEY (map_set_id), - UNIQUE KEY accession_id (accession_id), - UNIQUE KEY map_set_id (map_set_id,species_id,short_name,accession_id), - KEY cmap_map_set_idx (can_be_reference_map,is_enabled,species_id,display_order,published_on,short_name) -) TYPE=MyISAM; -} # table -}, - - -cmap_next_number => { -table=>q{ -create table cmap_next_number ( - table_name varchar(40) NOT NULL default '', - next_number int(11) NOT NULL default '0', - PRIMARY KEY (table_name) -) TYPE=MyISAM; -}, # table -insert=>{next_num=>q[ insert into cmap_next_number (table_name,next_number) VALUES ('cmap_feature',82);]} -}, - - -cmap_species => { -table=>q{ -create table cmap_species ( - species_id int(11) NOT NULL default '0', - accession_id varchar(20) NOT NULL default '', - common_name varchar(64) NOT NULL default '', - full_name varchar(64) NOT NULL default '', - display_order int(11) NOT NULL default '1', - PRIMARY KEY (species_id), - KEY acc_id_species_id (accession_id,species_id) -) TYPE=MyISAM; -} # table -}, - - -cmap_xref => { -table=>q{ -create table cmap_xref ( - xref_id int(11) NOT NULL default '0', - table_name varchar(30) NOT NULL default '', - object_id int(11) default NULL, - display_order int(11) NOT NULL default '1', - xref_name varchar(200) NOT NULL default '', - xref_url text NOT NULL, - PRIMARY KEY (xref_id), - KEY table_name (table_name,object_id,display_order) -) TYPE=MyISAM; -} # table -}, - - -); - return \%schema; -} - - - -=head2 make_classes_query - - Title : make_classes_query - Usage : ($query,@args) = $db->make_classes_query - Function: return query fragment for generating list of reference classes - Returns : a query and args - Args : none - Status : public - -=cut - -sub make_classes_query { - my $self = shift; - return 'SELECT DISTINCT gclass FROM cmap_feature WHERE NOT ISNULL(gclass)'; -} - - -=head2 setup_load - - Title : setup_load - Usage : $db->setup_load - Function: called before load_gff_line() - Returns : void - Args : none - Status : protected - -This method performs schema-specific initialization prior to loading a -set of GFF records. It prepares a set of DBI statement handlers to be -used in loading the data. - -=cut - -sub setup_load { - my $self = shift; - - my $dbh = $self->features_db; - - if ($self->lock_on_load) { - my @tables = map { "$_ WRITE"} $self->tables; - my $tables = join ', ',@tables; - $dbh->do("LOCK TABLES $tables"); - } - -#xx1 - my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?'); - my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)'); - - my $lookup_group = $dbh->prepare_delayed('SELECT feature_id FROM cmap_feature WHERE feature_name=? 
AND gclass=?'); - my $insert_group = $dbh->prepare_delayed(' INSERT into cmap_feature (feature_id, accession_id,feature_name, gclass ) VALUES (?,feature_id,?,?)'); - my $aux_insert_group = $dbh->prepare_delayed(' update cmap_next_number set next_number = next_number +1 where table_name=\'cmap_feature\''); - my $next_id_group = $dbh->prepare_delayed('select next_number from cmap_next_number where table_name=\'cmap_feature\''); - - my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?'); - my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)'); - my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)'); - - my $insert_data = $dbh->prepare_delayed(<{load_stuff}{sth}{lookup_ftype} = $lookup_type; - $self->{load_stuff}{sth}{insert_ftype} = $insert_type; - #$self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group; - #$self->{load_stuff}{sth}{insert_fgroup} = $insert_group; - $self->{load_stuff}{sth}{lookup_cmap_feature} = $lookup_group; - $self->{load_stuff}{sth}{insert_cmap_feature} = $insert_group; - $self->{load_stuff}{sth}{aux_insert_cmap_feature} = $aux_insert_group; - $self->{load_stuff}{sth}{next_id_cmap_feature} = $next_id_group; - $self->{load_stuff}{sth}{insert_fdata} = $insert_data; - $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute; - $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute; - $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value; - $self->{load_stuff}{types} = {}; - $self->{load_stuff}{groups} = {}; - $self->{load_stuff}{counter} = 0; -} - -=head2 load_gff_line - - Title : load_gff_line - Usage : $db->load_gff_line($fields) - Function: called to load one parsed line of GFF - Returns : true if successfully inserted - Args : hashref containing GFF fields - Status : protected - -This method is called once per line of the GFF and passed a series of -parsed data items that are stored into the hashref $fields. 
The keys are: - - ref reference sequence - source annotation source - method annotation method - start annotation start - stop annotation stop - score annotation score (may be undef) - strand annotation strand (may be undef) - phase annotation phase (may be undef) - group_class class of annotation's group (may be undef) - group_name ID of annotation's group (may be undef) - target_start start of target of a similarity hit - target_stop stop of target of a similarity hit - attributes array reference of attributes, each of which is a [tag=>value] array ref - -=cut - -sub load_gff_line { - my $self = shift; - my $gff = shift; - - my $s = $self->{load_stuff}; - my $dbh = $self->features_db; - local $dbh->{PrintError} = 0; - - defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return; - defined(my $groupid = $self->get_table_id('cmap_feature',$gff->{gname} => $gff->{gclass})) or return; - - if ($gff->{stop}-$gff->{start}+1 > $self->max_bin) { - warn "$gff->{gclass}:$gff->{gname} is longer than ",$self->maxbin,".\n"; - warn "Please set the maxbin value to a larger length than the largest feature you wish to store.\n"; - warn "With the command-line tools you do with this with --maxfeature option.\n"; - } - - my $bin = bin($gff->{start},$gff->{stop},$self->min_bin); - my $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, - $gff->{start},$gff->{stop},$bin, - $typeid, - $gff->{score},$gff->{strand},$gff->{phase}, - $groupid, - $gff->{tstart},$gff->{tstop}); - - warn $dbh->errstr,"\n" && return unless $result; - - my $fid = $dbh->{mysql_insertid} - || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid); - - - # insert attributes - foreach (@{$gff->{attributes}}) { - defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return; - $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]); - } - - if ( (++$s->{counter} % 1000) == 0) { - print STDERR "$s->{counter} records loaded..."; - print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; - } - - $fid; -} - -=head2 get_feature_id - - Title : get_feature_id - Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid) - Function: get the ID of a feature - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature. It is ony needed when replacing a feature -with new information. - -=cut - -# this method is called when needed to look up a feature's ID -sub get_feature_id { - my $self = shift; - my ($ref,$start,$stop,$typeid,$groupid) = @_; - my $s = $self->{load_stuff}; - unless ($s->{get_feature_id}) { - my $dbh = $self->features_db; - $s->{get_feature_id} = - $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? AND feature_id=?'); - } - my $sth = $s->{get_feature_id} or return; - $sth->execute($ref,$start,$stop,$typeid,$groupid) or return; - my ($fid) = $sth->fetchrow_array; - return $fid; -} - -=head2 get_table_id - - Title : get_table_id - Usage : $integer = $db->get_table_id($table,@ids) - Function: get the ID of a group or type - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature type or group. The arguments are the name -of the table, and two string identifiers. For feature types, the -identifiers are the method and source. 
For groups, the identifiers -are group name and class. - -This method requires that a statement handler named I, -have been created previously by setup_load(). It is here to overcome -deficiencies in mysql's INSERT syntax. - -=cut - -#' -# get the object ID from a named table -sub get_table_id { - my $self = shift; - my $table = shift; - my @ids = @_; - - # irritating warning for null id - my $id_key; - { - local $^W=0; - $id_key = join ':',@ids; - } - - my $s = $self->{load_stuff}; - my $sth = $s->{sth}; - my $dbh = $self->features_db; - - unless (defined($s->{$table}{$id_key})) { - - ######################################### - # retrieval of the last inserted id is now located at the adaptor and not in caching_handle - ####################################### - if ( (my $result = $sth->{"lookup_$table"}->execute(@ids)) > 0) { - $s->{$table}{$id_key} = ($sth->{"lookup_$table"}->fetchrow_array)[0]; - } else { - if (defined($sth->{"next_id_$table"})){ - - $sth->{"insert_$table"}->execute(3,'string1','string2'); - # Can't use auto incrementing - $sth->{"next_id_$table"}->execute(); - $s->{$table}{$id_key} = ($sth->{"next_id_$table"}->fetchrow_array)[0]; - if ($s->{$table}{$id_key}){ - $sth->{"insert_$table"}->execute($s->{$table}{$id_key},@ids); - $sth->{"aux_insert_$table"}->execute() if $sth->{"aux_insert_$table"}; - } - } - else{ - $sth->{"insert_$table"}->execute(@ids); - $s->{$table}{$id_key} = $self->insertid($sth->{"insert_$table"}) unless $s->{$table}{$id_key}; - $sth->{"aux_insert_$table"}->execute() if $sth->{"aux_insert_$table"}; - } - } - } - - my $id = $s->{$table}{$id_key}; - unless (defined $id) { - warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n"; - return; - } - $id; -} - - - -#----------------------------------- - -=head2 make_features_by_name_where_part - - Title : make_features_by_name_where_part - Usage : $db->make_features_by_name_where_part - Function: create the SQL fragment needed to select a feature by its group name & class - Returns : a SQL fragment and bind arguments - Args : see below - Status : Protected - -=cut - -sub make_features_by_name_where_part { - my $self = shift; - my ($class,$name) = @_; - if ($name =~ /\*/) { - $name =~ tr/*/%/; - return ("cmap_feature.gclass=? AND cmap_feature.feature_name LIKE ?",$class,$name); - } else { - return ("cmap_feature.gclass=? AND cmap_feature.feature_name=?",$class,$name); - } -} - -=head2 make_features_join_part - - Title : make_features_join_part - Usage : $string = $db->make_features_join_part() - Function: make join part of the features query - Returns : a string - Args : none - Status : protected - -This method creates the part of the features query that immediately -follows the WHERE keyword. - -=cut - -sub make_features_join_part { - my $self = shift; - my $options = shift || {}; - return !$options->{attributes} ? <search_notes("full text search string",$limit) - Function: Search the notes for a text string, using mysql full-text search - Returns : array of results - Args : full text search string, and an optional row limit - Status : public - -This is a mysql-specific method. Given a search string, it performs a -full-text search of the notes table and returns an array of results. -Each row of the returned array is a arrayref containing the following fields: - - column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() - column 2 The text of the note - column 3 A relevance score. 
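A minimal usage sketch of the full-text search interface documented above, assuming a mysqlcmap-backed database; the adaptor string, DSN, search term and row limit are illustrative assumptions, not values taken from this module:

  use strict;
  use Bio::DB::GFF;

  # Hedged example: 'dbi::mysqlcmap' selects this adaptor and
  # 'dbi:mysql:testdb' is an assumed DSN for an already-loaded database.
  my $db = Bio::DB::GFF->new(
      -adaptor => 'dbi::mysqlcmap',
      -dsn     => 'dbi:mysql:testdb',
  );

  # search_notes() returns [$featname, $note, $relevance] triples,
  # i.e. the three columns described above; cap the results at 20 rows.
  for my $hit ( $db->search_notes('kinase', 20) ) {
      my ($featname, $note, $relevance) = @$hit;
      my $segment = $db->segment($featname);  # column 1 is documented as segment()-ready
      print "$note (relevance $relevance)\n";
  }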
- -=cut - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - $search_string =~ tr/*?//d; - - my @words = $search_string =~ /(\w+)/g; - my $regex = join '|',@words; - my @searches = map {"fattribute_value LIKE '%${_}%'"} @words; - my $search = join(' OR ',@searches); - - my $query = <dbh->do_query($query); - my @results; - while (my ($class,$name,$note) = $sth->fetchrow_array) { - next unless $class && $name; # sorry, ignore NULL objects - my @matches = $note =~ /($regex)/g; - my $relevance = 10*@matches; - my $featname = Bio::DB::GFF::Featname->new($class=>$name); - push @results,[$featname,$note,$relevance]; - last if $limit && @results >= $limit; - } - @results; -} - -# sub search_notes { -# my $self = shift; -# my ($search_string,$limit) = @_; -# my $query = FULLTEXTSEARCH; -# $query .= " limit $limit" if defined $limit; -# my $sth = $self->dbh->do_query($query,$search_string,$search_string); -# my @results; -# while (my ($class,$name,$note,$relevance) = $sth->fetchrow_array) { -# next unless $class && $name; # sorry, ignore NULL objects -# $relevance = sprintf("%.2f",$relevance); # trim long floats -# my $featname = Bio::DB::GFF::Featname->new($class=>$name); -# push @results,[$featname,$note,$relevance]; -# } -# @results; -# } - -=head2 make_features_order_by_part - - Title : make_features_order_by_part - Usage : ($query,@args) = $db->make_features_order_by_part() - Function: make the ORDER BY part of the features() query - Returns : a SQL fragment and bind arguments, if any - Args : none - Status : protected - -This method creates the part of the features query that immediately -follows the ORDER BY part of the query issued by features() and -related methods. - -=cut - -sub make_features_order_by_part { - my $self = shift; - my $options = shift || {}; - return "cmap_feature.feature_name"; -} - -=head2 create_cmap_viewer_link - - Title : create_cmap_viewer_link - Usage : $link_str = $db->create_cmap_viewer_link(data_source=>$ds,group_id=>$gid) - Function: - Returns : - Args : - Status : - - -=cut - -sub create_cmap_viewer_link { - my $self = shift; - my %args = @_; - my $data_source = $args{'data_source'}; - my $gid = $args{'group_id'}; - my $link_str = undef; - - my $db = $self->features_db; - my $sql_str = qq[ - select f.feature_name, - f.feature_type_accession feature_type_aid, - m.accession_id as map_aid, - ms.accession_id as map_set_aid - from cmap_feature f, - cmap_map m, - cmap_map_set ms - where f.map_id=m.map_id - and ms.map_set_id=m.map_set_id - and f.feature_id=$gid - ]; - - my $result_ref = $db->selectrow_hashref($sql_str,{ Columns => {} }); - - if ( $result_ref ) { - $link_str='/cgi-bin/cmap/viewer?ref_map_set_aid=' - . $result_ref->{'map_set_aid'} - . '&ref_map_aids=' - . $result_ref->{'map_aid'} - . '&data_source=' - . $data_source - . '&highlight=' - .$result_ref->{'feature_name'} - . '&feature_type_' - .$result_ref->{'feature_type_aid'} - . '=2'; - } - - return $link_str; -} - -1; - - -__END__ - -=head1 BUGS - -none ;-) - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Ben Faga Efaga@cshl.orgE. - -Modified from mysql.pm by: - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2002 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
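The create_cmap_viewer_link() method above is documented with empty Usage, Args and Returns fields; from its implementation it takes a CMap data source name plus a cmap_feature.feature_id and returns a /cgi-bin/cmap/viewer URL, or undef when the feature is not placed on a map. A hedged call sketch, with both argument values invented for illustration:

  # Hedged example: 'CMAP_DEMO' and 69736 are assumed values, not defaults
  # defined anywhere in this adaptor; $db is an adaptor instance as above.
  my $link = $db->create_cmap_viewer_link(
      data_source => 'CMAP_DEMO',
      group_id    => 69736,
  );
  print "CMap viewer: $link\n" if defined $link;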
- -=cut - diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm b/lib/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm deleted file mode 100644 index 1e846752f..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/mysqlopt.pm +++ /dev/null @@ -1,31 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::mysqlopt; - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::mysqlopt -- Deprecated database adaptor - -=head1 SYNOPSIS - -This adaptor has been superseded by Bio::DB::GFF::Adaptor::dbi::mysql. - -See L and L - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2002 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -use strict; -use base qw(Bio::DB::GFF::Adaptor::dbi::mysql); - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/oracle.pm b/lib/Bio/DB/GFF/Adaptor/dbi/oracle.pm deleted file mode 100644 index 0423e4da5..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/oracle.pm +++ /dev/null @@ -1,1029 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::oracle; - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::oracle -- Database adaptor for a specific oracle schema - -=head1 SYNOPSIS - -See L - -=cut - -# a simple oracle adaptor -use strict; -#use Bio::DB::GFF::Adaptor::dbi::mysql; -#use Bio::DB::GFF::Adaptor::dbi::mysqlopt; -use Bio::DB::GFF::Util::Binning; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use base qw(Bio::DB::GFF::Adaptor::dbi); - -use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get -use constant DEFAULT_CHUNK => 2000; - -use constant GETSEQCOORDS =><<<< 100_000_000; - -# this is the smallest bin (1 K) -use constant MIN_BIN => 1000; - -# size of range over which it is faster to force mysql to use the range for indexing -use constant STRAIGHT_JOIN_LIMIT => 200_000; - -############################################################################## - -=head1 DESCRIPTION - -This adaptor implements a specific oracle database schema that is -compatible with Bio::DB::GFF. It inherits from -Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF. - -The schema uses several tables: - -=over 4 - -=item fdata - -This is the feature data table. Its columns are: - - fid feature ID (integer) - fref reference sequence name (string) - fstart start position relative to reference (integer) - fstop stop position relative to reference (integer) - ftypeid feature type ID (integer) - fscore feature score (float); may be null - fstrand strand; one of "+" or "-"; may be null - fphase phase; one of 0, 1 or 2; may be null - gid group ID (integer) - ftarget_start for similarity features, the target start position (integer) - ftarget_stop for similarity features, the target stop position (integer) - -Note that it would be desirable to normalize the reference sequence -name, since there are usually many features that share the same -reference feature. However, in the current schema, query performance -suffers dramatically when this additional join is added. - -=item fgroup - -This is the group table. There is one row for each group. Columns: - - gid the group ID (integer) - gclass the class of the group (string) - gname the name of the group (string) - -The group table serves multiple purposes. As you might expect, it is -used to cluster features that logically belong together, such as the -multiple exons of the same transcript. It is also used to assign a -name and class to a singleton feature. Finally, the group table is -used to identify the target of a similarity hit. 
This is consistent -with the way in which the group field is used in the GFF version 2 -format. - -The fgroup.gid field joins with the fdata.gid field. - -Examples: - - sql> select * from fgroup where gname='sjj_2L52.1'; - +-------+-------------+------------+ - | gid | gclass | gname | - +-------+-------------+------------+ - | 69736 | PCR_product | sjj_2L52.1 | - +-------+-------------+------------+ - 1 row in set (0.70 sec) - - sql> select fref,fstart,fstop from fdata,fgroup - where gclass='PCR_product' and gname = 'sjj_2L52.1' - and fdata.gid=fgroup.gid; - +---------------+--------+-------+ - | fref | fstart | fstop | - +---------------+--------+-------+ - | CHROMOSOME_II | 1586 | 2355 | - +---------------+--------+-------+ - 1 row in set (0.03 sec) - -=item ftype - -This table contains the feature types, one per row. Columns are: - - ftypeid the feature type ID (integer) - fmethod the feature type method name (string) - fsource the feature type source name (string) - -The ftype.ftypeid field joins with the fdata.ftypeid field. Example: - - sql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype - where gclass='PCR_product' - and gname = 'sjj_2L52.1' - and fdata.gid=fgroup.gid - and fdata.ftypeid=ftype.ftypeid; - +---------------+--------+-------+-------------+-----------+ - | fref | fstart | fstop | fmethod | fsource | - +---------------+--------+-------+-------------+-----------+ - | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs | - +---------------+--------+-------+-------------+-----------+ - 1 row in set (0.08 sec) - -=item fdna - -This table holds the raw DNA of the reference sequences. It has three -columns: - - fref reference sequence name (string) - foffset offset of this sequence - fdna the DNA sequence (longblob) - -To overcome problems loading large blobs, DNA is automatically -fragmented into multiple segments when loading, and the position of -each segment is stored in foffset. The fragment size is controlled by -the -clump_size argument during initialization. - -=item fattribute_to_feature - -This table holds "attributes", which are tag/value pairs stuffed into -the GFF line. The first tag/value pair is treated as the group, and -anything else is treated as an attribute (weird, huh?). - - CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B" - CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B" - -The columns of this table are: - - fid feature ID (integer) - fattribute_id ID of the attribute (integer) - fattribute_value text of the attribute (text) - -The fdata.fid column joins with fattribute_to_feature.fid. - -=item fattribute - -This table holds the normalized names of the attributes. Fields are: - - fattribute_id ID of the attribute (integer) - fattribute_name Name of the attribute (varchar) - -=back - -=head2 Data Loading Methods - -In addition to implementing the abstract SQL-generating methods of -Bio::DB::GFF::Adaptor::dbi, this module also implements the data -loading functionality of Bio::DB::GFF. - -=cut - - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(@args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : see below - Status : Public - -The new constructor is identical to the "dbi" adaptor's new() method, -except that the prefix "dbi:oracle" is added to the database DSN identifier -automatically if it is not there already. - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 
'dbi:mysql:ens0040' or "ens0040" - - -user username for authentication - - -pass the password for authentication - -=cut - -#' - -sub new { - my $class = shift; - my ($dsn,$other) = rearrange([ - [qw(FEATUREDB DB DSN)], - ],@_); - $dsn = "dbi:Oracle:$dsn" if !ref($dsn) && $dsn !~ /^(dbi|DBI):/; - my $self = $class->SUPER::new(-dsn=>$dsn,%$other); - $self; -} - -=head2 schema - - Title : schema - Usage : $schema = $db->schema - Function: return the CREATE script for the schema - Returns : a list of CREATE statemetns - Args : none - Status : protected - -This method returns a list containing the various CREATE statements -needed to initialize the database tables. - -=cut - -sub schema { - my %schema = ( - fdata =>{ -table=> q{ -create table fdata ( - fid INTEGER NOT NULL, - fref VARCHAR(100) DEFAULT '' NOT NULL, - fstart INTEGER DEFAULT '0' NOT NULL, - fstop INTEGER DEFAULT '0' NOT NULL, - fbin NUMBER DEFAULT '0.000000' NOT NULL, - ftypeid INTEGER DEFAULT '0' NOT NULL, - fscore NUMBER , - fstrand VARCHAR2(3) CHECK (fstrand IN ('+','-')), - fphase VARCHAR2(3) CHECK (fphase IN ('0','1','2')), - gid INTEGER DEFAULT '0' NOT NULL, - ftarget_start INTEGER , - ftarget_stop INTEGER , - CONSTRAINT fdata_pk PRIMARY KEY (fid) -) -}, # fdata table - -index=>{ - fdata_fref_idx => q{ -CREATE UNIQUE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid) -}, - - fdata_ftypeid_idx => q{ -CREATE INDEX fdata_ftypeid_idx ON fdata (ftypeid) -}, - - fdata_gid_idx => q{ -CREATE INDEX fdata_gid_idx ON fdata (gid) -} - }, # fdata indexes - -sequence=> { - fdata_fid_sq => q{ -CREATE SEQUENCE fdata_fid_sq START WITH 1 -} - }, # fdata sequences - -trigger=> { - fdata_fid_ai => q{ -CREATE OR REPLACE TRIGGER fdata_fid_ai -BEFORE INSERT ON fdata -FOR EACH ROW WHEN (new.fid IS NULL OR new.fid = 0) -BEGIN - SELECT fdata_fid_sq.nextval INTO :new.fid FROM dual; -END; -} - }# fdata triggers - -}, # fdata - - - - fgroup => { -table => q{ -CREATE TABLE fgroup ( - gid INTEGER NOT NULL, - gclass VARCHAR(100) , - gname VARCHAR(100) , - CONSTRAINT fgroup_pk PRIMARY KEY (gid) -) -}, # fgroup table - -index => { - fgroup_gclass_idx => q{ -CREATE UNIQUE INDEX fgroup_gclass_idx ON fgroup (gclass,gname) -} - }, # fgroup indexes - -sequence => { - - fgroup_gid_sq => q{ -CREATE SEQUENCE fgroup_gid_sq START WITH 1 -} - }, # fgroup sequences - - -trigger => { - fgroup_gid_ai => q{ -CREATE OR REPLACE TRIGGER fgroup_gid_ai -BEFORE INSERT ON fgroup -FOR EACH ROW WHEN (new.gid IS NULL OR new.gid = 0) -BEGIN - SELECT fgroup_gid_sq.nextval INTO :new.gid FROM dual; -END; -} - } # fgroup triggers - -}, # fgroup - - ftype => { -table => q{ -CREATE TABLE ftype ( - ftypeid INTEGER NOT NULL, - fmethod VARCHAR(100) DEFAULT '' NOT NULL, - fsource VARCHAR(100), - CONSTRAINT ftype_pk PRIMARY KEY (ftypeid) -) -}, # ftype table - -index => { - ftype_fmethod_idx => q{ -CREATE INDEX ftype_fmethod_idx ON ftype (fmethod) -}, - - ftype_fsource_idx => q{ -CREATE INDEX ftype_fsource_idx ON ftype (fsource) -}, - - ftype_ftype_idx => q{ -CREATE UNIQUE INDEX ftype_ftype_idx ON ftype (fmethod,fsource) -} - }, # ftype indexes - -sequence => { - ftype_ftypeid_sq => q{ -CREATE SEQUENCE ftype_ftypeid_sq START WITH 1 -} - }, #ftype sequences - -trigger => { - ftype_ftypeid_ai => q{ -CREATE OR REPLACE TRIGGER ftype_ftypeid_ai -BEFORE INSERT ON ftype -FOR EACH ROW WHEN (new.ftypeid IS NULL OR new.ftypeid = 0) -BEGIN - SELECT ftype_ftypeid_sq.nextval INTO :new.ftypeid FROM dual; -END; -} - } #ftype triggers -}, # ftype - - - fdna => { -table => q{ -CREATE TABLE fdna 
( - fref VARCHAR(100) DEFAULT '' NOT NULL, - foffset INTEGER DEFAULT '0' NOT NULL, - fdna LONG /* LONGBLOB */ , - CONSTRAINT fdna_pk PRIMARY KEY (fref,foffset) -) -} #fdna table - }, #fdna - - fmeta => { -table => q{ -CREATE TABLE fmeta ( - fname VARCHAR(255) DEFAULT '' NOT NULL, - fvalue VARCHAR(255) DEFAULT '' NOT NULL, - CONSTRAINT fmeta_pk PRIMARY KEY (fname) -) -} # fmeta table - }, # fmeta - - - fattribute => { -table => q{ -CREATE TABLE fattribute ( - fattribute_id INTEGER NOT NULL, - fattribute_name VARCHAR(255) DEFAULT '' NOT NULL, - CONSTRAINT fattribute_pk PRIMARY KEY (fattribute_id) -) -}, # fattribute table - -sequence=> { - fattribute_fattribute_id_sq => q{ -CREATE SEQUENCE fattribute_fattribute_id_sq START WITH 1 -} - }, # fattribute sequences - -trigger => { - fattribute_fattribute_id_ai => q{ -CREATE OR REPLACE TRIGGER fattribute_fattribute_id_ai -BEFORE INSERT ON fattribute -FOR EACH ROW WHEN (new.fattribute_id IS NULL OR new.fattribute_id = 0) -BEGIN - SELECT fattribute_fattribute_id_sq.nextval INTO :new.fattribute_id FROM dual; -END; -} - } # fattribute triggers -}, # fattribute - - fattribute_to_feature => { -table => q{ -CREATE TABLE fattribute_to_feature ( - fid INTEGER DEFAULT '0' NOT NULL, - fattribute_id INTEGER DEFAULT '0' NOT NULL, - fattribute_value VARCHAR2(255) /* TEXT */ -) -}, # fattribute_to_feature table - -index => { - fattribute_to_feature_fid => q{ -CREATE INDEX fattribute_to_feature_fid ON fattribute_to_feature (fid,fattribute_id) -} - } # fattribute_to_feature indexes -}, # fattribute_to_feature - - finterval_stats => { -table=> q{ -CREATE TABLE "finterval_stats" ( - "ftypeid" integer DEFAULT '0' NOT NULL, - "fref" VARCHAR(100) DEFAULT '' NOT NULL, - "fbin" integer DEFAULT '0' NOT NULL, - "fcum_count" integer DEFAULT '0' NOT NULL, - CONSTRAINT finterval_stats_pk PRIMARY KEY (ftypeid,fref,fbin) -) -} # finterval_stats table -},# finterval_stats - -); - return \%schema; -} - - -=head2 do_initialize - - Title : do_initialize - Usage : $success = $db->do_initialize($drop_all) - Function: initialize the database - Returns : a boolean indicating the success of the operation - Args : a boolean indicating whether to delete existing data - Status : protected - -This method will load the schema into the database. If $drop_all is -true, then any existing data in the tables known to the schema will be -deleted. - -Internally, this method calls schema() to get the schema data. - -=cut - -# Create the schema from scratch. -# You will need create privileges for this. -#sub do_initialize { -# my $self = shift; -# my $erase = shift; -# $self->drop_all if $erase; - -# my $dbh = $self->features_db; -# my $schema = $self->schema; - -# foreach my $table_name(keys %$schema) { -# my $create_table_stmt = $$schema{$table_name}{table} ; -# $dbh->do($create_table_stmt) || warn $dbh->errstr; -# } -# 1; -#} - - - -=head2 drop_all - - Title : drop_all - Usage : $db->drop_all - Function: empty the database - Returns : void - Args : none - Status : protected - -This method drops the tables known to this module. Internally it -calls the abstract tables() method. - -=cut - -# Drop all the GFF tables -- dangerous! 
-#sub drop_all { -# my $self = shift; -# my $dbh = $self->features_db; -# local $dbh->{PrintError} = 0; -# foreach ($self->tables) { -# $dbh->do("drop table $_"); -# } -#} - - - - - - -=head2 setup_load - - Title : setup_load - Usage : $db->setup_load - Function: called before load_gff_line() - Returns : void - Args : none - Status : protected - -This method performs schema-specific initialization prior to loading a -set of GFF records. It prepares a set of DBI statement handlers to be -used in loading the data. - -=cut - -sub setup_load { - my $self = shift; - my $schema = $self->schema; - - my $dbh = $self->features_db; - - if ($self->lock_on_load) { - my @tables = map { "$_ WRITE"} $self->tables; - my $tables = join ', ',@tables; - $dbh->do("LOCK TABLES $tables"); - } - - my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?'); - my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)'); - my $sequence_type = (keys %{$schema->{ftype}{sequence}})[0]; - my $insertid_type = $dbh->prepare_delayed("SELECT $sequence_type.CURRVAL FROM dual"); - - my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE gname=? AND gclass=?'); - my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)'); - my $sequence_group = (keys %{$schema->{fgroup}{sequence}})[0]; - my $insertid_group = $dbh->prepare_delayed("SELECT $sequence_group.CURRVAL FROM dual"); - - my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?'); - my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)'); - my $sequence_attribute = (keys %{$schema->{fattribute}{sequence}})[0]; - my $insertid_attribute = $dbh->prepare_delayed("SELECT $sequence_attribute.CURRVAL FROM dual"); - - my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)'); - - my $insert_data = $dbh->prepare_delayed(<prepare_delayed('DELETE FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND fbin=? AND ftypeid=? 
AND GID=?'); - my $sequence_data = (keys %{$schema->{fdata}{sequence}})[0]; - my $insertid_data = $dbh->prepare_delayed("SELECT $sequence_data.CURRVAL FROM dual"); - - - - $self->{load_stuff}{sth}{lookup_ftype} = $lookup_type; - $self->{load_stuff}{sth}{insert_ftype} = $insert_type; - $self->{load_stuff}{sth}{insertid_ftype} = $insertid_type; - $self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group; - $self->{load_stuff}{sth}{insert_fgroup} = $insert_group; - $self->{load_stuff}{sth}{insertid_fgroup} = $insertid_group; - $self->{load_stuff}{sth}{insert_fdata} = $insert_data; - $self->{load_stuff}{sth}{insertid_fdata} = $insertid_data; - $self->{load_stuff}{sth}{delete_existing_fdata} = $delete_existing_data; - $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute; - $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute; - $self->{load_stuff}{sth}{insertid_fattribute} = $insertid_attribute; - $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value; - $self->{load_stuff}{types} = {}; - $self->{load_stuff}{groups} = {}; - $self->{load_stuff}{counter} = 0; -} - -=head2 load_gff_line - - Title : load_gff_line - Usage : $db->load_gff_line($fields) - Function: called to load one parsed line of GFF - Returns : true if successfully inserted - Args : hashref containing GFF fields - Status : protected - -This method is called once per line of the GFF and passed a series of -parsed data items that are stored into the hashref $fields. The keys are: - - ref reference sequence - source annotation source - method annotation method - start annotation start - stop annotation stop - score annotation score (may be undef) - strand annotation strand (may be undef) - phase annotation phase (may be undef) - group_class class of annotation's group (may be undef) - group_name ID of annotation's group (may be undef) - target_start start of target of a similarity hit - target_stop stop of target of a similarity hit - attributes array reference of attributes, each of which is a [tag=>value] array ref - -=cut - -sub load_gff_line { - my $self = shift; - my $gff = shift; - - if (defined $gff->{phase}){ - chomp($gff->{phase}); - undef($gff->{phase}) if $gff->{phase} eq '.'; - } - - if (defined $gff->{strand} && $gff->{strand} eq '.'){undef($gff->{strand})}; - if (defined $gff->{score} && $gff->{score} eq '.'){undef($gff->{score})}; - - my $s = $self->{load_stuff}; - my $dbh = $self->features_db; - local $dbh->{PrintError} = 0; - - defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return; - defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname} => $gff->{gclass})) or return; - - my $bin = bin($gff->{start},$gff->{stop},$self->min_bin); - my $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, - $gff->{start},$gff->{stop},$bin, - $typeid, - $gff->{score},$gff->{strand},$gff->{phase}, - $groupid, - $gff->{tstart},$gff->{tstop}); - if (defined ($dbh->errstr)){ - print $dbh->errstr,"\n" ,%$gff,"\n"; - if ($dbh->errstr =~ /ORA-02290: check constraint/){ - print "PHASE=$gff->{phase}"."===","\n"; - } - - if ($dbh->errstr =~ /ORA-00001: unique constraint/){ - $result = $s->{sth}{delete_existing_fdata}->execute($gff->{ref}, - $gff->{start},$gff->{stop},$bin, - $typeid, - $groupid); - - print "delete row result=$result\n"; - $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, - $gff->{start},$gff->{stop},$bin, - $typeid, - $gff->{score},$gff->{strand},$gff->{phase}, - $groupid, - $gff->{tstart},$gff->{tstop}); - - print "insert row 
result=$result\n"; - } - } - warn $dbh->errstr,"\n" and print "ref=",$gff->{ref}," start=",$gff->{start}," stop=",$gff->{stop}," bin=",$bin," typeid=",$typeid," groupid=",$groupid,"\n" - and return unless $result; - - my $fid = $self->insertid($s->{sth},'fdata') - || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid); - - - # insert attributes - - # print STDERR map {"$fid attribute:". $_->[0]."=".$_->[1]."\n"} @{$gff->{attributes}}; - - foreach (@{$gff->{attributes}}) { - defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return; - $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]); - } - - if ( (++$s->{counter} % 1000) == 0) { - print STDERR "$s->{counter} records loaded..."; - print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n"; - } - - $fid; -} - - - - -=head2 get_table_id - - Title : get_table_id - Usage : $integer = $db->get_table_id($table,@ids) - Function: get the ID of a group or type - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature type or group. The arguments are the name -of the table, and two string identifiers. For feature types, the -identifiers are the method and source. For groups, the identifiers -are group name and class. - -This method requires that a statement handler named I, -have been created previously by setup_load(). It is here to overcome -deficiencies in mysql's INSERT syntax. - -=cut - -#' -# get the object ID from a named table -sub get_table_id { - my $self = shift; - my $table = shift; - my @ids = @_; - - # irritating warning for null id - my $id_key; - { - local $^W=0; - $id_key = join ':',@ids; - } - - my $s = $self->{load_stuff}; - my $sth = $s->{sth}; - my $dbh = $self->features_db; - - unless (defined($s->{$table}{$id_key})) { - $sth->{"lookup_$table"}->execute(@ids); - my @result = $sth->{"lookup_$table"}->fetchrow_array; - if (@result > 0) { - $s->{$table}{$id_key} = $result[0]; - } else { - $sth->{"insert_$table"}->execute(@ids) - && ($s->{$table}{$id_key} = $self->insertid($sth,$table)); - #&& ($s->{$table}{$id_key} = $self->insertid($sth->{"insertid_$table"})); - #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid); - } - } - - my $id = $s->{$table}{$id_key}; - unless (defined $id) { - warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n"; - return; - } - $id; -} - -sub insertid { - my $self = shift; - my $sth = shift ; - my $table = shift; - - my $insert_id; - if ($sth->{"insertid_$table"}->execute()){ - $insert_id = ($sth->{"insertid_$table"}->fetchrow_array)[0]; - } - else{ - warn "No CURRVAL for SEQUENCE of table $table ",$sth->errstr,"\n"; - return; - } - return $insert_id; -} - - -#sub insertid { -# my $self = shift; -# my $insertid_sth = shift ; -# my $insert_id; -# if ($insertid_sth->execute){ -# $insert_id = ($insertid_sth->fetchrow_array)[0]; -# } -# else{ -# warn "No CURRVAL for SEQUENCE ",$insertid_sth->errstr,"\n"; -# return; -# } -# return $insert_id; -#} - -sub insert_sequence { - my $self = shift; - my($id,$offset,$seq) = @_; - my $sth = $self->{_insert_sequence} - ||= $self->dbh->prepare_delayed('insert into fdna values (?,?,?)'); - $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr); -} - -=head2 search_notes - - Title : search_notes - Usage : @search_results = $db->search_notes("full text search string",$limit) - Function: Search the notes for a text string, using mysql full-text search - Returns : array of 
results - Args : full text search string, and an optional row limit - Status : public - -This is a mysql-specific method. Given a search string, it performs a -full-text search of the notes table and returns an array of results. -Each row of the returned array is a arrayref containing the following fields: - - column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() - column 2 The text of the note - column 3 A relevance score. - column 4 A Bio::DB::GFF::Typename object - -=cut - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - $search_string =~ tr/*?//d; - - my @words = $search_string =~ /(\w+)/g; - my $regex = join '|',@words; - my @searches = map {"fattribute_value LIKE '%${_}%'"} @words; - my $search = join(' OR ',@searches); - - my $query = <dbh->do_query($query); - my @results; - while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) { - next unless $class && $name; # sorry, ignore NULL objects - my @matches = $note =~ /($regex)/g; - my $relevance = 10*@matches; - my $featname = Bio::DB::GFF::Featname->new($class=>$name); - my $type = Bio::DB::GFF::Typename->new($method,$source); - push @results,[$featname,$note,$relevance,$type]; - last if $limit && @results >= $limit; - } - @results; -} - -=head2 make_meta_set_query - - Title : make_meta_set_query - Usage : $sql = $db->make_meta_set_query - Function: return SQL fragment for setting a meta parameter - Returns : SQL fragment - Args : none - Status : public - -By default this does nothing; meta parameters are not stored or -retrieved. - -=cut - -sub make_meta_set_query { - return 'INSERT INTO fmeta VALUES (?,?)'; -} - -sub make_classes_query { - my $self = shift; - return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT gclass IS NULL'; -} - - -sub chunk_size { - my $self = shift; - $self->meta('chunk_size') || DEFAULT_CHUNK; -} - -sub getseqcoords_query { - my $self = shift; - return GETSEQCOORDS ; -} - -sub getaliascoords_query{ - my $self = shift; - return GETALIASCOORDS ; -} - - -sub getforcedseqcoords_query{ - my $self = shift; - return GETFORCEDSEQCOORDS ; -} - - -sub getaliaslike_query{ - my $self = shift; - return GETALIASLIKE ; -} - - -sub make_features_select_part { - my $self = shift; - my $options = shift || {}; - my $s; - if (my $b = $options->{bin_width}) { - - $s = <{attributes} && keys %{$options->{attributes}}>1; - $s; -} - -sub make_features_from_part_bkup { - my $self = shift; - my $sparse = shift; - my $options = shift || {}; - #my $index = $sparse ? ' USE INDEX(ftypeid)': ''; - my $index = ''; - return $options->{attributes} ? 
"fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n" - : "fdata${index},ftype,fgroup\n"; -} - - -#################################### -# moved from mysqlopt.pm -################################### -# meta values -sub default_meta_values { - my $self = shift; - my @values = $self->SUPER::default_meta_values; - return ( - @values, - max_bin => MAX_BIN, - min_bin => MIN_BIN, - straight_join_limit => STRAIGHT_JOIN_LIMIT, - ); -} - -sub min_bin { - my $self = shift; - return $self->meta('min_bin') || MIN_BIN; -} -sub max_bin { - my $self = shift; - return $self->meta('max_bin') || MAX_BIN; -} -sub straight_join_limit { - my $self = shift; - return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT; -} - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/pg.pm b/lib/Bio/DB/GFF/Adaptor/dbi/pg.pm deleted file mode 100644 index d1784409f..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/pg.pm +++ /dev/null @@ -1,1390 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::pg; - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::pg -- Database adaptor for a specific postgres schema - -=head1 NOTES - -SQL commands that need to be executed before this adaptor will work: - - CREATE DATABASE ; - -Also, select permission needs to be granted for each table in the -database to the owner of the httpd process (usually 'nobody', but -for some RedHat systems it is 'apache') if this adaptor is to be used -with the Generic Genome Browser (gbrowse): - - CREATE USER nobody; - GRANT SELECT ON TABLE fmeta TO nobody; - GRANT SELECT ON TABLE fgroup TO nobody; - GRANT SELECT ON TABLE fdata TO nobody; - GRANT SELECT ON TABLE fattribute_to_feature TO nobody; - GRANT SELECT ON TABLE fdna TO nobody; - GRANT SELECT ON TABLE fattribute TO nobody; - GRANT SELECT ON TABLE ftype TO nobody; - -=head2 Optimizing the database - -PostgreSQL generally requires some tuning before you get very good -performance for large databases. For general information on tuning -a PostgreSQL server, see http://www.varlena.com/GeneralBits/Tidbits/perf.html -Of particular importance is executing VACUUM FULL ANALYZE whenever -you change the database. - -Additionally, for a GFF database, there are a few items you can tune. -For each automatic class in your GBrowse conf file, there will be one -or two searches done when searching for a feature. If there are lots -of features, these search can take several seconds. To speed these searches, -do two things: - -=over - -=item 1 - -Set 'enable_seqscan = false' in your postgresql.conf file (and restart -your server). - -=item 2 - -Create 'partial' indexes for each automatic class, doing this for the -example class 'Allele': - - CREATE INDEX partial_allele_gclass ON - fgroup (lower('gname')) WHERE gclass='Allele'; - -And be sure to run VACUUM FULL ANALYZE after creating the indexes. 
- -=back - -=cut - -# a simple postgres adaptor -use strict; -use Bio::DB::GFF::Util::Binning; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use base qw(Bio::DB::GFF::Adaptor::dbi); - -use constant MAX_SEGMENT => 100_000_000; # the largest a segment can get -use constant DEFAULT_CHUNK => 2000; - -use constant GETSEQCOORDS =><<<< < 100_000_000; - -# this is the smallest bin (1 K) -use constant MIN_BIN => 1000; - -# size of range over which it is faster to force mysql to use the range for indexing -use constant STRAIGHT_JOIN_LIMIT => 200_000; - -############################################################################## - -=head1 DESCRIPTION - -This adaptor implements a specific postgres database schema that is -compatible with Bio::DB::GFF. It inherits from -Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF. - -The schema uses several tables: - -=over 4 - -=item fdata - -This is the feature data table. Its columns are: - - fid feature ID (integer) - fref reference sequence name (string) - fstart start position relative to reference (integer) - fstop stop position relative to reference (integer) - ftypeid feature type ID (integer) - fscore feature score (float); may be null - fstrand strand; one of "+" or "-"; may be null - fphase phase; one of 0, 1 or 2; may be null - gid group ID (integer) - ftarget_start for similarity features, the target start position (integer) - ftarget_stop for similarity features, the target stop position (integer) - -Note that it would be desirable to normalize the reference sequence -name, since there are usually many features that share the same -reference feature. However, in the current schema, query performance -suffers dramatically when this additional join is added. - -=item fgroup - -This is the group table. There is one row for each group. Columns: - - gid the group ID (integer) - gclass the class of the group (string) - gname the name of the group (string) - -The group table serves multiple purposes. As you might expect, it is -used to cluster features that logically belong together, such as the -multiple exons of the same transcript. It is also used to assign a -name and class to a singleton feature. Finally, the group table is -used to identify the target of a similarity hit. This is consistent -with the way in which the group field is used in the GFF version 2 -format. - -The fgroup.gid field joins with the fdata.gid field. - -Examples: - - sql> select * from fgroup where gname='sjj_2L52.1'; - +-------+-------------+------------+ - | gid | gclass | gname | - +-------+-------------+------------+ - | 69736 | PCR_product | sjj_2L52.1 | - +-------+-------------+------------+ - 1 row in set (0.70 sec) - - sql> select fref,fstart,fstop from fdata,fgroup - where gclass='PCR_product' and gname = 'sjj_2L52.1' - and fdata.gid=fgroup.gid; - +---------------+--------+-------+ - | fref | fstart | fstop | - +---------------+--------+-------+ - | CHROMOSOME_II | 1586 | 2355 | - +---------------+--------+-------+ - 1 row in set (0.03 sec) - -=item ftype - -This table contains the feature types, one per row. Columns are: - - ftypeid the feature type ID (integer) - fmethod the feature type method name (string) - fsource the feature type source name (string) - -The ftype.ftypeid field joins with the fdata.ftypeid field. 
Example: - - sql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype - where gclass='PCR_product' - and gname = 'sjj_2L52.1' - and fdata.gid=fgroup.gid - and fdata.ftypeid=ftype.ftypeid; - +---------------+--------+-------+-------------+-----------+ - | fref | fstart | fstop | fmethod | fsource | - +---------------+--------+-------+-------------+-----------+ - | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs | - +---------------+--------+-------+-------------+-----------+ - 1 row in set (0.08 sec) - -=item fdna - -This table holds the raw DNA of the reference sequences. It has three -columns: - - fref reference sequence name (string) - foffset offset of this sequence - fdna the DNA sequence (longblob) - -To overcome problems loading large blobs, DNA is automatically -fragmented into multiple segments when loading, and the position of -each segment is stored in foffset. The fragment size is controlled by -the -clump_size argument during initialization. - -=item fattribute_to_feature - -This table holds "attributes", which are tag/value pairs stuffed into -the GFF line. The first tag/value pair is treated as the group, and -anything else is treated as an attribute (weird, huh?). - - CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B" - CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B" - -The columns of this table are: - - fid feature ID (integer) - fattribute_id ID of the attribute (integer) - fattribute_value text of the attribute (text) - -The fdata.fid column joins with fattribute_to_feature.fid. - -=item fattribute - -This table holds the normalized names of the attributes. Fields are: - - fattribute_id ID of the attribute (integer) - fattribute_name Name of the attribute (varchar) - -=back - -=head2 Data Loading Methods - -In addition to implementing the abstract SQL-generating methods of -Bio::DB::GFF::Adaptor::dbi, this module also implements the data -loading functionality of Bio::DB::GFF. - -=cut - - -=head2 new - - Title : new - Usage : $db = Bio::DB::GFF->new(@args) - Function: create a new adaptor - Returns : a Bio::DB::GFF object - Args : see below - Status : Public - -The new constructor is identical to the "dbi" adaptor's new() method, -except that the prefix "dbi:pg" is added to the database DSN identifier -automatically if it is not there already. - - Argument Description - -------- ----------- - - -dsn the DBI data source, e.g. 'dbi:Pg:dbname=:ens0040' or "ens0040" - - -user username for authentication - - -pass the password for authentication - -=cut - -#' - -sub new { - my $class = shift; - my ($dsn,$other) = rearrange([ - [qw(FEATUREDB DB DSN)], - ],@_); - $dsn = "dbi:Pg:dbname=$dsn" if !ref($dsn) && $dsn !~ /^(dbi|DBI):/; - my $self = $class->SUPER::new(-dsn=>$dsn,%$other); - $self; -} - -=head2 schema - - Title : schema - Usage : $schema = $db->schema - Function: return the CREATE script for the schema - Returns : a list of CREATE statemetns - Args : none - Status : protected - -This method returns a list containing the various CREATE statements -needed to initialize the database tables. 
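Like the oracle adaptor above, schema() here returns a hashref keyed by table name, with the CREATE TABLE statement under a 'table' sub-key and any secondary indexes under 'index'. A minimal sketch of walking that structure with DBI, modelled on the commented-out do_initialize() shown in the oracle adaptor; the $db handle and the warn-and-continue error handling are illustrative assumptions:

  # Hedged sketch: $db is assumed to be an already-constructed Bio::DB::GFF
  # object using this adaptor; features_db() returns its DBI handle, as used
  # throughout this module.
  my $dbh    = $db->features_db;
  my $schema = $db->schema;

  for my $table ( keys %$schema ) {
      # create the table itself
      $dbh->do( $schema->{$table}{table} ) or warn $dbh->errstr;

      # then any secondary indexes declared for that table
      my $indexes = $schema->{$table}{index} || {};
      for my $name ( keys %$indexes ) {
          $dbh->do( $indexes->{$name} ) or warn $dbh->errstr;
      }
  }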
- -=cut - -sub schema { - my %schema = ( - fdata =>{ -table=> q{ -CREATE TABLE "fdata" ( - "fid" serial NOT NULL, - "fref" character varying(100) DEFAULT '' NOT NULL, - "fstart" integer DEFAULT '0' NOT NULL, - "fstop" integer DEFAULT '0' NOT NULL, - "fbin" double precision DEFAULT '0.000000' NOT NULL, - "ftypeid" integer DEFAULT '0' NOT NULL, - "fscore" double precision DEFAULT NULL, - "fstrand" character varying(3) DEFAULT NULL, - "fphase" character varying(3) DEFAULT NULL, - "gid" integer DEFAULT '0' NOT NULL, - "ftarget_start" integer DEFAULT NULL, - "ftarget_stop" integer DEFAULT NULL, - CONSTRAINT chk_fdata_fstrand CHECK (fstrand IN ('+','-')), - CONSTRAINT chk_fdata_fphase CHECK (fphase IN ('0','1','2')), - CONSTRAINT pk_fdata PRIMARY KEY (fid) -) -}, # fdata table - -#CONSTRAINT fref_fdata UNIQUE (fref, fbin, fstart, fstop, ftypeid, gid) -# fdata_fref_idx => q{ CREATE UNIQUE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid)}, - -index=>{ - fdata_fref_idx => q{ -CREATE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid) -}, - - fdata_ftypeid_idx => q{ -CREATE INDEX fdata_ftypeid_idx ON fdata (ftypeid) -}, - - fdata_gid_idx => q{ -CREATE INDEX fdata_gid_idx ON fdata (gid) -} - }, # fdata indexes - -}, # fdata - - - - fgroup => { -table => q{ -CREATE TABLE "fgroup" ( - "gid" serial NOT NULL, - "gclass" character varying(100) DEFAULT NULL, - "gname" character varying(100) DEFAULT NULL, - CONSTRAINT pk_fgroup PRIMARY KEY (gid) -) -}, # fgroup table - -index => { - fgroup_gclass_idx => q{ -CREATE UNIQUE INDEX fgroup_gclass_idx ON fgroup (gclass,gname) -}, - fgroup_gname_idx => q{ -CREATE INDEX fgroup_gname_idx ON fgroup(gname) -}, - fgroup_lower_gname_idx => q{ -CREATE INDEX fgroup_lower_gname_idx ON fgroup (lower(gname)) -}, - }, # fgroup indexes - -}, # fgroup - - ftype => { -table => q{ -CREATE TABLE "ftype" ( - "ftypeid" serial NOT NULL, - "fmethod" character varying(100) DEFAULT '' NOT NULL, - "fsource" character varying(100) DEFAULT NULL, - CONSTRAINT pk_ftype PRIMARY KEY (ftypeid), - CONSTRAINT ftype_ftype UNIQUE (fmethod, fsource) -) -}, # ftype table - -index => { - ftype_fmethod_idx => q{ -CREATE INDEX ftype_fmethod_idx ON ftype (fmethod) -}, - - ftype_fsource_idx => q{ -CREATE INDEX ftype_fsource_idx ON ftype (fsource) -}, - - ftype_ftype_idx => q{ -CREATE UNIQUE INDEX ftype_ftype_idx ON ftype (fmethod,fsource) -} - }, # ftype indexes - -}, # ftype - - - fdna => { -table => q{ -CREATE TABLE "fdna" ( - "fref" character varying(100) DEFAULT '' NOT NULL, - "foffset" integer DEFAULT '0' NOT NULL, - "fdna" bytea, - CONSTRAINT pk_fdna PRIMARY KEY (fref, foffset) -) -} #fdna table - }, #fdna - - fmeta => { -table => q{ -CREATE TABLE "fmeta" ( - "fname" character varying(255) DEFAULT '' NOT NULL, - "fvalue" character varying(255) DEFAULT '' NOT NULL, - CONSTRAINT pk_fmeta PRIMARY KEY (fname) -) -} # fmeta table - }, # fmeta - - - fattribute => { -table => q{ -CREATE TABLE "fattribute" ( - "fattribute_id" serial NOT NULL, - "fattribute_name" character varying(255) DEFAULT '' NOT NULL, - CONSTRAINT pk_fattribute PRIMARY KEY (fattribute_id) -) -}, # fattribute table - -}, # fattribute - - fattribute_to_feature => { -table => q{ -CREATE TABLE "fattribute_to_feature" ( - "fid" integer DEFAULT '0' NOT NULL, - "fattribute_id" integer DEFAULT '0' NOT NULL, - "fattribute_value" text -) -}, # fattribute_to_feature table - -index => { - fattribute_to_feature_fid => q{ -CREATE INDEX fattribute_to_feature_fid ON fattribute_to_feature (fid,fattribute_id) -}, - 
fattribute_txt_idx => q{ -CREATE INDEX fattribute_txt_idx ON fattribute_to_feature (fattribute_value) -}, - fattribute_lower_idx => q{ -CREATE INDEX fattribute_lower_idx ON fattribute_to_feature (lower(fattribute_value)) -}, - } # fattribute_to_feature indexes -}, # fattribute_to_feature - - finterval_stats => { -table=> q{ -CREATE TABLE "finterval_stats" ( - "ftypeid" integer DEFAULT '0' NOT NULL, - "fref" character varying(100) DEFAULT '' NOT NULL, - "fbin" integer DEFAULT '0' NOT NULL, - "fcum_count" integer DEFAULT '0' NOT NULL, - CONSTRAINT pk_finterval_stats PRIMARY KEY (ftypeid,fref,fbin) -) -} # finterval_stats table -},# finterval_stats - - - - -); - return \%schema; -} - - -=head2 setup_load - - Title : setup_load - Usage : $db->setup_load - Function: called before load_gff_line() - Returns : void - Args : none - Status : protected - -This method performs schema-specific initialization prior to loading a -set of GFF records. It prepares a set of DBI statement handlers to be -used in loading the data. - -=cut - -sub setup_load { - my $self = shift; - my $schema = $self->schema; - - my $dbh = $self->features_db; - - if ($self->lock_on_load) { - my @tables = map { "$_ WRITE"} $self->tables; - my $tables = join ', ',@tables; - $dbh->do("LOCK TABLES $tables"); - } - - my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?'); - my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)'); - my $insertid_type = $dbh->prepare_delayed("SELECT currval('ftype_ftypeid_seq')"); - - my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE lower(gname)=lower(?) AND gclass=?'); - my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)'); - my $insertid_group = $dbh->prepare_delayed("SELECT currval('fgroup_gid_seq')"); - - my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?'); - my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)'); - my $insertid_attribute = $dbh->prepare_delayed("SELECT currval('fattribute_fattribute_id_seq')"); - - my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)'); - - my $insert_data = $dbh->prepare_delayed(<prepare_delayed('DELETE FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND fbin=? AND ftypeid=? 
AND GID=?'); - my $insertid_data = $dbh->prepare_delayed("SELECT currval('fdata_fid_seq')"); - - $self->{load_stuff}{sth}{lookup_ftype} = $lookup_type; - $self->{load_stuff}{sth}{insert_ftype} = $insert_type; - $self->{load_stuff}{sth}{insertid_ftype} = $insertid_type; - $self->{load_stuff}{sth}{lookup_fgroup} = $lookup_group; - $self->{load_stuff}{sth}{insert_fgroup} = $insert_group; - $self->{load_stuff}{sth}{insertid_fgroup} = $insertid_group; - $self->{load_stuff}{sth}{insertid_fdata} = $insertid_data; - $self->{load_stuff}{sth}{insert_fdata} = $insert_data; - $self->{load_stuff}{sth}{delete_existing_fdata} = $delete_existing_data; - $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute; - $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute; - $self->{load_stuff}{sth}{insertid_fattribute} = $insertid_attribute; - $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value; - $self->{load_stuff}{types} = {}; - $self->{load_stuff}{groups} = {}; - $self->{load_stuff}{counter} = 0; -} - -=head2 load_gff_line - - Title : load_gff_line - Usage : $db->load_gff_line($fields) - Function: called to load one parsed line of GFF - Returns : true if successfully inserted - Args : hashref containing GFF fields - Status : protected - -This method is called once per line of the GFF and passed a series of -parsed data items that are stored into the hashref $fields. The keys are: - - ref reference sequence - source annotation source - method annotation method - start annotation start - stop annotation stop - score annotation score (may be undef) - strand annotation strand (may be undef) - phase annotation phase (may be undef) - group_class class of annotation's group (may be undef) - group_name ID of annotation's group (may be undef) - target_start start of target of a similarity hit - target_stop stop of target of a similarity hit - attributes array reference of attributes, each of which is a [tag=>value] array ref - -=cut - -sub load_gff_line { - my $self = shift; - my $gff = shift; - - if (defined $gff->{phase}){ - chomp($gff->{phase}); - undef($gff->{phase}) if $gff->{phase} eq '.'; - } - - if (defined $gff->{strand} && $gff->{strand} eq '.'){undef($gff->{strand})}; - if (defined $gff->{score} && $gff->{score} eq '.'){undef($gff->{score})}; - - my $s = $self->{load_stuff}; - my $dbh = $self->features_db; - local $dbh->{PrintError} = 0; - - defined(my $typeid = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return; - defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname} => $gff->{gclass})) or return; - - my $bin = bin($gff->{start},$gff->{stop},$self->min_bin); - my $result = $s->{sth}{insert_fdata}->execute($gff->{ref}, - $gff->{start},$gff->{stop},$bin, - $typeid, - $gff->{score},$gff->{strand},$gff->{phase}, - $groupid, - $gff->{tstart},$gff->{tstop}); - - warn $dbh->errstr,"\n" and print "ref=",$gff->{ref}," start=",$gff->{start}," stop=",$gff->{stop}," bin=",$bin," typeid=",$typeid," groupid=",$groupid,"\n" - and return unless $result; - - my $fid = $self->insertid($s->{sth},'fdata') - || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid); - - # insert attributes - foreach (@{$gff->{attributes}}) { - defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return; - $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]); - } - - if ( (++$s->{counter} % 1000) == 0) { - print STDERR "$s->{counter} records loaded..."; - print STDERR -t STDOUT && !$ENV{EMACS} ? 
"\r" : "\n"; - } - - $fid; -} - - -sub insertid { - my $self = shift; - my $sth = shift ; - my $table = shift; - - my $insert_id; - if ($sth->{"insertid_$table"}->execute()){ - $insert_id = ($sth->{"insertid_$table"}->fetchrow_array)[0]; - } - else{ - warn "No CURRVAL for SEQUENCE of table $table ",$sth->errstr,"\n"; - return; - } - return $insert_id; -} - - -=head2 get_table_id - - Title : get_table_id - Usage : $integer = $db->get_table_id($table,@ids) - Function: get the ID of a group or type - Returns : an integer ID or undef - Args : none - Status : private - -This internal method is called by load_gff_line to look up the integer -ID of an existing feature type or group. The arguments are the name -of the table, and two string identifiers. For feature types, the -identifiers are the method and source. For groups, the identifiers -are group name and class. - -This method requires that a statement handler named I, -have been created previously by setup_load(). It is here to overcome -deficiencies in mysql's INSERT syntax. - -=cut - -#' -# get the object ID from a named table -sub get_table_id { - my $self = shift; - my $table = shift; - my @ids = @_; - - # irritating warning for null id - my $id_key; - { - local $^W=0; - $id_key = join ':',@ids; - } - - my $s = $self->{load_stuff}; - my $sth = $s->{sth}; - my $dbh = $self->features_db; - - unless (defined($s->{$table}{$id_key})) { - $sth->{"lookup_$table"}->execute(@ids); - my @result = $sth->{"lookup_$table"}->fetchrow_array; - if (@result > 0) { - $s->{$table}{$id_key} = $result[0]; - } else { - $sth->{"insert_$table"}->execute(@ids) - && ($s->{$table}{$id_key} = $self->insertid($sth,$table)); - #&& ($s->{$table}{$id_key} = $self->insertid($sth->{"insertid_$table"})); - #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid); - } - } - - my $id = $s->{$table}{$id_key}; - unless (defined $id) { - warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n"; - return; - } - $id; -} - - -#sub insertid { -# my $self = shift; -# my $insertid_sth = shift ; -# my $insert_id; -# if ($insertid_sth->execute){ -# $insert_id = ($insertid_sth->fetchrow_array)[0]; -# } -# else{ -# warn "No CURRVAL for SEQUENCE ",$insertid_sth->errstr,"\n"; -# return; -# } -# return $insert_id; -#} - -sub insert_sequence { - my $self = shift; - my($id,$offset,$seq) = @_; - my $sth = $self->{_insert_sequence} - ||= $self->dbh->prepare_delayed('insert into fdna values (?,?,?)'); - $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr); -} - -=head2 range_query - - Title : range_query - Usage : $db->range_query($range_type,$refseq,$refclass,$start,$stop,$types,$order_by_group,$attributes,$binsize) - Function: create statement handle for range/overlap queries - Returns : a DBI statement handle - Args : see below - Status : Protected - -This method constructs the statement handle for this module's central -query: given a range and/or a list of feature types, fetch their GFF -records. It overrides a method in dbi.pm so that the overlaps query -can write SQL optimized for Postgres. Specifically, instead of writing -the bin related section as a set of ORs, each bin piece is place in -a separate select and then they are UNIONed together. This subroutine -requires several replacements for other subroutines in dbi.pm. In this -module, they are named the same as those in dbi.pm but prefixed with -"pg_". - -The positional arguments are as follows: - - Argument Description - - $isrange A flag indicating that this is a range. - query. 
Otherwise an overlap query is - assumed. - - $refseq The reference sequence name (undef if no range). - - $refclass The reference sequence class (undef if no range). - - $start The start of the range (undef if none). - - $stop The stop of the range (undef if none). - - $types Array ref containing zero or feature types in the - format [method,source]. - - $order_by_group A flag indicating that statement handler should group - the features by group id (handy for iterative fetches) - - $attributes A hash containing select attributes. - - $binsize A bin size for generating tables of feature density. - -=cut - -sub range_query { - my $self = shift; - my($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$bin) = @_; - - my $dbh = $self->features_db; - - # my @bin_parts = split /\n\s+OR/, $self->bin_query($start,$stop); - # warn "bin_part: @bin_parts\n"; - - my %a = (refseq=>$refseq,class=>$class,start=>$start,stop=>$stop,types=>$types,attributes=>$attributes,bin_width=>$bin); - my ($query, @args, $order_by); - - if ($rangetype ne 'overlaps') { - - my $select = $self->make_features_select_part(\%a); - my $from = $self->make_features_from_part($sparse,\%a); - my $join = $self->make_features_join_part(\%a); - my $where; - ($where,@args) = $self->make_features_by_range_where_part($rangetype,\%a); - my ($group_by,@more_args) = $self->make_features_group_by_part(\%a); - $order_by = $self->make_features_order_by_part(\%a) if $order_by_group; - - $query = "SELECT $select FROM $from WHERE $join"; - $query .= " AND $where" if $where; - - if ($group_by) { - $query .= " GROUP BY $group_by"; - push @args,@more_args; - } - - } else { # most common case: overlaps query - - my @bin_parts = split /\s*OR/, $self->bin_query($start,$stop); - my $select = $self->make_features_select_part(\%a); - my $from = $self->make_features_from_part($sparse,\%a); - my $join = $self->make_features_join_part(\%a); - my $where; - ($where,@args) = $self->pg_make_features_by_range_where_part($rangetype,\%a); - my ($group_by,@more_args)= $self->make_features_group_by_part(\%a); - $order_by = $self->pg_make_features_order_by_part(\%a) if $order_by_group; - - my @temp_args; - my @query_pieces; - foreach my $bin (@bin_parts) { - my $temp_query = "SELECT $select FROM $from WHERE $join AND $where AND $bin\n"; - push @temp_args, @args; - - if ($group_by) { - $temp_query .= " GROUP BY $group_by"; - push @temp_args,@more_args; - } - - push @query_pieces, $temp_query; - } - - @args = @temp_args; - $query = join("UNION\n", @query_pieces); - - } - - $query .= " ORDER BY $order_by" if $order_by; - - $self->dbh->do('set enable_seqscan=off'); - my $sth = $self->dbh->do_query($query,@args); - $sth; -} - -sub pg_make_features_by_range_where_part { - my $self = shift; - my ($rangetype,$options) = @_; - - return unless $rangetype eq 'overlaps'; - - $options ||= {}; - my ($refseq,$class,$start,$stop,$types,$attributes) = - @{$options}{qw(refseq class start stop types attributes)}; - - my (@query,@args); - - if ($refseq) { - my ($q,@a) = $self->refseq_query($refseq,$class); - push @query,$q; - push @args,@a; - } - - if (defined $start or defined $stop) { - $start = 0 unless defined($start); - $stop = MAX_SEGMENT unless defined($stop); - - my ($range_query,@range_args) = $self->pg_overlap_query($start,$stop); - - push @query,$range_query; - push @args,@range_args; - } - - if (defined $types && @$types) { - my ($type_query,@type_args) = $self->types_query($types); - push @query,$type_query; - push @args,@type_args; - } - - if 
($attributes) { - my ($attribute_query,@attribute_args) = $self->make_features_by_attribute_where_part($attributes); - push @query,"($attribute_query)"; - push @args,@attribute_args; - } - - my $query = join "AND",@query; - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -sub pg_overlap_query { - my $self = shift; - my ($start,$stop) = @_; - - my ($iq,@iargs) = $self->overlap_query_nobin($start,$stop); - my $query = "\n$iq\n"; - my @args = @iargs; - - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -sub pg_make_features_order_by_part { - my $self = shift; - my $options = shift || {}; - return "gname"; -} - -=head2 search_notes - -This PostgreSQL adaptor does not implement the search notes method -because it can be very slow (although the code for the method is -contained in this method but commented out). -There is, however, a PostgreSQL adaptor that does implement it in -a more efficient way: L, -which inherits from this adaptor and uses the optional PostgreSQL -module TSearch2 for full text indexing. See that adaptor's -documentation for more information. - -See also L - - Title : search_notes - Usage : @search_results = $db->search_notes("full text search string",$limit) - Function: Search the notes for a text string, using mysql full-text search - Returns : array of results - Args : full text search string, and an optional row limit - Status : public - -This is a replacement for the mysql-specific method. Given a search string, it -performs a ILIKE search of the notes table and returns an array of results. -Each row of the returned array is a arrayref containing the following fields: - - column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment() - column 2 The text of the note - column 3 A relevance score. - -Note that for large databases this can be very slow and may result in -time out or 500-cgi errors. If this is happening on a regular basis, -you should look into using L which -implements the TSearch2 full text indexing scheme. - -=cut - -sub search_notes{ -# my $self = shift; -# my ($search_string,$limit) = @_; -# -# $search_string =~ tr/*/%/s; -# $search_string = '%'.$search_string unless $search_string =~ /^\%/; -# $search_string = $search_string.'%' unless $search_string =~ /\%$/; -# warn "search_string:$search_string"; -# my $query = FULLTEXTWILDCARD; -# $query .= " limit $limit" if defined $limit; -# my $sth = $self->dbh->do_query($query,$search_string); -# -# my @results; -# while (my ($class,$name,$note) = $sth->fetchrow_array) { -# -# next unless $class && $name; # sorry, ignore NULL objects -# my $featname = Bio::DB::GFF::Featname->new($class=>$name); -# -# push @results,[$featname,$note,0]; #gbrowse expects a score, but -# #pg doesn't give one, thus the 0 -# } -# warn @results; -# -# return @results; -} - - -=head2 make_meta_set_query - - Title : make_meta_set_query - Usage : $sql = $db->make_meta_set_query - Function: return SQL fragment for setting a meta parameter - Returns : SQL fragment - Args : none - Status : public - -By default this does nothing; meta parameters are not stored or -retrieved. 
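In this adaptor, however, the method is overridden to return a plain two-placeholder INSERT into the fmeta table. A minimal usage sketch, assuming an already-open DBI handle in $dbh and an illustrative name/value pair (the value 2000 is invented, not a module default):

    my $sql = $db->make_meta_set_query;   # 'INSERT INTO fmeta VALUES (?,?)'
    my $sth = $dbh->prepare($sql);
    $sth->execute('chunk_size', 2000);    # store a hypothetical meta value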
- -=cut - -sub make_meta_set_query { - return 'INSERT INTO fmeta VALUES (?,?)'; -} - -sub make_classes_query { - my $self = shift; - return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT gclass IS NULL'; -} - - -sub chunk_size { - my $self = shift; - $self->meta('chunk_size') || DEFAULT_CHUNK; -} - -sub getseqcoords_query { - my $self = shift; - return GETSEQCOORDS ; -} - -sub getaliascoords_query{ - my $self = shift; - return GETALIASCOORDS ; -} - - -sub getforcedseqcoords_query{ - my $self = shift; - return GETFORCEDSEQCOORDS ; -} - - -sub getaliaslike_query{ - my $self = shift; - return GETALIASLIKE ; -} - - -sub make_features_select_part { - my $self = shift; - my $options = shift || {}; - my $s; - if (my $b = $options->{bin_width}) { - - $s = <{attributes} && keys %{$options->{attributes}}>1; - $s; -} - -sub make_features_from_part_bkup { - my $self = shift; - my $sparse = shift; - my $options = shift || {}; - #my $index = $sparse ? ' USE INDEX(ftypeid)': ''; - my $index = ''; - return $options->{attributes} ? "fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n" - : "fdata${index},ftype,fgroup\n"; -} - - -#################################### -# moved from mysqlopt.pm -################################### -# meta values -sub default_meta_values { - my $self = shift; - my @values = $self->SUPER::default_meta_values; - return ( - @values, - max_bin => MAX_BIN, - min_bin => MIN_BIN, - straight_join_limit => STRAIGHT_JOIN_LIMIT, - ); -} - -sub min_bin { - my $self = shift; - return $self->meta('min_bin') || MIN_BIN; -} -sub max_bin { - my $self = shift; - return $self->meta('max_bin') || MAX_BIN; -} -sub straight_join_limit { - my $self = shift; - return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT; -} - - -sub _feature_by_name { - my $self = shift; - my ($class,$name,$location,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my @bin_parts = split /\s*OR/, $self->bin_query($location->[1],$location->[2]) if $location; - my $select = $self->make_features_select_part; - my $from = $self->make_features_from_part(undef,{sparse_groups=>1}); - my ($where,@args) = $self->make_features_by_name_where_part($class,$name); - my $join = $self->make_features_join_part; - my $range = $self->pg_make_features_by_range_where_part('overlaps', - {refseq=>$location->[0], - class =>'', - start=>$location->[1], - stop =>$location->[2]}) if $location; - - my @temp_args; - my @query_pieces; - my $query; - if (@bin_parts) { - foreach my $bin (@bin_parts) { - my $temp_query = "SELECT $select FROM $from WHERE $join AND $where AND $range AND $bin\n"; - push @temp_args, @args; - push @query_pieces, $temp_query; - } - - @args = @temp_args; - $query = join("UNION\n", @query_pieces); - - } else { - $query = "SELECT $select FROM $from WHERE $where AND $join"; - } - - my $sth = $self->dbh->do_query($query,@args); - - my $count = 0; - while (my @row = $sth->fetchrow_array) { - $callback->(@row); - $count++; - } - $sth->finish; - return $count; -} - -sub update_sequences { - my $self = shift; - my $dbh = $self->features_db; - - $dbh->do("SELECT setval('public.fdata_fid_seq', max(fid)+1) FROM fdata"); - $dbh->do("SELECT setval('public.fattribute_fattribute_id_seq', max(fattribute_id)+1) FROM fattribute"); - $dbh->do("SELECT setval('public.fgroup_gid_seq', max(gid)+1) FROM fgroup"); - $dbh->do("SELECT setval('public.ftype_ftypeid_seq', max(ftypeid)+1) FROM ftype"); - - 1; -} - -=head2 make_features_by_name_where_part - - Title : make_features_by_name_where_part - Usage : 
$db->make_features_by_name_where_part - Function: Overrides a function in Bio::DB::GFF::Adaptor::dbi to insure - that searches will be case insensitive. It creates the SQL - fragment needed to select a feature by its group name & class - Returns : a SQL fragment and bind arguments - Args : see below - Status : Protected - -=cut - -sub make_features_by_name_where_part { - my $self = shift; - my ($class,$name) = @_; - - if ($name !~ /\*/) { - #allows utilization of an index on lower(gname) - return ("fgroup.gclass=? AND lower(fgroup.gname) = lower(?)",$class,$name); - } - else { - $name =~ tr/*/%/; - return ("fgroup.gclass=? AND lower(fgroup.gname) LIKE lower(?)",$class,$name); - } -} - -# -# Methods from dbi.pm that need to be overridden to make -# searching for fref case insensitive -# -# -sub get_dna { - my $self = shift; - my ($ref,$start,$stop,$class) = @_; - - my ($offset_start,$offset_stop); - - my $has_start = defined $start; - my $has_stop = defined $stop; - - my $reversed; - if ($has_start && $has_stop && $start > $stop) { - $reversed++; - ($start,$stop) = ($stop,$start); - } - - # turn start and stop into 0-based offsets - my $cs = $self->dna_chunk_size; - $start -= 1; $stop -= 1; - $offset_start = int($start/$cs)*$cs; - $offset_stop = int($stop/$cs)*$cs; - - my $sth; - # special case, get it all - if (!($has_start || $has_stop)) { - $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) order by foffset',$ref); - } - - elsif (!$has_stop) { - $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) and foffset>=? order by foffset', - $ref,$offset_start); - } - - else { # both start and stop defined - $sth = $self->dbh->do_query('select fdna,foffset from fdna where lower(fref)=lower(?) and foffset>=? and foffset<=? order by foffset', - $ref,$offset_start,$offset_stop); - } - - my $dna = ''; - while (my($frag,$offset) = $sth->fetchrow_array) { - substr($frag,0,$start-$offset) = '' if $has_start && $start > $offset; - $dna .= $frag; - } - substr($dna,$stop-$start+1) = '' if $has_stop && $stop-$start+1 < length($dna); - if ($reversed) { - $dna = reverse $dna; - $dna =~ tr/gatcGATC/ctagCTAG/; - } - - $sth->finish; - $dna; -} - - -sub refseq_query { - my $self = shift; - my ($refseq,$refclass) = @_; - my $query = "lower(fdata.fref)=lower(?)"; - return wantarray ? ($query,$refseq) : $self->dbh->dbi_quote($query,$refseq); -} - -sub make_types_where_part { - my $self = shift; - my ($srcseq,$start,$stop,$want_count,$typelist) = @_; - my (@query,@args); - if (defined($srcseq)) { - push @query,'lower(fdata.fref)=lower(?)'; - push @args,$srcseq; - if (defined $start or defined $stop) { - $start = 1 unless defined $start; - $stop = MAX_SEGMENT unless defined $stop; - my ($q,@a) = $self->overlap_query($start,$stop); - push @query,"($q)"; - push @args,@a; - } - } - if (defined $typelist && @$typelist) { - my ($q,@a) = $self->types_query($typelist); - push @query,($q); - push @args,@a; - } - my $query = @query ? join(' AND ',@query) : '1=1'; - return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args); -} - -sub get_feature_id { - my $self = shift; - my ($ref,$start,$stop,$typeid,$groupid) = @_; - my $s = $self->{load_stuff}; - unless ($s->{get_feature_id}) { - my $dbh = $self->features_db; - $s->{get_feature_id} = - $dbh->prepare_delayed('SELECT fid FROM fdata WHERE lower(fref)=lower(?) AND fstart=? AND fstop=? AND ftypeid=? 
AND gid=?'); - } - my $sth = $s->{get_feature_id} or return; - $sth->execute($ref,$start,$stop,$typeid,$groupid) or return; - my ($fid) = $sth->fetchrow_array; - return $fid; -} - -sub _delete { - my $self = shift; - my $delete_spec = shift; - my $ranges = $delete_spec->{segments} || []; - my $types = $delete_spec->{types} || []; - my $force = $delete_spec->{force}; - my $range_type = $delete_spec->{range_type}; - my $dbh = $self->features_db; - - my $query = 'delete from fdata'; - my @where; - - my @range_part; - for my $segment (@$ranges) { - my $ref = $dbh->quote($segment->abs_ref); - my $start = $segment->abs_start; - my $stop = $segment->abs_stop; - my $range = $range_type eq 'overlaps' ? $self->overlap_query($start,$stop) - : $range_type eq 'contains' ? $self->contains_query($start,$stop) - : $range_type eq 'contained_in' ? $self->contained_in_query($start,$stop) - : $self->throw("Invalid range type '$range_type'"); - push @range_part,"(lower(fref)=lower($ref) AND $range)"; - } - push @where,'('. join(' OR ',@range_part).')' if @range_part; - - # get all the types - if (@$types) { - my $types_where = $self->types_query($types); - my $types_query = "select ftypeid from ftype where $types_where"; - my $result = $dbh->selectall_arrayref($types_query); - my @typeids = map {$_->[0]} @$result; - my $typelist = join ',',map{$dbh->quote($_)} @typeids; - $typelist ||= "0"; # don't cause DBI to die with invalid SQL when - # unknown feature types were requested. - push @where,"(ftypeid in ($typelist))"; - } - $self->throw("This operation would delete all feature data and -force not specified") - unless @where || $force; - $query .= " where ".join(' and ',@where) if @where; - warn "$query\n" if $self->debug; - my $result = $dbh->do($query); - defined $result or $self->throw($dbh->errstr); - $result; -} - -sub make_abscoord_query { - my $self = shift; - my ($name,$class,$refseq) = @_; - #my $query = GETSEQCOORDS; - my $query = $self->getseqcoords_query(); - my $getforcedseqcoords = $self->getforcedseqcoords_query() ; - if ($name =~ /\*/) { - $name =~ s/%/\\%/g; - $name =~ s/_/\\_/g; - $name =~ tr/*/%/; - $query =~ s/gname\) = lower/gname) LIKE lower/; - } - defined $refseq - ? $self->dbh->do_query($getforcedseqcoords,$name,$class,$refseq) - : $self->dbh->do_query($query,$name,$class); -} - -sub make_aliasabscoord_query { - my $self = shift; - my ($name,$class) = @_; - #my $query = GETALIASCOORDS; - my $query = $self->getaliascoords_query(); - if ($name =~ /\*/) { - $name =~ s/%/\\%/g; - $name =~ s/_/\\_/g; - $name =~ tr/*/%/; - $query =~ s/gname\) = lower/gname) LIKE lower/; - } - $self->dbh->do_query($query,$name,$class); -} - - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/dbi/pg_fts.pm b/lib/Bio/DB/GFF/Adaptor/dbi/pg_fts.pm deleted file mode 100644 index 5640d43c1..000000000 --- a/lib/Bio/DB/GFF/Adaptor/dbi/pg_fts.pm +++ /dev/null @@ -1,363 +0,0 @@ -package Bio::DB::GFF::Adaptor::dbi::pg_fts; - - -=head1 NAME - -Bio::DB::GFF::Adaptor::dbi::pg_fts -- Database adaptor for a specific postgres schema with a TSearch2 implementation - -=head1 SYNOPSIS - - #create new GFF database connection - my $db = Bio::DB::GFF->new( -adaptor => 'dbi::pg_fts', - -dsn => 'dbi:Pg:dbname=worm'); - - #add full text indexing 'stuff' - #assumes that TSearch2 is available to PostgreSQL - #this will take a VERY long time for a reasonably large database - $db->install_TSearch2(); - - ...some time later... - #we don't like full text searching... 
- $db->remove_TSearch2(); - -=head1 DESCRIPTION - -This adaptor is based on Bio::DB::GFF::Adaptor::dbi::pg but it implements -the TSearch2 PostgreSQL contrib module for fast full text searching. To -use this module with your PostgreSQL GFF database, you need to make -TSearch2 available in the database. - -To use this adaptor, follow these steps: - -=over - -=item Install TSearch2 contrib module for Pg - -Can be as easy as `sudo yum install postgresql-contrib`, or you may -need to recompile PostgreSQL to include it. See -L -for more details - -=item Load the TSearch2 functions to you database - - % cat tsearch2.sql | psql - -=item Load your data using the pg adaptor: - - % bp_pg_bulk_load_gff.pl -c -d yeast saccharomyces_cerevisiae.gff - -or - - % bp_load_gff.pl -c -d yeast -a dbi::pg saccharomyces_cerevisiae.gff - -=item Add GFF/TSearch2 specific modifications - -Execute a perl script like this one: - - #!/usr/bin/perl -w - use strict; - - use Bio::DB::GFF; - - my $db = Bio::DB::GFF->new( - -adaptor => 'dbi::pg_fts', - -dsn => 'dbi:Pg:dbname=yeast', - -user => 'scott', - ); - - print "Installing TSearch2 columns...\n"; - - $db->install_TSearch2(); - - print "Done\n"; - -=back - -Note that this last step will take a long time. For a S. cerevisiae -database with 15K rows, it took over an hour on my laptop, and -with a C. elegans database (~10 million rows) it took well over a day. - -If at some point you add more data you your database, you need to run -a similar script to the one above, only executing the update_TSearch2() -method. Finally, if you want to remove the TSearch2 columns from your -database and go back to using the pg adaptor, you can execute a script -like the one above, only executing the remove_TSearch2() method. - -=head1 NOTES ABOUT TSearch2 SEARCHING - -You should know a few things about how searching with TSearch2 works in -the GBrowse environment: - -=over - -=item 1 - -TSearch2 does not do wild cards, so you should encourage your users not -to use them. If wild cards are used, the adaptor will fall back on -an ILIKE search, which will be much slower. - -=item 2 - -However, TSearch2 does do 'word stemming'. That is, if you search -for 'copy', it will find 'copy', 'copies', and 'copied'. - -=item 3 - -TSearch2 does not do phrase searching; all of the terms in the -search string are ANDed together. - -=back - -=head1 ACKNOWLEDGEMENTS - -Special thanks to Russell Smithies and Paul Smale at AgResearch in -New Zealand for giving me their recipe for doing full text indexing -in a GFF database. - -=head1 BUGS - -Please report bugs to the BioPerl and/or GBrowse mailing lists -(L and L -respectively). - -=head1 SEE ALSO - -Please see L for more information -about tuning your PostgreSQL server for GFF data, and for general -information about GFF database access, see L. - -=head1 AUTHOR - -Scott Cain, cain@cshl.edu - -=head1 APPENDIX - -=cut - -# a simple postgres adaptor -use strict; -use Bio::DB::GFF::Adaptor::dbi; -use base qw(Bio::DB::GFF::Adaptor::dbi::pg); - -use constant FULLTEXTSEARCH => < <SUPER::new(@_); - return $self; -} - -=head2 search_notes - - Title : search_notes - Usage : @search_results = $db->search_notes("full text string",$limit) - Function: Search the notes for a text string, using PostgreSQL TSearch2 - Returns : array of results - Args : full text search string, and an optional row limit - Status : public - -This is based on the mysql-specific method that makes use of the TSearch2 -functionality in PosgreSQL's contrib directory. 
Given a search string, -it performs a full-text search of the notes table and returns an array -of results. Each row of the returned array is a arrayref containing -the following fields: - - column 1 A Bio::DB::GFF::Featname object, for passing to segment() - column 2 The text of the note - column 3 A relevance score. - -=cut - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - my @terms = split /\s+/, $search_string; - - my $sth; - if ($search_string =~ /\*/) { - $search_string =~ tr/*/%/s; - my $query = FULLTEXTWILDCARD; - $query .= " limit $limit" if defined $limit; - $sth = $self->dbh->do_query($query,$search_string); - } - elsif (@terms == 1) { - my $query = FULLTEXTSEARCH; - $query .= " limit $limit" if defined $limit; - $sth = $self->dbh->do_query($query,$search_string); - } - else { - my $query = FULLTEXTSEARCH; - my $andstring = join (' & ', @terms); -# $query .= qq{ AND (fattribute_to_feature.fattribute_value ILIKE '\%$search_string%')}; - $query .= " LIMIT $limit" if defined $limit; - $sth = $self->dbh->do_query($query,$andstring); - } - - my @results; - while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) { - - next unless $class && $name; # sorry, ignore NULL objects - my $featname = Bio::DB::GFF::Featname->new($class=>$name); - my $type = Bio::DB::GFF::Typename->new($method,$source); - push @results,[$featname,$note,0,$type]; #gbrowse expects a score, but - #pg doesn't give one, thus the 0 - } - - return @results; -} - -=head2 make_features_by_name_where_part - - Title : make_features_by_name_where_part - Function: constructs a TSearch2-compliant WHERE clause for a name search - Status : protected - -=cut - -#need a make_features_by_name_where_part method to override pg -sub make_features_by_name_where_part { - my $self = shift; - my ($class,$name) = @_; - - my @terms = split /\s+/, $name; - - if ($name =~ /\*/) { - $name =~ tr/*/%/s; - return ("fgroup.gclass=? AND lower(fgroup.gname) LIKE lower(?)",$class,$name); - } - else { - my $where_str = "fgroup.gclass=? AND (fgroup.idxfti @@ to_tsquery('default', ?)) "; - if (@terms == 1) { - return ($where_str,$class,$name); - } - else { - my $andstring = join (' & ', @terms); -# $where_str .= qq{ AND (fgroup.gname ILIKE '\%$name%')}; - return ($where_str,$class,$andstring); - } - } -} - -=head2 install_TSearch2 - - Title : install_TSearch2 - Function: installs schema modifications for use with TSearch2 - Usage : $db->install_TSearch2 - Status : public - -=cut - - -#needs method for installing TSearch2 (does that mean that the SQL for -#creating the tables and functions should go in here? 
That would be -#the safest and easiest thing to do -sub install_TSearch2 { - my $self = shift; - - my $dbh = $self->features_db; - - $dbh->do('ALTER TABLE fattribute_to_feature ADD COLUMN idxFTI tsvector') - or $self->throw('adding FTI column to f_to_f failed'); - - $dbh->do('ALTER TABLE fgroup ADD COLUMN idxFTI tsvector') - or $self->throw('adding FTI column to fgroup failed'); - - $self->update_TSearch2(); - - return; -} - -=head2 update_TSearch2 - - Title : update_TSearch2 - Function: Updates TSearch2 columns - Usage : $db->update_TSearch2 - Status : public - -=cut - - -sub update_TSearch2 { - my $self = shift; - - my $dbh = $self->features_db; - - $self->warn('updating full text column; this may take a very long time...'); - $dbh->do("UPDATE fattribute_to_feature " - ."SET idxFTI= to_tsvector('default', fattribute_value) " - ."WHERE idxFTI IS NULL") - or $self->throw('updating fti column failed'); - $dbh->do("UPDATE fgroup " - ."SET idxFTI= to_tsvector('default', gname) " - ."WHERE idxFTI IS NULL") - or $self->throw('updating fgroup fti column failed'); - - $self->warn('Preliminary optimization of database; this may also take a long time...'); - $dbh->do('VACUUM FULL ANALYZE') - or $self->throw('vacuum failed'); - - $self->warn('Updating full text index; again, this may take a long time'); - $dbh->do('CREATE INDEX idxFTI_idx ON fattribute_to_feature ' - .'USING gist(idxFTI)') - or $self->warn('creating full text index failed'); - $dbh->do('CREATE INDEX fgroup_idxFTI_idx ON fgroup ' - .'USING gist(idxFTI)') - or $self->warn('creating fgroup full text index failed'); - - $self->warn('Optimizing database; hopefully, this will not take as long as other steps'); - $dbh->do('VACUUM FULL ANALYZE'); - $dbh->do("SELECT set_curcfg('default')"); - - return; -} - -=head2 remove_TSearch2 - - Title : remove_TSearch2 - Function: Removes TSearch2 columns - Usage : $db->remove_TSearch2 - Status : public - -=cut - -sub remove_TSearch2 { - my $self = shift; - - my $dbh = $self->features_db; - - $self->warn('Removing full text search capabilities'); - $dbh->do('DROP INDEX idxFTI_idx') - or $self->throw('dropping full text index failed'); - $dbh->do('DROP INDEX fgroup_idxFTI_idx') - or $self->throw('dropping full text index failed'); - - $dbh->do('ALTER TABLE fattribute_to_feature DROP COLUMN idxFTI') - or $self->throw('dropping full text column failed'); - $dbh->do('ALTER TABLE fgroup DROP COLUMN idxFTI') - or $self->throw('dropping full text column failed'); - - - return; -} - - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/memory.pm b/lib/Bio/DB/GFF/Adaptor/memory.pm deleted file mode 100644 index ed14def2b..000000000 --- a/lib/Bio/DB/GFF/Adaptor/memory.pm +++ /dev/null @@ -1,718 +0,0 @@ -package Bio::DB::GFF::Adaptor::memory; - -=head1 NAME - -Bio::DB::GFF::Adaptor::memory -- Bio::DB::GFF database adaptor for in-memory databases - -=head1 SYNOPSIS - - use Bio::DB::GFF; - my $db = Bio::DB::GFF->new(-adaptor=> 'memory', - -gff => 'my_features.gff', - -fasta => 'my_dna.fa' - ); - -or - - my $db = Bio::DB::GFF->new(-adaptor=>'memory'); - $db->load_gff_file('my_features.gff'); - $db->load_fasta_file('my_dna.fa'); - -See L for other methods. - -=head1 DESCRIPTION - -This adaptor implements an in-memory version of Bio::DB::GFF. It can be used to -store and retrieve SHORT GFF files. It inherits from Bio::DB::GFF. - -=head1 CONSTRUCTOR - -Use Bio::DB::GFF-Enew() to construct new instances of this class. 
-Three named arguments are recommended: - - Argument Description - - -adaptor Set to "memory" to create an instance of this class. - -gff Read the indicated file or directory of .gff file. - -fasta Read the indicated file or directory of fasta files. - -dir Indicates a directory containing .gff and .fa files - -If you use the -dir option and the indicated directory is writable by -the current process, then this library will create a FASTA file index -that greatly diminishes the memory usage of this module. - -Alternatively you may create an empty in-memory object using just the --adaptor=E'memory' argument and then call the load_gff_file() and -load_fasta_file() methods to load GFF and/or sequence -information. This is recommended in CGI/mod_perl/fastCGI environments -because these methods do not modify STDIN, unlike the constructor. - -=head1 METHODS - -See L for inherited methods. - -=head1 BUGS - -none ;-) - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Shuly Avraham Eavraham@cshl.orgE. - -Copyright (c) 2002 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -use strict; -# AUTHOR: Shulamit Avraham -# This module needs to be cleaned up and documented - -# Bio::DB::GFF::Adaptor::memory -- in-memory db adaptor -# implements the low level handling of data which stored in memory. -# This adaptor implements a specific in memory schema that is compatible with Bio::DB::GFF. -# Inherits from Bio::DB::GFF. - - -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::GFF::Adaptor::memory::iterator; -use File::Basename 'dirname'; -use Bio::DB::GFF::Adaptor::memory::feature_serializer qw(@hash2array_map); - - -use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get - -use base qw(Bio::DB::GFF); - -sub new { - my $class = shift ; - my ($file,$fasta,$dbdir,$preferred_groups) = rearrange([ - [qw(GFF FILE)], - 'FASTA', - [qw(DSN DB DIR DIRECTORY)], - 'PREFERRED_GROUPS', - ],@_); - - # fill in object - my $self = bless{ data => [] },$class; - $self->preferred_groups($preferred_groups) if defined $preferred_groups; - $file ||= $dbdir; - $fasta ||= $dbdir; - $self->load_gff($file) if $file; - $self->load_or_store_fasta($fasta) if $fasta; - return $self; -} - -sub load_or_store_fasta { - my $self = shift; - my $fasta = shift; - if ((-f $fasta && -w dirname($fasta)) - or - (-d $fasta && -w $fasta)) { - require Bio::DB::Fasta; - my $dna_db = eval {Bio::DB::Fasta->new($fasta);} - or warn "$@\nCan't open sequence file(s). Use -gff instead of -dir if you wish to load features without sequence.\n"; - $dna_db && $self->dna_db($dna_db); - } else { - $self->load_fasta($fasta); - } -} - -sub dna_db { - my $self = shift; - my $d = $self->{dna_db}; - $self->{dna_db} = shift if @_; - $d; -} - -sub insert_sequence { - my $self = shift; - my($id,$offset,$seq) = @_; - $self->{dna}{$id} .= $seq; -} - -# low-level fetch of a DNA substring given its -# name, class and the desired range. 
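# --- editorial usage sketch (hypothetical; not part of the original module) ---
# Illustrates the calling convention of get_dna() as implemented immediately
# below: a sequence name plus 1-based start/stop coordinates, where passing
# start > stop yields the reverse complement of the same range. The sequence
# name 'I' and the coordinates are invented for illustration only.
sub _example_get_dna_usage {
    my $db = shift;                               # a Bio::DB::GFF memory adaptor
    my $plus_strand  = $db->get_dna('I', 1, 50);  # forward strand, bases 1..50
    my $minus_strand = $db->get_dna('I', 50, 1);  # same bases, reverse complemented
    return ($plus_strand, $minus_strand);
}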
-sub get_dna { - my $self = shift; - my ($id,$start,$stop,$class) = @_; - if (my $dna_db = $self->dna_db) { - return $dna_db->seq($id,$start=>$stop); - } - return '' unless $self->{dna}; - - return $self->{dna}{$id} unless defined $start || defined $stop; - $start = 1 if !defined $start; - - my $reversed = 0; - if ($start > $stop) { - $reversed++; - ($start,$stop) = ($stop,$start); - } - my $dna = substr($self->{dna}{$id},$start-1,$stop-$start+1); - if ($reversed) { - $dna =~ tr/gatcGATC/ctagCTAG/; - $dna = reverse $dna; - } - - $dna; -} - -sub setup_load { - my $self = shift; - $self->{tmp} = {}; - $self->{data} = []; - 1; -} - -sub finish_load { - my $self = shift; - my $idx = 0; - foreach my $arrayref (values %{$self->{tmp}}) { - foreach (@$arrayref) {$_->{feature_id} = $idx++; } - push @{$self->{data}},@$arrayref; - } - 1; -} - -# this method loads the feature as a hash into memory - -# keeps an array of features-hashes as an in-memory db -sub load_gff_line { - my $self = shift; - my $feature_hash = shift; - $feature_hash->{strand} = '' if $feature_hash->{strand} && $feature_hash->{strand} eq '.'; - $feature_hash->{phase} = '' if $feature_hash->{phase} && $feature_hash->{phase} eq '.'; - $feature_hash->{gclass} = 'Sequence' unless length $feature_hash->{gclass} > 0; - # sort by group please - push @{$self->{tmp}{$feature_hash->{gclass},$feature_hash->{gname}}},$feature_hash; -} - -# given sequence name, return (reference,start,stop,strand) -sub get_abscoords { - my $self = shift; - my ($name,$class,$refseq) = @_; - my %refs; - my $regexp; - - if ($name =~ /[*?]/) { # uh oh regexp time - $name = quotemeta($name); - $name =~ s/\\\*/.*/g; - $name =~ s/\\\?/.?/g; - $regexp++; - } - - # Find all features that have the requested name and class. - # Sort them by reference point. - for my $feature (@{$self->{data}}) { - - my $no_match_class_name; - my $empty_class_name; - my $class_matches = !defined($feature->{gclass}) || - length($feature->{gclass}) == 0 || - $feature->{gclass} eq $class; - - if (defined $feature->{gname}) { - my $matches = $class_matches - && ($regexp ? $feature->{gname} =~ /$name/i : lc($feature->{gname}) eq lc($name)); - $no_match_class_name = !$matches; # to accomodate Shuly's interesting logic - } - - else{ - $empty_class_name = 1; - } - - if ($no_match_class_name){ - my $feature_attributes = $feature->{attributes}; - my $attributes = {Alias => $name}; - if (!$self->_matching_attributes($feature_attributes,$attributes)){ - next; - } - } - - push @{$refs{$feature->{ref}}},$feature; - } - - # find out how many reference points we recovered - if (! 
%refs) { - $self->error("$name not found in database"); - return; - } - - # compute min and max - my ($ref) = keys %refs; - my @found = @{$refs{$ref}}; - my ($strand,$start,$stop); - - my @found_segments; - foreach my $ref (keys %refs) { - next if defined($refseq) and lc($ref) ne lc($refseq); - my @found = @{$refs{$ref}}; - my ($strand,$start,$stop,$name); - foreach (@found) { - $strand ||= $_->{strand}; - $strand = '+' if $strand && $strand eq '.'; - $start = $_->{start} if !defined($start) || $start > $_->{start}; - $stop = $_->{stop} if !defined($stop) || $stop < $_->{stop}; - $name ||= $_->{gname}; - } - push @found_segments,[$ref,$class,$start,$stop,$strand,$name]; - - } - - return \@found_segments; -} - -sub search_notes { - my $self = shift; - my ($search_string,$limit) = @_; - - $search_string =~ tr/*?//d; - - my @results; - my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g; - my $search = join '|',@words; - - for my $feature (@{$self->{data}}) { - next unless defined $feature->{gclass} && defined $feature->{gname}; # ignore NULL objects - next unless $feature->{attributes}; - my @attributes = @{$feature->{attributes}}; - my @values = map {$_->[1]} @attributes; - my $value = "@values"; - my $matches = 0; - for my $w (@words) { - my @hits = $value =~ /($w)/ig; - $matches += @hits; - } - next unless $matches; - - my $relevance = 10 * $matches; - my $featname = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname}); - my $note; - $note = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @{$feature->{attributes}}; - $note .= join ' ',grep /$search/,map {$_->[1]} grep {$_->[0] ne 'Note'} @{$feature->{attributes}}; - my $type = Bio::DB::GFF::Typename->new($feature->{method},$feature->{source}); - push @results,[$featname,$note,$relevance,$type]; - last if defined $limit && @results >= $limit; - } - - #added result filtering so that this method returns the expected results - #this section of code used to be in GBrowse's do_keyword_search method - - my $match_sub = 'sub {'; - foreach (split /\s+/,$search_string) { - $match_sub .= "return unless \$_[0] =~ /\Q$_\E/i; "; - } - $match_sub .= "};"; - my $match = eval $match_sub; - - my @matches = grep { $match->($_->[1]) } @results; - - return @matches; -} - -sub _delete_features { - my $self = shift; - my @feature_ids = sort {$b<=>$a} @_; - my $removed = 0; - foreach (@feature_ids) { - next unless $_ >= 0 && $_ < @{$self->{data}}; - $removed += defined splice(@{$self->{data}},$_,1); - } - $removed; -} - -sub _delete { - my $self = shift; - my $delete_spec = shift; - my $ranges = $delete_spec->{segments} || []; - my $types = $delete_spec->{types} || []; - my $force = $delete_spec->{force}; - my $range_type = $delete_spec->{range_type}; - - my $deleted = 0; - if (@$ranges) { - my @args = @$types ? (-type=>$types) : (); - push @args,(-range_type => $range_type); - my %ids_to_remove = map {$_->id => 1} map {$_->features(@args)} @$ranges; - $deleted = $self->delete_features(keys %ids_to_remove); - } elsif (@$types) { - my %ids_to_remove = map {$_->id => 1} $self->features(-type=>$types); - $deleted = $self->delete_features(keys %ids_to_remove); - } else { - $self->throw("This operation would delete all feature data and -force not specified") - unless $force; - $deleted = @{$self->{data}}; - @{$self->{data}} = (); - } - $deleted; -} - -# attributes - - -# Some GFF version 2 files use the groups column to store a series of -# attribute/value pairs. 
In this interpretation of GFF, the first such -# pair is treated as the primary group for the feature; subsequent pairs -# are treated as attributes. Two attributes have special meaning: -# "Note" is for backward compatibility and is used for unstructured text -# remarks. "Alias" is considered as a synonym for the feature name. -# If no name is provided, then attributes() returns a flattened hash, of -# attribute=>value pairs. - -sub do_attributes{ - my $self = shift; - my ($feature_id,$tag) = @_; - my $attr ; - - #my $feature = ${$self->{data}}[$feature_id]; - my $feature = $self->_basic_features_by_id($feature_id); - - my @result; - for my $attr (@{$feature->{attributes}}) { - my ($attr_name,$attr_value) = @$attr ; - if (defined($tag) && lc($attr_name) eq lc($tag)){push @result,$attr_value;} - elsif (!defined($tag)) {push @result,($attr_name,$attr_value);} - } - return @result; -} - - -#sub get_feature_by_attribute{ -sub _feature_by_attribute{ - my $self = shift; - my ($attributes,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - my $count = 0; - my $feature_id = -1; - my $feature_group_id = undef; - - for my $feature (@{$self->{data}}) { - - $feature_id++; - for my $attr (@{$feature->{attributes}}) { - my ($attr_name,$attr_value) = @$attr ; - #there could be more than one set of attributes...... - foreach (keys %$attributes) { - if (lc($_) eq lc($attr_name) && lc($attributes->{$_}) eq lc($attr_value)) { - $callback->($self->_hash_to_array($feature)); - $count++; - } - } - } - } - -} - - -# This is the low-level method that is called to retrieve GFF lines from -# the database. It is responsible for retrieving features that satisfy -# range and feature type criteria, and passing the GFF fields to a -# callback subroutine. - -sub get_features{ - my $self = shift; - my $count = 0; - my ($search,$options,$callback) = @_; - - my $found_features; - - $found_features = $self->_get_features_by_search_options($search,$options); - - # only true if the sort by group option was specified - @{$found_features} = sort {lc("$a->{gclass}:$a->{gname}") cmp lc("$b->{gclass}:$b->{gname}")} - @{$found_features} if $options->{sort_by_group} ; - - for my $feature (@{$found_features}) { # only true if the sort by group option was specified - $count++; - $callback->( - $self->_hash_to_array($feature) - ); - } - - return $count; -} - - -# Low level implementation of fetching a named feature. -# GFF annotations are named using the group class and name fields. -# May return zero, one, or several Bio::DB::GFF::Feature objects. - -=head2 _feature_by_name - - Title : _feature_by_name - Usage : $db->get_features_by_name($name,$class,$callback) - Function: get a list of features by name and class - Returns : count of number of features retrieved - Args : name of feature, class of feature, and a callback - Status : protected - -This method is used internally. The callback arguments are those used -by make_feature(). 
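The implementation below also accepts shell-style wildcards in the name: '*' and '?' are converted into a case-insensitive regular expression before the in-memory feature list is scanned. A minimal sketch of that conversion, using an invented query name:

    my $name = 'unc-*';              # hypothetical name containing a wildcard
    if ($name =~ /[*?]/) {
        $name = quotemeta($name);
        $name =~ s/\\\*/.*/g;        # '*' matches any run of characters
        $name =~ s/\\\?/.?/g;        # '?' matches any single optional character
    }
    # feature names are then matched case-insensitively with /$name/i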
- -=cut - -sub _feature_by_name { - my $self = shift; - my ($class,$name,$location,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - my $count = 0; - my $regexp; - - if ($name =~ /[*?]/) { # uh oh regexp time - $name = quotemeta($name); - $name =~ s/\\\*/.*/g; - $name =~ s/\\\?/.?/g; - $regexp++; - } - - for my $feature (@{$self->{data}}) { - next unless ($regexp && $feature->{gname} =~ /$name/i) || lc($feature->{gname}) eq lc($name); - next if defined($feature->{gclass}) && length($feature->{gclass}) > 0 && $feature->{gclass} ne $class; - - if ($location) { - next if $location->[0] ne $feature->{ref}; - next if $location->[1] && $location->[1] > $feature->{stop}; - next if $location->[2] && $location->[2] < $feature->{start}; - } - $count++; - $callback->($self->_hash_to_array($feature),0); - } - return $count; -} - -# Low level implementation of fetching a feature by it's id. -# The id of the feature as implemented in the in-memory db, is the location of the -# feature in the features hash array. -sub _feature_by_id{ - my $self = shift; - my ($ids,$type,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my $feature_group_id = undef; - - my $count = 0; - if ($type eq 'feature'){ - for my $feature_id (@$ids){ - my $feature = $self->_basic_features_by_id($feature_id); - $callback->($self->_hash_to_array($feature)) if $callback; - $count++; - } - } -} - -sub _basic_features_by_id{ - my $self = shift; - my ($ids) = @_; - - $ids = [$ids] unless ref $ids =~ /ARRAY/; - - my @result; - for my $feature_id (@$ids){ - push @result, ${$self->{data}}[$feature_id]; - } - return wantarray() ? @result : $result[0]; -} - -# This method is similar to get_features(), except that it returns an -# iterator across the query. -# See Bio::DB::GFF::Adaptor::memory::iterator. - -sub get_features_iterator { - my $self = shift; - my ($search,$options,$callback) = @_; - $callback || $self->throw('must provide a callback argument'); - - my $results = $self->_get_features_by_search_options($search,$options); - my $results_array = $self->_convert_feature_hash_to_array($results); - - return Bio::DB::GFF::Adaptor::memory::iterator->new($results_array,$callback); -} - - -# This method is responsible for fetching the list of feature type names. -# The query may be limited to a particular range, in -# which case the range is indicated by a landmark sequence name and -# class and its subrange, if any. These arguments may be undef if it is -# desired to retrieve all feature types. - -# If the count flag is false, the method returns a simple list of -# Bio::DB::GFF::Typename objects. If $count is true, the method returns -# a list of $name=>$count pairs, where $count indicates the number of -# times this feature occurs in the range. 
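# --- editorial usage sketch (hypothetical; not part of the original module) ---
# Shows the two return modes documented above for get_types(): with the
# $want_count flag false, a list of Bio::DB::GFF::Typename objects; with it
# true, a hash of type => count pairs. The reference sequence name 'I' and
# the coordinate range are invented for illustration only.
sub _example_get_types_usage {
    my $db = shift;                                         # a memory adaptor
    my @types  = $db->get_types('I', undef, 1, 10_000, 0);  # Typename objects
    my %counts = $db->get_types('I', undef, 1, 10_000, 1);  # type => count pairs
    return (\@types, \%counts);
}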
- -sub get_types { - my $self = shift; - my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_; - - my(%result,%obj); - - for my $feature (@{$self->{data}}) { - my $feature_start = $feature->{start}; - my $feature_stop = $feature->{stop}; - my $feature_ref = $feature->{ref}; - my $feature_class = $feature->{class}; - my $feature_method = $feature->{method}; - my $feature_source = $feature->{source}; - - if (defined $srcseq){ - next unless lc($feature_ref) eq lc($srcseq); - } - - if (defined $class){ - next unless defined $feature_class && $feature_class eq $class ; - } - - # the requested range should OVERLAP the retrieved features - if (defined $start or defined $stop) { - $start = 1 unless defined $start; - $stop = MAX_SEGMENT unless defined $stop; - next unless $feature_stop >= $start && $feature_start <= $stop; - } - - if (defined $typelist && @$typelist){ - next unless $self->_matching_typelist($feature_method,$feature_source,$typelist); - } - - my $type = Bio::DB::GFF::Typename->new($feature_method,$feature_source); - $result{$type}++; - $obj{$type} = $type; - - } #end features loop - - return $want_count ? %result : values %obj; -} - -sub classes { - my $self = shift; - my %classes; - for my $feature (@{$self->{data}}) { - $classes{$feature->{gclass}}++; - } - my @classes = sort keys %classes; - return @classes; -} - -# Internal method that performs a search on the features array, -# sequentialy retrieves the features, and performs a check on each feature -# according to the search options. -sub _get_features_by_search_options{ - my $count = 0; - my ($self, $search,$options) = @_; - my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes) = - (@{$search}{qw(rangetype refseq refclass start stop types)}, - @{$options}{qw(sparse sort_by_group ATTRIBUTES)}) ; - - my @found_features; - my $data = $self->{data}; - - my $feature_id = -1 ; - my $feature_group_id = undef; - - for my $feature (@{$data}) { - - $feature_id++; - - my $feature_start = $feature->{start}; - my $feature_stop = $feature->{stop}; - my $feature_ref = $feature->{ref}; - - if (defined $refseq){ - next unless lc($feature_ref) eq lc($refseq); - } - - if (defined $start or defined $stop) { - $start = 0 unless defined($start); - $stop = MAX_SEGMENT unless defined($stop); - - if ($rangetype eq 'overlaps') { - next unless $feature_stop >= $start && $feature_start <= $stop; - } elsif ($rangetype eq 'contains') { - next unless $feature_start >= $start && $feature_stop <= $stop; - } elsif ($rangetype eq 'contained_in') { - next unless $feature_start <= $start && $feature_stop >= $stop; - } else { - next unless $feature_start == $start && $feature_stop == $stop; - } - - } - - my $feature_source = $feature->{source}; - my $feature_method = $feature->{method}; - - if (defined $types && @$types){ - next unless $self->_matching_typelist($feature_method,$feature_source,$types); - } - - my $feature_attributes = $feature->{attributes}; - if (defined $attributes){ - next unless $self->_matching_attributes($feature_attributes,$attributes); - } - - # if we get here, then we have a feature that meets the criteria. - # Then we just push onto an array - # of found features and continue. 
- - my $found_feature = $feature ; - $found_feature->{feature_id} = $feature_id; - $found_feature->{group_id} = $feature_group_id; - push @found_features,$found_feature; - } - - return \@found_features; -} - - -sub _hash_to_array { - my ($self,$feature_hash) = @_; - my @array = @{$feature_hash}{@hash2array_map}; - return wantarray ? @array : \@array; -} - -# this subroutine is needed for convertion of the feature from hash to array in order to -# pass it to the callback subroutine -sub _convert_feature_hash_to_array{ - my ($self, $feature_hash_array) = @_; - my @features_array_array = map {scalar $self->_hash_to_array($_)} @$feature_hash_array; - return \@features_array_array; -} - -sub _matching_typelist{ - my ($self, $feature_method,$feature_source,$typelist) = @_; - foreach (@$typelist) { - my ($search_method,$search_source) = @$_; - next if lc($search_method) ne lc($feature_method); - next if defined($search_source) && lc($search_source) ne lc($feature_source); - return 1; - } - return 0; -} - -sub _matching_attributes { - my ($self, $feature_attributes,$attributes) = @_ ; - foreach (keys %$attributes) { - return 0 if !_match_all_attr_in_feature($_,$attributes->{$_},$feature_attributes) - } - return 1; -} - -sub _match_all_attr_in_feature{ - my ($attr_name,$attr_value,$feature_attributes) = @_; - for my $attr (@$feature_attributes) { - my ($feature_attr_name,$feature_attr_value) = @$attr ; - next if ($attr_name ne $feature_attr_name || $attr_value ne $feature_attr_value); - return 1; - } - return 0; -} - - -sub do_initialize { 1; } -sub get_feature_by_group_id{ 1; } - -1; - diff --git a/lib/Bio/DB/GFF/Adaptor/memory/feature_serializer.pm b/lib/Bio/DB/GFF/Adaptor/memory/feature_serializer.pm deleted file mode 100644 index e305df568..000000000 --- a/lib/Bio/DB/GFF/Adaptor/memory/feature_serializer.pm +++ /dev/null @@ -1,37 +0,0 @@ -package Bio::DB::GFF::Adaptor::memory::feature_serializer; - - -=head1 NAME - -Bio::DB::GFF::Adaptor::memory::feature_serializer - utility methods for serializing and deserializing GFF features - -=cut - -use strict; - -require Exporter; -use vars qw(@EXPORT @EXPORT_OK @hash2array_map); -use base qw(Exporter); -@EXPORT_OK = qw(feature2string string2feature @hash2array_map); -@EXPORT = @EXPORT_OK; - -@hash2array_map = qw(ref start stop source method score strand phase gclass gname tstart tstop feature_id group_id bin); - -sub feature2string { - my $feature = shift; - local $^W = 0; - my @a = @{$feature}{@hash2array_map}; - push @a,map {join "\0",@$_} @{$feature->{attributes}} if $feature->{attributes}; - return join $;,@a; -} - -sub string2feature { - my $string = shift; - my (@attributes,%feature); - (@feature{@hash2array_map},@attributes) = split $;,$string; - $feature{attributes} = [map {[split "\0",$_]} @attributes]; - undef $feature{group_id}; - \%feature; -} - -1; diff --git a/lib/Bio/DB/GFF/Adaptor/memory/iterator.pm b/lib/Bio/DB/GFF/Adaptor/memory/iterator.pm deleted file mode 100644 index 2eaa8b57a..000000000 --- a/lib/Bio/DB/GFF/Adaptor/memory/iterator.pm +++ /dev/null @@ -1,79 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Adaptor::memory::iterator - iterator for Bio::DB::GFF::Adaptor::memory - -=head1 SYNOPSIS - -For internal use only - -=head1 DESCRIPTION - -This is an internal module that is used by the Bio::DB::GFF in-memory -adaptor to return an iterator across a sequence feature query. The -object has a single method, next_feature(), that returns the next -feature from the query. The method next_seq() is an alias for -next_feature(). 
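A typical consumer simply drains the iterator in a loop. The sketch below is editorial and assumes that $iterator already holds one of these objects, however it was obtained from the in-memory adaptor; it only illustrates the next_feature() calling convention described above.

    while (my $feature = $iterator->next_feature) {
        # process each feature returned by the query
    }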
- -=head1 BUGS - -None known yet. - -=head1 SEE ALSO - -L, - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -package Bio::DB::GFF::Adaptor::memory::iterator; -use strict; -# this module needs to be cleaned up and documented -use Bio::Root::Version; - -*next_seq = \&next_feature; - -sub new { - my $class = shift; - my ($data,$callback) = @_; - my $pos = 0; - return bless {data => $data, - pos => $pos, - callback => $callback, - cache => []},$class; - #return bless [$sth,$callback,[]],$class; -} - -sub next_feature { - my $self = shift; - return shift @{$self->{cache}} if @{$self->{cache}}; - - my $data = $self->{data} or return; - my $callback = $self->{callback}; - - my $features; - while (1) { - my $feature = $data->[$self->{pos}++]; - if ($feature) { - $features = $callback->(@{$feature}); - last if $features; - } else { - $features = $callback->(); - undef $self->{pos}; - undef $self->{data}; - undef $self->{cache}; - last; - } - } - $self->{cache} = $features or return; - shift @{$self->{cache}}; -} - -1; diff --git a/lib/Bio/DB/GFF/Aggregator.pm b/lib/Bio/DB/GFF/Aggregator.pm deleted file mode 100644 index d9162d791..000000000 --- a/lib/Bio/DB/GFF/Aggregator.pm +++ /dev/null @@ -1,642 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator -- Aggregate GFF groups into composite features - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - my $agg1 = Bio::DB::GFF::Aggregator->new(-method => 'cistron', - -main_method => 'locus', - -sub_parts => ['allele','variant'] - ); - - my $agg2 = Bio::DB::GFF::Aggregator->new(-method => 'splice_group', - -sub_parts => 'transcript'); - - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -aggregator => [$agg1,$agg2], - -dsn => 'dbi:mysql:elegans42', - ); - - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator is used to aggregate GFF groups into -composite features. Each composite feature has a "main part", the -top-level feature, and a series of zero or more subparts, retrieved -with the sub_SeqFeature() method. The aggregator class is designed to -be subclassable, allowing a variety of GFF feature types to be -supported. - -The base Bio::DB::GFF::Aggregator class is generic, and can be used to -create specific instances to be passed to the -aggregator argument of -Bio::DB::GFF-Enew() call. The various subclasses of -Bio::DB::GFF::Aggregator are tuned for specific common feature types -such as clones, gapped alignments and transcripts. - -Instances of Bio::DB::GFF::Aggregator have three attributes: - -=over 3 - -=item * - -method - -This is the GFF method field of the composite feature as a whole. For -example, "transcript" may be used for a composite feature created by -aggregating individual intron, exon and UTR features. - -=item * - -main method - -Sometimes GFF groups are organized hierarchically, with one feature -logically containing another. For example, in the C. elegans schema, -methods of type "Sequence:curated" correspond to regions covered by -curated genes. There can be zero or one main methods. - -=item * - -subparts - -This is a list of one or more methods that correspond to the component -features of the aggregates. For example, in the C. elegans database, -the subparts of transcript are "intron", "exon" and "CDS". 
- -=back - -Aggregators have two main methods that can be overridden in -subclasses: - -=over 4 - -=item * - -disaggregate() - -This method is called by the Adaptor object prior to fetching a list -of features. The method is passed an associative array containing the -[method,source] pairs that the user has requested, and it returns a -list of raw features that it would like the adaptor to fetch. - -=item * - -aggregate() - -This method is called by the Adaptor object after it has fetched -features. The method is passed a list of raw features and is expected -to add its composite features to the list. - -=back - -The disaggregate() and aggregate() methods provided by the base -Aggregator class should be sufficient for many applications. In this -case, it suffices for subclasses to override the following methods: - -=over 4 - -=item * - -method() - -Return the default method for the composite feature as a whole. - -=item * - -main_name() - -Return the default main method name. - -=item * - -part_names() - -Return a list of subpart method names. - -=back - -Provided that method() and part_names() are overridden (and optionally -main_name() as well), then the bare name of the aggregator subclass -can be passed to the -aggregator of Bio::DB::GFF-Enew(). For example, -this is a small subclass that will aggregate features of type "allele" -and "polymorphism" into an aggregate named "mutant": - - package Bio::DB::GFF::Aggregator::mutant; - - use strict; - use Bio::DB::GFF::Aggregator; - - use base qw(Bio::DB::GFF::Aggregator); - - sub method { 'mutant' } - - sub part_names { - return qw(allele polymorphism); - } - - 1; - -Once installed, this aggregator can be passed to Bio::DB::GFF-Enew() -by name like so: - - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -aggregator => 'mutant', - -dsn => 'dbi:mysql:elegans42', - ); - -=head1 API - -The remainder of this document describes the public and private -methods implemented by this module. - -=cut - -package Bio::DB::GFF::Aggregator; - -use strict; -use Bio::DB::GFF::Util::Rearrange; # for rearrange() -use Bio::DB::GFF::Feature; - -use base qw(Bio::Root::Root); - -my $ALWAYS_TRUE = sub { 1 }; - -=head2 new - - Title : new - Usage : $a = Bio::DB::GFF::Aggregator->new(@args) - Function: create a new aggregator - Returns : a Bio::DB::GFF::Aggregator object - Args : see below - Status : Public - -This is the constructor for Bio::DB::GFF::Aggregator. Named arguments -are as follows: - - -method the method for the composite feature - - -main_method the top-level raw feature, if any - - -sub_parts the list of raw features that will form the subparts - of the composite feature (array reference or scalar) - -=cut - -sub new { - my $class = shift; - my ($method,$main,$sub_parts,$whole_object) = rearrange(['METHOD', - ['MAIN_PART','MAIN_METHOD'], - ['SUB_METHODS','SUB_PARTS'], - 'WHOLE_OBJECT' - ],@_); - return bless { - method => $method, - main_method => $main, - sub_parts => $sub_parts, - require_whole_object => $whole_object, - },$class; -} - -=head2 disaggregate - - Title : disaggregate - Usage : $a->disaggregate($types,$factory) - Function: disaggregate type list into components - Returns : a true value if this aggregator should be called to reaggregate - Args : see below - Status : Public - -This method is called to disaggregate a list of types into the set of -low-level features to be retrieved from the GFF database. The list of -types is passed as an array reference containing a series of -[method,source] pairs. 
This method synthesizes a new set of -[method,source] pairs, and appends them to the list of requested -types, changing the list in situ. - -Arguments: - - $types reference to an array of [method,source] pairs - - $factory reference to the Adaptor object that is calling - this method - -Note that the API allows disaggregate() to remove types from the type -list. This feature is probably not desirable and may be deprecated in -the future. - -=cut - -# this is called at the beginning to turn the pseudo-type -# into its component feature types -sub disaggregate { - my $self = shift; - my $types = shift; - my $factory = shift; - - my $sub_features = $factory->parse_types($self->get_part_names); - my $main_feature = $factory->parse_types($self->get_main_name); - - if (@$types) { - my (@synthetic_types,@unchanged); - foreach (@$types) { - my ($method,$source) = @$_; - if (lc $method eq lc $self->get_method) { # e.g. "transcript" - push @synthetic_types,map { [$_->[0],$_->[1] || $source] } @$sub_features,@$main_feature; - } - else { - push @unchanged,$_; - } - } - # remember what we're searching for - $self->components(\@synthetic_types); - $self->passthru(\@unchanged); - @$types = (@unchanged,@synthetic_types); - } - - # we get here when no search types are listed - else { - my @stypes = map { [$_->[0],$_->[1]] } @$sub_features,@$main_feature; - $self->components(\@stypes); - $self->passthru(undef); - } - - return $self->component_count > 0; -} - - -=head2 aggregate - - Title : aggregate - Usage : $features = $a->aggregate($features,$factory) - Function: aggregate a feature list into composite features - Returns : an array reference containing modified features - Args : see below - Status : Public - -This method is called to aggregate a list of raw GFF features into the -set of composite features. The method is called an array reference to -a set of Bio::DB::GFF::Feature objects. It runs through the list, -creating new composite features when appropriate. The method result -is an array reference containing the composite features. - -Arguments: - - $features reference to an array of Bio::DB::GFF::Feature objects - - $factory reference to the Adaptor object that is calling - this method - -NOTE: The reason that the function result contains the raw features as -well as the aggregated ones is to allow queries like this one: - - @features = $segment->features('exon','transcript:curated'); - -Assuming that "transcript" is the name of an aggregated feature and -that "exon" is one of its components, we do not want the transcript -aggregator to remove features of type "exon" because the user asked -for them explicitly. - -=cut - -sub aggregate { - my $self = shift; - my $features = shift; - my $factory = shift; - - my $main_method = $self->get_main_name; - my $matchsub = $self->match_sub($factory) or return; - my $strictmatch = $self->strict_match(); - my $passthru = $self->passthru_sub($factory); - - my (%aggregates,@result); - for my $feature (@$features) { - - if ($feature->group && $matchsub->($feature)) { - my $key = $strictmatch->{lc $feature->method,lc $feature->source} - ? 
join ($;,$feature->group,$feature->refseq,$feature->source) - : join ($;,$feature->group,$feature->refseq); - if ($main_method && lc $feature->method eq lc $main_method) { - $aggregates{$key}{base} ||= $feature->clone; - } else { - push @{$aggregates{$key}{subparts}},$feature; - } - push @result,$feature if $passthru && $passthru->($feature); - - } else { - push @result,$feature; - } - } - - # aggregate components - my $pseudo_method = $self->get_method; - my $require_whole_object = $self->require_whole_object; - foreach (keys %aggregates) { - if ($require_whole_object && $self->components) { - next unless $aggregates{$_}{base}; # && $aggregates{$_}{subparts}; - } - my $base = $aggregates{$_}{base}; - unless ($base) { # no base, so create one - my $first = $aggregates{$_}{subparts}[0]; - $base = $first->clone; # to inherit parent coordinate system, etc - $base->score(undef); - $base->phase(undef); - } - $base->method($pseudo_method); - $base->add_subfeature($_) foreach @{$aggregates{$_}{subparts}}; - $base->adjust_bounds; - $base->compound(1); # set the compound flag - push @result,$base; - } - @$features = @result; -} - - -=head2 method - - Title : method - Usage : $string = $a->method - Function: get the method type for the composite feature - Returns : a string - Args : none - Status : Protected - -This method is called to get the method to be assigned to the -composite feature once it is aggregated. It is called if the user did -not explicitly supply a -method argument when the aggregator was -created. - -This is the method that should be overridden in aggregator subclasses. - -=cut - -# default method - override in subclasses -sub method { - my $self = shift; - $self->{method}; -} - -=head2 main_name - - Title : main_name - Usage : $string = $a->main_name - Function: get the method type for the "main" component of the feature - Returns : a string - Args : none - Status : Protected - -This method is called to get the method of the "main component" of the -composite feature. It is called if the user did not explicitly supply -a -main-method argument when the aggregator was created. - -This is the method that should be overridden in aggregator subclasses. - -=cut - -# no default main method -sub main_name { - my $self = shift; - return; -} - -=head2 part_names - - Title : part_names - Usage : @methods = $a->part_names - Function: get the methods for the non-main various components of the feature - Returns : a list of strings - Args : none - Status : Protected - -This method is called to get the list of methods of the "main component" of the -composite feature. It is called if the user did not explicitly supply -a -main-method argument when the aggregator was created. - -This is the method that should be overridden in aggregator subclasses. - -=cut - -# no default part names -sub part_names { - my $self = shift; - return; -} - -=head2 require_whole_object - - Title : require_whole_object - Usage : $bool = $a->require_whole_object - Function: see below - Returns : a boolean flag - Args : none - Status : Internal - -This method returns true if the aggregator should refuse to aggregate -an object unless both its main part and its subparts are present. 
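For example, a subclass can switch this behaviour on by overriding
require_whole_object(), just as the bundled "match" aggregator does. The
sketch below is illustrative only; the package name and the method/part
names it aggregates are hypothetical:

  package Bio::DB::GFF::Aggregator::complete_transcript;
  use strict;
  use base qw(Bio::DB::GFF::Aggregator);

  sub method               { 'complete_transcript' } # type of the composite feature
  sub main_name            { 'Transcript' }          # the main part that must be present
  sub part_names           { qw(exon CDS) }          # subparts attached to it
  sub require_whole_object { 1 }                     # skip groups lacking the main part

  1;

With require_whole_object() returning true, aggregate() skips any group for
which no feature of the main method was retrieved, rather than synthesizing
a base feature from the first subpart.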
- -=cut - -sub require_whole_object { - my $self = shift; - my $d = $self->{require_whole_object}; - $self->{require_whole_object} = shift if @_; - $d; -} - -=head2 match_sub - - Title : match_sub - Usage : $coderef = $a->match_sub($factory) - Function: generate a code reference that will match desired features - Returns : a code reference - Args : see below - Status : Internal - -This method is used internally to generate a code sub that will -quickly filter out the raw features that we're interested in -aggregating. The returned sub accepts a Feature and returns true if -we should aggregate it, false otherwise. - -=cut - -#' make emacs happy - -sub match_sub { - my $self = shift; - my $factory = shift; - my $types_to_aggregate = $self->components() or return; # saved from disaggregate call - return unless @$types_to_aggregate; - return $factory->make_match_sub($types_to_aggregate); -} - -=head2 strict_match - - Title : strict_match - Usage : $strict = $a->strict_match - Function: generate a hashref that indicates which subfeatures - need to be tested strictly for matching sources before - aggregating - Returns : a hash ref - Status : Internal - -=cut - -sub strict_match { - my $self = shift; - my $types_to_aggregate = $self->components(); - my %strict; - for my $t (@$types_to_aggregate) { - $strict{lc $t->[0],lc $t->[1]}++ if defined $t->[1]; - } - \%strict; -} - -sub passthru_sub { - my $self = shift; - my $factory = shift; - my $passthru = $self->passthru() or return; - return unless @$passthru; - return $factory->make_match_sub($passthru); -} - -=head2 components - - Title : components - Usage : @array= $a->components([$components]) - Function: get/set stored list of parsed raw feature types - Returns : an array in list context, an array ref in scalar context - Args : new arrayref of feature types - Status : Internal - -This method is used internally to remember the parsed list of raw -features that we will aggregate. The need for this subroutine is -seen when a user requests a composite feature of type -"clone:cosmid". This generates a list of components in which the -source is appended to the method, like "clone_left_end:cosmid" and -"clone_right_end:cosmid". components() stores this information for -later use. - -=cut - -sub components { - my $self = shift; - my $d = $self->{components}; - $self->{components} = shift if @_; - return unless ref $d; - return wantarray ? @$d : $d; -} - -sub component_count { - my @c = shift->components; - scalar @c; -} - -sub passthru { - my $self = shift; - my $d = $self->{passthru}; - $self->{passthru} = shift if @_; - return unless ref $d; - return wantarray ? @$d : $d; -} - -sub clone { - my $self = shift; - my %new = %{$self}; - return bless \%new,ref($self); -} - -=head2 get_part_names - - Title : get_part_names - Usage : @array = $a->get_part_names - Function: get list of sub-parts for this type of feature - Returns : an array - Args : none - Status : Internal - -This method is used internally to fetch the list of feature types that -form the components of the composite feature. Type names in the -format "method:source" are recognized, as are "method" and -Bio::DB::GFF::Typename objects as well. It checks instance variables -first, and if not defined calls the part_names() method. - -=cut - -sub get_part_names { - my $self = shift; - if ($self->{sub_parts}) { - return ref $self->{sub_parts} ? 
@{$self->{sub_parts}} : $self->{sub_parts}; - } else { - return $self->part_names; - } -} - -=head2 get_main_name - - Title : get_main_name - Usage : $string = $a->get_main_name - Function: get the "main" method type for this feature - Returns : a string - Args : none - Status : Internal - -This method is used internally to fetch the type of the "main part" of -the feature. It checks instance variables first, and if not defined -calls the main_name() method. - -=cut - -sub get_main_name { - my $self = shift; - return $self->{main_method} if defined $self->{main_method}; - return $self->main_name; -} - -=head2 get_method - - Title : get_method - Usage : $string = $a->get_method - Function: get the method type for the composite feature - Returns : a string - Args : none - Status : Internal - -This method is used internally to fetch the type of the method that -will be assigned to the composite feature once it is synthesized. - -=cut - -sub get_method { - my $self = shift; - return $self->{method} if defined $self->{method}; - return $self->method; -} - -1; - -=head1 BUGS - -None known yet. - -=head1 SEE ALSO - -L, -L, -L, -L, -L, -L, -L, -L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/alignment.pm b/lib/Bio/DB/GFF/Aggregator/alignment.pm deleted file mode 100644 index 439dfcf78..000000000 --- a/lib/Bio/DB/GFF/Aggregator/alignment.pm +++ /dev/null @@ -1,138 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::alignment -- Alignment aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['alignment'], - ); - - ----------------------------- - Aggregator method: alignment - Main method: (none) - Sub methods: nucleotide_match,EST_match,cDNA_match,expressed_sequence_match, - translated_nucleotide_match,protein_match,HSP - ----------------------------- - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::alignment is one of the default aggregators, -and was written to be compatible with the C elegans GFF files. It -aggregates raw "similarity" features into composite features of type -"alignment". A better name for this class might be -"gapped_alignment." - -This aggregator does not insist that there be a single top-level -feature that spans one end of the alignment to the other. As a -result, it can produce truncated alignments if the entire alignment is -not contained within the segment of interest. - -=cut - -package Bio::DB::GFF::Aggregator::alignment; - -use strict; - - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 aggregate - - Title : aggregate - Usage : $features = $a->aggregate($features,$factory) - Function: aggregate a feature list into composite features - Returns : an array reference containing modified features - Args : see L - Status : Public - -Because of the large number of similarity features, the aggregate() -method is overridden in order to perform some optimizations. 
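A usage sketch (the DSN and segment name below are placeholders): once the
"alignment" aggregator is loaded, the composite features it builds are
fetched from a segment like any other type, and the raw subalignments are
available through sub_SeqFeature():

  use Bio::DB::GFF;

  my $db = Bio::DB::GFF->new( -adaptor    => 'dbi:mysql',
                              -dsn        => 'dbi:mysql:elegans42',
                              -aggregator => ['alignment'],
                            );

  my $segment = $db->segment('CHROMOSOME_I');

  for my $aln ($segment->features('alignment')) {
      my @parts = $aln->sub_SeqFeature;   # the raw similarity features
      print $aln->group, ' spans ', $aln->start, '..', $aln->stop,
            ' in ', scalar(@parts), " parts\n";
  }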
- -=cut - -# we look for features of type Sequence and add them to a pseudotype transcript -sub aggregate { - my $self = shift; - my $features = shift; - my $factory = shift; - - my $matchsub = $self->match_sub($factory) or return; - my $passthru = $self->passthru_sub($factory); - my $method = $self->get_method; - - my (%alignments,%targets,@result); - - warn "running alignment aggregator" if $factory->debug; - for my $feature (@$features) { - - if ($matchsub->($feature)) { - - my $group = $feature->{group}; - my $source = $feature->source; - unless (exists $alignments{$group,$source}) { - my $type = Bio::DB::GFF::Typename->new($method,$source); - - my $f = $feature->clone; - # this is a violation of OO encapsulation, but need to do it this way - # to achieve desired performance - @{$f}{qw(type score phase)} = ($type,undef,undef); - - $alignments{$group,$source} = $f or next; - } - - my $main = $alignments{$group,$source}; - $main->add_subfeature($feature); - push @result,$feature if $passthru && $passthru->($feature); - } else { - push @result,$feature; - } - } - - warn "running aligner adjuster" if $factory->debug; - for my $alignment (values %alignments) { - $alignment->adjust_bounds; - $alignment->compound(1); - push @result,$alignment; - } - warn "aligner done" if $factory->debug; - @$features = @result; -} - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "alignment" - Args : none - Status : Public - -=cut - -sub method { 'alignment' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the full list of aggregated methods - Args : none - Status : Public - -=cut - -sub part_names { - my $self = shift; - return qw(nucleotide_match EST_match cDNA_match - expressed_sequence_match - translated_nucleotide_match - protein_match HSP); -} - -1; diff --git a/lib/Bio/DB/GFF/Aggregator/clone.pm b/lib/Bio/DB/GFF/Aggregator/clone.pm deleted file mode 100644 index edc6550f5..000000000 --- a/lib/Bio/DB/GFF/Aggregator/clone.pm +++ /dev/null @@ -1,160 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::clone -- Clone aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ---------------------------------------------------------------------------- - Aggregator method: clone - Main method: -none- - Sub methods: Clone_left_end Clone_right_end region:Genomic_canonical - ---------------------------------------------------------------------------- - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::clone is one of the default aggregators, and -was written to be compatible with the C elegans GFF files. It -aggregates raw "Clone_left_end", "Clone_right_end", and -"region:Genomic_canonical" features into composite features of type -"clone". - -=cut - -package Bio::DB::GFF::Aggregator::clone; - -use strict; - - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 aggregate - - Title : aggregate - Usage : $features = $a->aggregate($features,$factory) - Function: aggregate a feature list into composite features - Returns : an array reference containing modified features - Args : see L - Status : Public - -The WormBase GFF model is unusual in that clones aren't identified as -a single feature with start and stop positions, but as two features, a -"left end" and a "right end". 
One or both of these features may be -absent. In order to accommodate this, the aggregator will return undef -for the start and/or stop if one or both of the ends are missing. - -=cut - -#' - -# we look for features of type Sequence and add them to a pseudotype transcript -sub aggregate { - my $self = shift; - my $features = shift; - my $factory = shift; - - my $matchsub = $self->match_sub($factory) or return; - my $passthru = $self->passthru_sub($factory); - my $method = $self->get_method; - - my (%clones,%types,@result); - for my $feature (@$features) { - - if ($feature->group && $matchsub->($feature)) { - - if ($feature->method =~ /^region|Sequence$/ && $feature->source eq 'Genomic_canonical') { - $clones{$feature->group}{canonical} = $feature; - } elsif ($feature->method eq 'Clone_left_end') { - $clones{$feature->group}{left} = $feature; - } elsif ($feature->method eq 'Clone_right_end') { - $clones{$feature->group}{right} = $feature; - } - push @result,$feature if $passthru && $passthru->($feature); - } else { - push @result,$feature; - } - } - - for my $clone (keys %clones) { - my $canonical = $clones{$clone}{canonical} or next; - - # the genomic_canonical doesn't tell us where the clone starts and stops - # so don't assume it - my $duplicate = $canonical->clone; # make a duplicate of the feature - # munge the method and source fields - my $source = $duplicate->source; - my $type = $types{$method,$source} ||= Bio::DB::GFF::Typename->new($method,$source); - $duplicate->type($type); - - my ($start,$stop) = $duplicate->strand > 0 ? ('start','stop') : ('stop','start'); - @{$duplicate}{$start,$stop} =(undef,undef); - - $duplicate->{$start} = $clones{$clone}{left}{$start} if exists $clones{$clone}{left}; - $duplicate->{$stop} = $clones{$clone}{right}{$stop} if exists $clones{$clone}{right}; - $duplicate->method($self->method); - push @result,$duplicate; - } - - @$features = @result; -} - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "clone" - Args : none - Status : Public - -=cut - -sub method { 'clone' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list ("Clone_left_end", "Clone_right_end", "region:Genomic_canonical") - Args : none - Status : Public - -=cut - -sub part_names { - my $self = shift; - return qw(Clone_left_end Clone_right_end region:Genomic_canonical Sequence:Genomic_canonical); -} - -1; - -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
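The undef start/stop behaviour described under aggregate() above can be
handled defensively when consuming composite "clone" features; a sketch
(assuming $segment comes from a database opened with the "clone"
aggregator loaded):

  for my $clone ($segment->features('clone')) {
      my $start = defined $clone->start ? $clone->start : '?';
      my $stop  = defined $clone->stop  ? $clone->stop  : '?';
      print $clone->group, ": $start..$stop\n";
  }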
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/coding.pm b/lib/Bio/DB/GFF/Aggregator/coding.pm deleted file mode 100644 index 0e9e367bf..000000000 --- a/lib/Bio/DB/GFF/Aggregator/coding.pm +++ /dev/null @@ -1,102 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::coding -- The Coding Region Aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['coding'], - ); - - ------------------------------------------------------------------------ - Aggregator method: coding - Main method: mRNA - Sub methods: CDS - ------------------------------------------------------------------------ - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::coding aggregates "CDS" features into a -feature called "coding" and was written to be compatible with the -Sequence Ontology canonical gene. The CDS features are expected to -belong to a parent of type "mRNA," but the aggregator will work even -if this isn't the case. - -=cut - -package Bio::DB::GFF::Aggregator::coding; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "coding" - Args : none - Status : Public - -=cut - -sub method { 'coding' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list (CDS cds) - Args : none - Status : Public - -=cut - -sub part_names { - return qw(CDS cds); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "mRNA" - Args : none - Status : Public - -=cut - -sub main_name { - return 'mRNA'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/gene.pm b/lib/Bio/DB/GFF/Aggregator/gene.pm deleted file mode 100644 index 0213b2fc0..000000000 --- a/lib/Bio/DB/GFF/Aggregator/gene.pm +++ /dev/null @@ -1,108 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::gene -- Sequence Ontology Geene - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['gene'], - ); - - ------------------------------------------------------------------------ - Aggregator method: gene - Main method: mRNA - Sub methods: CDS exon five_prime_UTR three_prime_UTR transcription_start_site polyA_site - ------------------------------------------------------------------------ - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::gene is identical to so_transcript, but is -used in those cases where you would like the name of the aggregated -feature to be "gene" rather than "processed_transcript". It aggregates -raw "exon," "CDS", "five_prime_UTR", "three_prime_UTR", -"transcription_start_site" and "polyA_site" features into "mRNA" -features. The UTRs may also be named "untranslated_region," -"five_prime_untranslated_region," "three_prime_untranslated_region,", -"5'-UTR," and other synonyms. 
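The same grouping can also be obtained without installing a subclass, by
passing a generic aggregator object to Bio::DB::GFF->new(). A sketch (the
DSN is a placeholder and the sub-part list is abbreviated from the full
list given under part_names() below):

  use Bio::DB::GFF;
  use Bio::DB::GFF::Aggregator;

  my $gene_agg = Bio::DB::GFF::Aggregator->new(
                    -method      => 'gene',
                    -main_method => 'mRNA',
                    -sub_parts   => [qw(exon CDS five_prime_UTR three_prime_UTR
                                        transcription_start_site polyA_site)],
                 );

  my $db = Bio::DB::GFF->new( -adaptor    => 'dbi:mysql',
                              -dsn        => 'dbi:mysql:elegans42',
                              -aggregator => [$gene_agg],
                            );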
- -=cut - -package Bio::DB::GFF::Aggregator::gene; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "gene" - Args : none - Status : Public - -=cut - -sub method { 'gene' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list CDS 5'-UTR 3'-UTR transcription_start_site polyA_site - Args : none - Status : Public - -=cut - -sub part_names { - return qw(CDS transcription_start_site - polyA_site UTR five_prime_untranslated_region - three_prime_untranslated_region - five_prime_UTR three_prime_UTR exon); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "mRNA" - Args : none - Status : Public - -=cut - -sub main_name { - return 'mRNA'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2008 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/match.pm b/lib/Bio/DB/GFF/Aggregator/match.pm deleted file mode 100644 index ea61855b8..000000000 --- a/lib/Bio/DB/GFF/Aggregator/match.pm +++ /dev/null @@ -1,105 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::match -- Match aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['match'], - ); - - ------------------------------------------------- - Aggregator method: match - Main method: match - Sub methods: similarity HSP - ------------------------------------------------- - -=head1 DESCRIPTION - -This aggregator is used for Sequence Ontology-compatible gapped -alignments, in which there is a single top-level alignment called -"match" and a series of subalignments called either "similarity" or -"HSP". - -Also see the "alignment" aggregator. - -=cut - -package Bio::DB::GFF::Aggregator::match; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "match" - Args : none - Status : Public - -=cut - -sub method { 'match' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list "similarity", "HSP" - Args : none - Status : Public - -=cut - -sub part_names { - return qw(similarity HSP); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "match" - Args : none - Status : Public - -=cut - -sub main_name { - return 'match'; -} - -sub require_whole_object {1} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/none.pm b/lib/Bio/DB/GFF/Aggregator/none.pm deleted file mode 100644 index a11dcfab3..000000000 --- a/lib/Bio/DB/GFF/Aggregator/none.pm +++ /dev/null @@ -1,43 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::none -- No aggregation - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => 'none' - ); - - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::none can be used to indicate that you do not -want any aggregation performed. It is equivalent to providing undef -to the B<-aggregator> argument. It overrides disaggregate() and -aggregate() so that they do exactly nothing. - -=cut - -package Bio::DB::GFF::Aggregator::none; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -sub disaggregate { - my $self = shift; - my $types = shift; - # no change -} - -sub aggregate { - my $self = shift; - my $features = shift; - return; # no change -} - -1; diff --git a/lib/Bio/DB/GFF/Aggregator/orf.pm b/lib/Bio/DB/GFF/Aggregator/orf.pm deleted file mode 100644 index d79274320..000000000 --- a/lib/Bio/DB/GFF/Aggregator/orf.pm +++ /dev/null @@ -1,91 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::orf -- An aggregator for orf regions - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['orf','clone'], - ); - - --------------------------- - Aggregator method: orf - Main method: -none- - Sub methods: ORF - --------------------------- - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::orf was written to work with the "cds" -glyph. GFF files. It aggregates raw "ORF" features into "coding" -features. This is basically identical to the "coding" aggregator, -except that it looks for features of type "ORF" rather than "cds". - -=cut - -package Bio::DB::GFF::Aggregator::orf; - -use strict; -use Bio::DB::GFF::Aggregator; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "orf" - Args : none - Status : Public - -=cut - -sub method { 'orf' } - -# sub require_whole_object { 1; } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list "CDS" - Args : none - Status : Public - -=cut - -sub part_names { - return qw(ORF); -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L, -L, L - - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/processed_transcript.pm b/lib/Bio/DB/GFF/Aggregator/processed_transcript.pm deleted file mode 100644 index 751d0518a..000000000 --- a/lib/Bio/DB/GFF/Aggregator/processed_transcript.pm +++ /dev/null @@ -1,107 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::processed_transcript -- Sequence Ontology Transcript - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['processed_transcript'], - ); - - ------------------------------------------------------------------------ - Aggregator method: processed_transcript - Main method: mRNA - Sub methods: CDS exon five_prime_UTR three_prime_UTR transcription_start_site polyA_site 5'-UTR 3'-UTR - ------------------------------------------------------------------------ - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::processed_transcript is one of the default -aggregators, and was written to be compatible with the Sequence -Ontology canonical gene. It aggregates raw "exon," "CDS", -"five_prime_UTR", "three_prime_UTR", "transcription_start_site" and -"polyA_site" features into "mRNA" features. The UTRs may also be -named "untranslated_region," "five_prime_untranslated_region," -"three_prime_untranslated_region,", "5'-UTR," and other synonyms. - -=cut - -package Bio::DB::GFF::Aggregator::processed_transcript; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "processed_transcript" - Args : none - Status : Public - -=cut - -sub method { 'processed_transcript' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list CDS 5'-UTR 3'-UTR transcription_start_site polyA_site - Args : none - Status : Public - -=cut - -sub part_names { - return qw(CDS 5'-UTR 3'-UTR transcription_start_site - polyA_site UTR five_prime_untranslated_region - three_prime_untranslated_region - five_prime_UTR three_prime_UTR exon); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "mRNA" - Args : none - Status : Public - -=cut - -sub main_name { - return 'mRNA'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/so_transcript.pm b/lib/Bio/DB/GFF/Aggregator/so_transcript.pm deleted file mode 100644 index f967efb5f..000000000 --- a/lib/Bio/DB/GFF/Aggregator/so_transcript.pm +++ /dev/null @@ -1,110 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::so_transcript -- Sequence Ontology Transcript - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['so_transcript'], - ); - - ------------------------------------------------------------------------ - Aggregator method: processed_transcript - Main method: mRNA - Sub methods: CDS exon five_prime_UTR three_prime_UTR transcription_start_site polyA_site 5'-UTR 3'-UTR - ------------------------------------------------------------------------ - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::so_transcript is identical to the -processed_transcript aggregator, which was designed to be compatible -with the Sequence Ontology canonical gene. It aggregates raw "exon," -"CDS", "five_prime_UTR", "three_prime_UTR", "transcription_start_site" -and "polyA_site" features into "mRNA" features. The UTRs may also be -named "untranslated_region," "five_prime_untranslated_region," -"three_prime_untranslated_region,", "5'-UTR," and other synonyms. - -The processed_transcript aggregator is loaded by default, so this is -only needed for backward compatibility. - -=cut - -package Bio::DB::GFF::Aggregator::so_transcript; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "processed_transcript" - Args : none - Status : Public - -=cut - -sub method { 'so_transcript' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list CDS 5'-UTR 3'-UTR transcription_start_site polyA_site - Args : none - Status : Public - -=cut - -sub part_names { - return qw(CDS 5'-UTR 3'-UTR transcription_start_site - polyA_site UTR five_prime_untranslated_region - three_prime_untranslated_region - five_prime_UTR three_prime_UTR exon); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "mRNA" - Args : none - Status : Public - -=cut - -sub main_name { - return 'mRNA'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/transcript.pm b/lib/Bio/DB/GFF/Aggregator/transcript.pm deleted file mode 100644 index b689d62f2..000000000 --- a/lib/Bio/DB/GFF/Aggregator/transcript.pm +++ /dev/null @@ -1,114 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::transcript -- Transcript aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: transcript - Main method: transcript - Sub methods: exon CDS 5'UTR 3'UTR TSS PolyA - ------------------------------------------------- - -=head1 DESCRIPTION - -Bio::DB::GFF::Aggregator::transcript is one of the default -aggregators, and was written to be compatible with the C elegans GFF -files. It aggregates raw ""exon", "CDS", "5'UTR", "3'UTR", "polyA" -and "TSS" features into "transcript" features. For compatibility with -the idiosyncrasies of the Sanger GFF format, it expects that the full -range of the transcript is contained in a main feature of type -"Transcript" (notice the capital "T"). - -Internally this module is very simple. To override it with one that -recognizes a main feature named "gene", simply follow this -template: - - my $db = Bio::DB::GFF->new(...etc...) - my $aggregator = Bio::DB::GFF::Aggregator->new(-method => 'transcript', - -main_method => 'gene', - -sub_parts => ['exon','CDS']); - $db->add_aggregator($aggregator); - -=cut - -package Bio::DB::GFF::Aggregator::transcript; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "transcript" - Args : none - Status : Public - -=cut - -sub method { 'transcript' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : the list "intron", "exon" and "CDS" - Args : none - Status : Public - -=cut - -sub part_names { - return qw(exon CDS 5'UTR 3'UTR TSS PolyA); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_acembly.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_acembly.pm deleted file mode 100644 index 30ab6fa10..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_acembly.pm +++ /dev/null @@ -1,99 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_acembly -- UCSC acembly aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: transcript - Main method: transcript - Sub methods: exon CDS 5'UTR 3'UTR TSS PolyA - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_acembly; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "acembly" - Args : none - Status : Public - -=cut - -sub method { 'acembly' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:acembly" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:acembly'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm deleted file mode 100644 index 16ededc43..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_ensgene.pm +++ /dev/null @@ -1,99 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_ensgene -- UCSC ensGene aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: ensgene - Main method: transcript - Sub methods: ensGene - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_ensgene; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "ensgene" - Args : none - Status : Public - -=cut - -sub method { 'ensgene' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:ensGene" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:ensGene'; -} - -1; -__END__ - -=head1 BUGS - -None reported. 
- - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_genscan.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_genscan.pm deleted file mode 100644 index f0bec573c..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_genscan.pm +++ /dev/null @@ -1,98 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_genscan -- UCSC genscan aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: genscan - Main method: transcript - Sub methods: genscan - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_genscan; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "genscan" - Args : none - Status : Public - -=cut - -sub method { 'genscan' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:genscan" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:genscan'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_refgene.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_refgene.pm deleted file mode 100644 index 06f656312..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_refgene.pm +++ /dev/null @@ -1,98 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_refgene -- UCSC refGene aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: refgene - Main method: transcript - Sub methods: refGene - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_refgene; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "refgene" - Args : none - Status : Public - -=cut - -sub method { 'refgene' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:refGene" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:refGene'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm deleted file mode 100644 index a81148b27..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_sanger22.pm +++ /dev/null @@ -1,99 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_sanger22 -- UCSC sanger22 aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: sanger22 - Main method: transcript - Sub methods: sanger22 - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_sanger22; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "sanger22" - Args : none - Status : Public - -=cut - -sub method { 'sanger22' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:sanger22" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:sanger22'; -} - -1; -__END__ - -=head1 BUGS - -None reported. 
- - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm deleted file mode 100644 index 3b701247e..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_sanger22pseudo.pm +++ /dev/null @@ -1,99 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_sanger22pseudo -- UCSC sanger22pseudo aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: sanger22pseudo - Main method: transcript - Sub methods: sanger22pseudo - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_sanger22pseudo; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "sanger22pseudo" - Args : none - Status : Public - -=cut - -sub method { 'sanger22pseudo' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:sanger22pseudo" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:sanger22pseudo'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_softberry.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_softberry.pm deleted file mode 100644 index 8050f5d39..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_softberry.pm +++ /dev/null @@ -1,98 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_softberry -- UCSC softberry aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: softberry - Main method: transcript - Sub methods: softberryGene - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_softberry; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "softberry" - Args : none - Status : Public - -=cut - -sub method { 'softberry' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:softberryGene" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:softberryGene'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm deleted file mode 100644 index fc326e99d..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_twinscan.pm +++ /dev/null @@ -1,98 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_twinscan -- UCSC twinscan aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: twinscan - Main method: transcript - Sub methods: twinscan - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_twinscan; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "twinscan" - Args : none - Status : Public - -=cut - -sub method { 'twinscan' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript:twinscan" - Args : none - Status : Public - -=cut - -sub main_name { - return 'transcript:twinscan'; -} - -1; -__END__ - -=head1 BUGS - -None reported. 
- - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Aggregator/ucsc_unigene.pm b/lib/Bio/DB/GFF/Aggregator/ucsc_unigene.pm deleted file mode 100644 index 8002aeaa9..000000000 --- a/lib/Bio/DB/GFF/Aggregator/ucsc_unigene.pm +++ /dev/null @@ -1,100 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Aggregator::ucsc_unigene -- UCSC UniGene aggregator - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - # Open the sequence database - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42', - -aggregator => ['transcript','clone'], - ); - - ------------------------------------------------- - Aggregator method: unigene - Main method: transcript - Sub methods: unigene_2 - ------------------------------------------------- - -=head1 DESCRIPTION - -L - -=cut - -package Bio::DB::GFF::Aggregator::ucsc_unigene; - -use strict; - -use base qw(Bio::DB::GFF::Aggregator); - - -=head2 method - - Title : method - Usage : $aggregator->method - Function: return the method for the composite object - Returns : the string "unigene" - Args : none - Status : Public - -=cut - -sub method { 'unigene' } - -=head2 part_names - - Title : part_names - Usage : $aggregator->part_names - Function: return the methods for the sub-parts - Returns : empty list - Args : none - Status : Public - -=cut - -sub part_names { - return (); -} - -=head2 main_name - - Title : main_name - Usage : $aggregator->main_name - Function: return the method for the main component - Returns : the string "transcript" - Args : none - Status : Public - -=cut - -sub main_name { -#transcript - return 'transcript:uniGene_2'; -} - -1; -__END__ - -=head1 BUGS - -None reported. - - -=head1 SEE ALSO - -L, L - -=head1 AUTHOR - -Allen Day Eallenday@ucla.eduE. - -Copyright (c) 2002 Allen Day, University of California, Los Angeles. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Featname.pm b/lib/Bio/DB/GFF/Featname.pm deleted file mode 100644 index a0d64da8b..000000000 --- a/lib/Bio/DB/GFF/Featname.pm +++ /dev/null @@ -1,153 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Featname -- The name of a feature - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', - -dsn => 'dbi:mysql:elegans42'); - - my $feature = Bio::DB::GFF::Featname->new(Locus => 'unc-19'); - my $segment = $db->segment($feature); - -=head1 DESCRIPTION - -Bio::DB::GFF::Featname is the name of a feature. It contains two -fields: name and class. It is typically used by the Bio::DB::GFF -module to denote a group, and is accepted by -Bio::DB::Relsegment-Enew() and Bio::DB::GFF-Esegment() as a -replacement for the -name and -class arguments. 
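A brief usage sketch (assuming $db is an already-opened Bio::DB::GFF
handle, as in the SYNOPSIS above):

  my $feature = Bio::DB::GFF::Featname->new(Locus => 'unc-19');

  print $feature->class, "\n";  # "Locus"
  print $feature->name,  "\n";  # "unc-19"
  print $feature->id,    "\n";  # "Locus:unc-19"
  print "$feature\n";           # stringifies via asString() to "unc-19"

  # accepted wherever -name/-class arguments are expected
  my $segment = $db->segment($feature);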
- -=head1 METHODS - -=cut - -package Bio::DB::GFF::Featname; -use strict; -use base qw(Bio::Root::RootI); - -use overload - '""' => 'asString', - fallback => 1; - -=head2 new - - Title : new - Usage : $name = Bio::DB::GFF::Featname->new($class,$name) - Function: create a new Bio::DB::GFF::Featname object - Returns : a new Bio::DB::GFF::Featname object - Args : class and ID - Status : Public - -=cut - -sub new { - # use a blessed array for speed - my $pack = shift; - bless [@_],$pack; # class,name -} - -sub _cleanup_methods { return; } - -=head2 id - - Title : id - Usage : $id = $name->id - Function: return a unique ID for the combination of class and name - Returns : a string - Args : none - Status : Public - -This method returns a unique combination of the name and class in the -form "class:name". Coincidentally, this is the same format used -by AceDB. - -=cut - -sub id { - my $self = shift; - return join ':',@$self; -} - -=head2 name - - Title : name - Usage : $name = $name->name - Function: return the name of the Featname - Returns : a string - Args : none - Status : Public - -=cut - -sub name { shift->[1] } - -=head2 class - - Title : class - Usage : $class = $name->class - Function: return the name of the Featname - Returns : a string - Args : none - Status : Public - -=cut - -sub class { shift->[0] } - -=head2 asString - - Title : asString - Usage : $string = $name->asString - Function: same as name() - Returns : a string - Args : none - Status : Public - -This method is used to overload the "" operator. It is equivalent to -calling name(). - -=cut - -sub asString { shift->name } - -=head2 clone - - Title : clone - Usage : $new_clone = $type->clone; - Function: clone this object - Returns : a new Bio::DB::GFF::Featname object - Args : none - Status : Public - -This method creates an exact copy of the object. - -=cut - -sub clone { - my $self = shift; - return bless [@$self],ref $self; -} - -=head1 BUGS - -This module is still under development. - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -1; diff --git a/lib/Bio/DB/GFF/Feature.pm b/lib/Bio/DB/GFF/Feature.pm deleted file mode 100644 index 334184859..000000000 --- a/lib/Bio/DB/GFF/Feature.pm +++ /dev/null @@ -1,1374 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Feature -- A relative segment identified by a feature type - -=head1 SYNOPSIS - -See L. - -=head1 DESCRIPTION - -Bio::DB::GFF::Feature is a stretch of sequence that corresponding to a -single annotation in a GFF database. It inherits from -Bio::DB::GFF::RelSegment, and so has all the support for relative -addressing of this class and its ancestors. It also inherits from -Bio::SeqFeatureI and so has the familiar start(), stop(), -primary_tag() and location() methods (it implements Bio::LocationI -too, if needed). - -Bio::DB::GFF::Feature adds new methods to retrieve the annotation -type, group, and other GFF attributes. Annotation types are -represented by Bio::DB::GFF::Typename objects, a simple class that has -two methods called method() and source(). These correspond to the -method and source fields of a GFF file. - -Annotation groups serve the dual purpose of giving the annotation a -human-readable name, and providing higher-order groupings of -subfeatures into features. The groups returned by this module are -objects of the Bio::DB::GFF::Featname class. 
- -Bio::DB::GFF::Feature inherits from and implements the abstract -methods of Bio::SeqFeatureI, allowing it to interoperate with other -Bioperl modules. - -Generally, you will not create or manipulate Bio::DB::GFF::Feature -objects directly, but use those that are returned by the -Bio::DB::GFF::RelSegment-Efeatures() method. - -=head2 Important note about start() vs end() - -If features are derived from segments that use relative addressing -(which is the default), then start() will be less than end() if the -feature is on the opposite strand from the reference sequence. This -breaks Bio::SeqI compliance, but is necessary to avoid having the real -genomic locations designated by start() and end() swap places when -changing reference points. - -To avoid this behavior, call $segment-Eabsolute(1) before fetching -features from it. This will force everything into absolute -coordinates. - -For example: - - my $segment = $db->segment('CHROMOSOME_I'); - $segment->absolute(1); - my @features = $segment->features('transcript'); - -=head1 API - -The remainder of this document describes the public and private -methods implemented by this module. - -=cut - -package Bio::DB::GFF::Feature; - -use strict; - -use Bio::DB::GFF::Util::Rearrange; -use Bio::DB::GFF::Featname; -use Bio::DB::GFF::Typename; -use Bio::DB::GFF::Homol; -use Bio::LocationI; -use Data::Dumper; - -use vars qw($AUTOLOAD); -use base qw(Bio::DB::GFF::RelSegment Bio::SeqFeatureI Bio::Root::Root); - -#' - -*segments = *get_SeqFeatures = \&sub_SeqFeature; - -my %CONSTANT_TAGS = (method=>1, source=>1, score=>1, phase=>1, notes=>1, id=>1, group=>1); - -=head2 new_from_parent - - Title : new_from_parent - Usage : $f = Bio::DB::GFF::Feature->new_from_parent(@args); - Function: create a new feature object - Returns : new Bio::DB::GFF::Feature object - Args : see below - Status : Internal - -This method is called by Bio::DB::GFF to create a new feature using -information obtained from the GFF database. It is one of two similar -constructors. This one is called when the feature is generated from a -RelSegment object, and should inherit the coordinate system of that -object. - -The 13 arguments are positional (sorry): - - $parent a Bio::DB::GFF::RelSegment object (or descendent) - $start start of this feature - $stop stop of this feature - $method this feature's GFF method - $source this feature's GFF source - $score this feature's score - $fstrand this feature's strand (relative to the source - sequence, which has its own strandedness!) - $phase this feature's phase - $group this feature's group (a Bio::DB::GFF::Featname object) - $db_id this feature's internal database ID - $group_id this feature's internal group database ID - $tstart this feature's target start - $tstop this feature's target stop - -tstart and tstop are not used for anything at the moment, since the -information is embedded in the group object. - -=cut - -# this is called for a feature that is attached to a parent sequence, -# in which case it inherits its coordinate reference system and strandedness -sub new_from_parent { - my $package = shift; - my ($parent, - $start,$stop, - $method,$source,$score, - $fstrand,$phase, - $group,$db_id,$group_id, - $tstart,$tstop) = @_; - - ($start,$stop) = ($stop,$start) if defined($fstrand) and $fstrand eq '-'; - my $class = $group ? 
$group->class : $parent->class; - - my $self = bless { - factory => $parent->{factory}, - sourceseq => $parent->{sourceseq}, - strand => $parent->{strand}, - ref => $parent->{ref}, - refstart => $parent->{refstart}, - refstrand => $parent->{refstrand}, - absolute => $parent->{absolute}, - start => $start, - stop => $stop, - type => Bio::DB::GFF::Typename->new($method,$source), - fstrand => $fstrand, - score => $score, - phase => $phase, - group => $group, - db_id => $db_id, - group_id => $group_id, - class => $class, - },$package; - $self; -} - -=head2 new - - Title : new - Usage : $f = Bio::DB::GFF::Feature->new(@args); - Function: create a new feature object - Returns : new Bio::DB::GFF::Feature object - Args : see below - Status : Internal - -This method is called by Bio::DB::GFF to create a new feature using -information obtained from the GFF database. It is one of two similar -constructors. This one is called when the feature is generated -without reference to a RelSegment object, and should therefore use its -default coordinate system (relative to itself). - -The 11 arguments are positional: - - $factory a Bio::DB::GFF adaptor object (or descendent) - $srcseq the source sequence - $start start of this feature - $stop stop of this feature - $method this feature's GFF method - $source this feature's GFF source - $score this feature's score - $fstrand this feature's strand (relative to the source - sequence, which has its own strandedness!) - $phase this feature's phase - $group this feature's group - $db_id this feature's internal database ID - -=cut - -# 'This is called when creating a feature from scratch. It does not have -# an inherited coordinate system. - -sub new { - my $package = shift; - my ($factory, - $srcseq, - $start,$stop, - $method,$source, - $score,$fstrand,$phase, - $group,$db_id,$group_id, - $tstart,$tstop) = @_; - - my $self = bless { },$package; - ($start,$stop) = ($stop,$start) if defined($fstrand) and $fstrand eq '-'; - - my $class = $group ? $group->class : 'Sequence'; - - @{$self}{qw(factory sourceseq start stop strand class)} = - ($factory,$srcseq,$start,$stop,$fstrand,$class); - - # if the target start and stop are defined, then we use this information to create - # the reference sequence - # THIS SHOULD BE BUILT INTO RELSEGMENT - if (0 && $tstart ne '' && $tstop ne '') { - if ($tstart < $tstop) { - @{$self}{qw(ref refstart refstrand)} = ($group,$start - $tstart + 1,'+'); - } else { - @{$self}{'start','stop'} = @{$self}{'stop','start'}; - @{$self}{qw(ref refstart refstrand)} = ($group,$tstop + $stop - 1,'-'); - } - - } else { - @{$self}{qw(ref refstart refstrand)} = ($srcseq,1,'+'); - } - - @{$self}{qw(type fstrand score phase group db_id group_id absolute)} = - (Bio::DB::GFF::Typename->new($method,$source),$fstrand,$score,$phase, - $group,$db_id,$group_id,$factory->{absolute}); - - $self; -} - -=head2 type - - Title : type - Usage : $type = $f->type([$newtype]) - Function: get or set the feature type - Returns : a Bio::DB::GFF::Typename object - Args : a new Typename object (optional) - Status : Public - -This method gets or sets the type of the feature. The type is a -Bio::DB::GFF::Typename object, which encapsulates the feature method -and source. - -The method() and source() methods described next provide shortcuts to -the individual fields of the type. 
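A short sketch of these accessors, assuming $feature was returned by a features() call:

    my $type = $feature->type;     # a Bio::DB::GFF::Typename
    print $type->method, "\n";     # e.g. "exon"
    print $type->source, "\n";     # e.g. "curated"

    # the shortcuts below delegate to the type object
    print $feature->method, ':', $feature->source, "\n";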
- -=cut - -sub type { - my $self = shift; - my $d = $self->{type}; - $self->{type} = shift if @_; - $d; -} - -=head2 method - - Title : method - Usage : $method = $f->method([$newmethod]) - Function: get or set the feature method - Returns : a string - Args : a new method (optional) - Status : Public - -This method gets or sets the feature method. It is a convenience -feature that delegates the task to the feature's type object. - -=cut - -sub method { - my $self = shift; - my $d = $self->{type}->method; - $self->{type}->method(shift) if @_; - $d; -} - -=head2 source - - Title : source - Usage : $source = $f->source([$newsource]) - Function: get or set the feature source - Returns : a string - Args : a new source (optional) - Status : Public - -This method gets or sets the feature source. It is a convenience -feature that delegates the task to the feature's type object. - -=cut - -sub source { - my $self = shift; - my $d = $self->{type}->source; - $self->{type}->source(shift) if @_; - $d; -} - -=head2 score - - Title : score - Usage : $score = $f->score([$newscore]) - Function: get or set the feature score - Returns : a string - Args : a new score (optional) - Status : Public - -This method gets or sets the feature score. - -=cut - -sub score { - my $self = shift; - my $d = $self->{score}; - $self->{score} = shift if @_; - $d; -} - -=head2 phase - - Title : phase - Usage : $phase = $f->phase([$phase]) - Function: get or set the feature phase - Returns : a string - Args : a new phase (optional) - Status : Public - -This method gets or sets the feature phase. - -=cut - -sub phase { - my $self = shift; - my $d = $self->{phase}; - $self->{phase} = shift if @_; - $d; -} - -=head2 strand - - Title : strand - Usage : $strand = $f->strand - Function: get the feature strand - Returns : +1, 0 -1 - Args : none - Status : Public - -Returns the strand of the feature. Unlike the other methods, the -strand cannot be changed once the object is created (due to coordinate -considerations). - -=cut - -sub strand { - my $self = shift; - return 0 unless $self->{fstrand}; - if ($self->absolute) { - return Bio::DB::GFF::RelSegment::_to_strand($self->{fstrand}); - } - return $self->SUPER::strand || Bio::DB::GFF::RelSegment::_to_strand($self->{fstrand}); -} - -=head2 group - - Title : group - Usage : $group = $f->group([$new_group]) - Function: get or set the feature group - Returns : a Bio::DB::GFF::Featname object - Args : a new group (optional) - Status : Public - -This method gets or sets the feature group. The group is a -Bio::DB::GFF::Featname object, which has an ID and a class. - -=cut - -sub group { - my $self = shift; - my $d = $self->{group}; - $self->{group} = shift if @_; - $d; -} - -=head2 display_id - - Title : display_id - Usage : $display_id = $f->display_id([$display_id]) - Function: get or set the feature display id - Returns : a Bio::DB::GFF::Featname object - Args : a new display_id (optional) - Status : Public - -This method is an alias for group(). It is provided for -Bio::SeqFeatureI compatibility. - -=cut - -=head2 info - - Title : info - Usage : $info = $f->info([$new_info]) - Function: get or set the feature group - Returns : a Bio::DB::GFF::Featname object - Args : a new group (optional) - Status : Public - -This method is an alias for group(). It is provided for AcePerl -compatibility. 
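These interchangeable accessors in use, again assuming $feature came from a features() call:

    my $g = $feature->group;          # canonical accessor, returns a Bio::DB::GFF::Featname
    # info(), display_id() and display_name() are aliases for group()
    print $feature->info,       "\n";
    print $feature->display_id, "\n";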
- -=cut - -*info = \&group; -*display_id = \&group; -*display_name = \&group; - -=head2 target - - Title : target - Usage : $target = $f->target([$new_target]) - Function: get or set the feature target - Returns : a Bio::DB::GFF::Homol object - Args : a new group (optional) - Status : Public - -This method works like group(), but only returns the group if it -implements the start() method. This is typical for -similarity/assembly features, where the target encodes the start and -stop location of the alignment. - -The returned object is of type Bio::DB::GFF::Homol, which is a -subclass of Bio::DB::GFF::Segment. - -=cut - - -sub target { - my $self = shift; - my $group = $self->group or return; - return unless $group->can('start'); - $group; -} - -=head2 flatten_target - - Title : flatten_target - Usage : $target = $f->flatten_target($f->target) - Function: flatten a target object - Returns : a string (GFF2), an array [GFF2.5] or an array ref [GFF3] - Args : a target object (required), GFF version (optional) - Status : Public - -This method flattens a target object into text for -GFF dumping. If a second argument is provided, version-specific -vocabulary is used for the flattened target. - -=cut - -sub flatten_target { - my $self = shift; - my $t = shift || return; - my $v = shift; - - return 0 unless $t->can('start'); - my $class = $t->class; - my $name = $t->name; - my $start = $t->start; - my $stop = $t->stop; - - $v ||=2; - if ( $v == 2.5 ) { - - print STDERR qq(Target "$class:$name"), "tstart $start", "tstop $stop\n"; - return (qq(Target "$class:$name"), "tstart $start", "tstop $stop"); - } - elsif ( $v == 3 ) { - return [Target=>"$name $start $stop"]; - } - else { - return qq(Target "$class:$name" $start $stop); - } -} - -# override parent a smidgeon so that setting the ref for top-level feature -# sets ref for all subfeatures -sub refseq { - my $self = shift; - my $result = $self->SUPER::refseq(@_); - if (@_) { - my $newref = $self->SUPER::refseq; - for my $sub ($self->get_SeqFeatures) { - $sub->refseq(@_); - } - } - $result; -} - - -=head2 hit - - Title : hit - Usage : $hit = $f->hit([$new_hit]) - Function: get or set the feature hit - Returns : a Bio::DB::GFF::Featname object - Args : a new group (optional) - Status : Public - -This is the same as target(), for compatibility with -Bio::SeqFeature::SimilarityPair. - -=cut - -*hit = \⌖ - -=head2 id - - Title : id - Usage : $id = $f->id - Function: get the feature ID - Returns : a database identifier - Args : none - Status : Public - -This method retrieves the database identifier for the feature. It -cannot be changed. - -=cut - -sub id { shift->{db_id} } -sub primary_id { shift->{db_id} } - -=head2 group_id - - Title : group_id - Usage : $id = $f->group_id - Function: get the feature ID - Returns : a database identifier - Args : none - Status : Public - -This method retrieves the database group identifier for the feature. -It cannot be changed. Often the group identifier is more useful than -the feature identifier, since it is used to refer to a complex object -containing subparts. - -=cut - -sub group_id { shift->{group_id} } - -=head2 clone - - Title : clone - Usage : $feature = $f->clone - Function: make a copy of the feature - Returns : a new Bio::DB::GFF::Feature object - Args : none - Status : Public - -This method returns a copy of the feature. - -=cut - -sub clone { - my $self = shift; - my $clone = $self->SUPER::clone; - - if (ref(my $t = $clone->type)) { - my $type = $t->can('clone') ? 
$t->clone : bless {%$t},ref $t; - $clone->type($type); - } - - if (ref(my $g = $clone->group)) { - my $group = $g->can('clone') ? $g->clone : bless {%$g},ref $g; - $clone->group($group); - } - - if (my $merged = $self->{merged_segs}) { - $clone->{merged_segs} = { %$merged }; - } - - $clone; -} - -=head2 compound - - Title : compound - Usage : $flag = $f->compound([$newflag]) - Function: get or set the compound flag - Returns : a boolean - Args : a new flag (optional) - Status : Public - -This method gets or sets a flag indicated that the feature is not a -primary one from the database, but the result of aggregation. - -=cut - -sub compound { - my $self = shift; - my $d = $self->{compound}; - $self->{compound} = shift if @_; - $d; -} - -=head2 sub_SeqFeature - - Title : sub_SeqFeature - Usage : @feat = $feature->sub_SeqFeature([$method]) - Function: get subfeatures - Returns : a list of Bio::DB::GFF::Feature objects - Args : a feature method (optional) - Status : Public - -This method returns a list of any subfeatures that belong to the main -feature. For those features that contain heterogeneous subfeatures, -you can retrieve a subset of the subfeatures by providing a method -name to filter on. - -This method may also be called as segments() or get_SeqFeatures(). - -=cut - -sub sub_SeqFeature { - my $self = shift; - my $type = shift; - my $subfeat = $self->{subfeatures} or return; - $self->sort_features; - my @a; - if ($type) { - my $features = $subfeat->{lc $type} or return; - @a = @{$features}; - } else { - @a = map {@{$_}} values %{$subfeat}; - } - return @a; -} - -=head2 add_subfeature - - Title : add_subfeature - Usage : $feature->add_subfeature($feature) - Function: add a subfeature to the feature - Returns : nothing - Args : a Bio::DB::GFF::Feature object - Status : Public - -This method adds a new subfeature to the object. It is used -internally by aggregators, but is available for public use as well. - -=cut - -sub add_subfeature { - my $self = shift; - my $feature = shift; - my $type = $feature->method; - my $subfeat = $self->{subfeatures}{lc $type} ||= []; - push @{$subfeat},$feature; -} - -=head2 attach_seq - - Title : attach_seq - Usage : $sf->attach_seq($seq) - Function: Attaches a Bio::Seq object to this feature. 
This - Bio::Seq object is for the *entire* sequence: ie - from 1 to 10000 - Example : - Returns : TRUE on success - Args : a Bio::PrimarySeqI compliant object - -=cut - -sub attach_seq { } - - -=head2 location - - Title : location - Usage : my $location = $seqfeature->location() - Function: returns a location object suitable for identifying location - of feature on sequence or parent feature - Returns : Bio::LocationI object - Args : none - -=cut - -sub location { - my $self = shift; - require Bio::Location::Split unless Bio::Location::Split->can('new'); - require Bio::Location::Simple unless Bio::Location::Simple->can('new'); - - my $location; - if (my @segments = $self->segments) { - $location = Bio::Location::Split->new(-seq_id => $self->seq_id); - foreach (@segments) { - $location->add_sub_Location($_->location); - } - } else { - $location = Bio::Location::Simple->new(-start => $self->start, - -end => $self->stop, - -strand => $self->strand, - -seq_id => $self->seq_id); - } - $location; -} - -=head2 entire_seq - - Title : entire_seq - Usage : $whole_seq = $sf->entire_seq() - Function: gives the entire sequence that this seqfeature is attached to - Example : - Returns : a Bio::PrimarySeqI compliant object, or undef if there is no - sequence attached - Args : none - - -=cut - -sub entire_seq { - my $self = shift; - $self->factory->segment($self->sourceseq); -} - -=head2 merged_segments - - Title : merged_segments - Usage : @segs = $feature->merged_segments([$method]) - Function: get merged subfeatures - Returns : a list of Bio::DB::GFF::Feature objects - Args : a feature method (optional) - Status : Public - -This method acts like sub_SeqFeature, except that it merges -overlapping segments of the same time into contiguous features. For -those features that contain heterogeneous subfeatures, you can -retrieve a subset of the subfeatures by providing a method name to -filter on. - -A side-effect of this method is that the features are returned in -sorted order by their start tposition. 
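A usage sketch, assuming $feature is an aggregated feature (for example a gapped alignment) returned by a features() call; the 'similarity' filter is illustrative:

    for my $part ( $feature->merged_segments('similarity') ) {
        print $part->start, '..', $part->stop, ' ', $part->method, "\n";
    }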
- -=cut - -#' - -sub merged_segments { - my $self = shift; - my $type = shift; - $type ||= ''; # prevent uninitialized variable warnings - - my $truename = overload::StrVal($self); - - return @{$self->{merged_segs}{$type}} if exists $self->{merged_segs}{$type}; - my @segs = map { $_->[0] } - sort { $a->[1] <=> $b->[1] || - $a->[2] cmp $b->[2] } - map { [$_, $_->start, $_->type] } $self->sub_SeqFeature($type); - - # attempt to merge overlapping segments - my @merged = (); - for my $s (@segs) { - my $previous = $merged[-1] if @merged; - my ($pscore,$score) = (eval{$previous->score}||0,eval{$s->score}||0); - if (defined($previous) - && $previous->stop+1 >= $s->start - && $pscore == $score - && $previous->method eq $s->method - ) { - if ($self->absolute && $self->strand < 0) { - $previous->{start} = $s->{start}; - } else { - $previous->{stop} = $s->{stop}; - } - # fix up the target too - my $g = $previous->{group}; - if ( ref($g) && $g->isa('Bio::DB::GFF::Homol')) { - my $cg = $s->{group}; - $g->{stop} = $cg->{stop}; - } - } - elsif (defined($previous) - && $previous->start == $s->start - && $previous->stop == $s->stop - && $previous->method eq $s->method - ) { - next; - } - - else { - my $copy = $s->clone; - push @merged,$copy; - } - } - $self->{merged_segs}{$type} = \@merged; - @merged; -} - -=head2 sub_types - - Title : sub_types - Usage : @methods = $feature->sub_types - Function: get methods of all sub-seqfeatures - Returns : a list of method names - Args : none - Status : Public - -For those features that contain subfeatures, this method will return a -unique list of method names of those subfeatures, suitable for use -with sub_SeqFeature(). - -=cut - -sub sub_types { - my $self = shift; - my $subfeat = $self->{subfeatures} or return; - return keys %$subfeat; -} - -=head2 attributes - - Title : attributes - Usage : @attributes = $feature->attributes($name) - Function: get the "attributes" on a particular feature - Returns : an array of string - Args : feature ID - Status : public - -Some GFF version 2 files use the groups column to store a series of -attribute/value pairs. In this interpretation of GFF, the first such -pair is treated as the primary group for the feature; subsequent pairs -are treated as attributes. Two attributes have special meaning: -"Note" is for backward compatibility and is used for unstructured text -remarks. "Alias" is considered as a synonym for the feature name. - - @gene_names = $feature->attributes('Gene'); - @aliases = $feature->attributes('Alias'); - -If no name is provided, then attributes() returns a flattened hash, of -attribute=Evalue pairs. This lets you do: - - %attributes = $db->attributes; - -=cut - -sub attributes { - my $self = shift; - my $factory = $self->factory; - defined(my $id = $self->id) or return; - $factory->attributes($id,@_) -} - - -=head2 notes - - Title : notes - Usage : @notes = $feature->notes - Function: get the "notes" on a particular feature - Returns : an array of string - Args : feature ID - Status : public - -Some GFF version 2 files use the groups column to store various notes -and remarks. Adaptors can elect to store the notes in the database, -or just ignore them. For those adaptors that store the notes, the -notes() method will return them as a list. 
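The attribute-based accessors sketched together, assuming an adaptor that stores the group attributes:

    my @genes   = $feature->attributes('Gene');  # values of one named attribute
    my %attrs   = $feature->attributes;          # flattened attribute => value pairs
    my @notes   = $feature->notes;               # shorthand for attributes('Note')
    my @aliases = $feature->aliases;             # shorthand for attributes('Alias')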
- -=cut - -sub notes { - my $self = shift; - $self->attributes('Note'); -} - -=head2 aliases - - Title : aliases - Usage : @aliases = $feature->aliases - Function: get the "aliases" on a particular feature - Returns : an array of string - Args : feature ID - Status : public - -This method will return a list of attributes of type 'Alias'. - -=cut - -sub aliases { - my $self = shift; - $self->attributes('Alias'); -} - - - -=head2 Autogenerated Methods - - Title : AUTOLOAD - Usage : @subfeat = $feature->Method - Function: Return subfeatures using autogenerated methods - Returns : a list of Bio::DB::GFF::Feature objects - Args : none - Status : Public - -Any method that begins with an initial capital letter will be passed -to AUTOLOAD and treated as a call to sub_SeqFeature with the method -name used as the method argument. For instance, this call: - - @exons = $feature->Exon; - -is equivalent to this call: - - @exons = $feature->sub_SeqFeature('exon'); - -=cut - -=head2 SeqFeatureI methods - -The following Bio::SeqFeatureI methods are implemented: - -primary_tag(), source_tag(), all_tags(), has_tag(), each_tag_value() [renamed get_tag_values()]. - -=cut - -*primary_tag = \&method; -*source_tag = \&source; -sub all_tags { - my $self = shift; - my %atts = $self->attributes; - my @tags = keys %atts; - - # autogenerated methods - #if (my $subfeat = $self->{subfeatures}) { - # push @tags,keys %$subfeat; - #} - - @tags; -} -*get_all_tags = \&all_tags; - -sub has_tag { - my $self = shift; - my $tag = shift; - my %att = $self->attributes; - my %tags = map {$_=>1} ( $self->all_tags ); - - return $tags{$tag}; -} - -*each_tag_value = \&get_tag_values; - -sub get_tag_values { - my $self = shift; - my $tag = shift; - return $self->$tag() if $CONSTANT_TAGS{$tag}; - - my $atts = $self->attributes; - return @{$atts->{$tag}} if $atts && $atts->{$tag}; - - $tag = ucfirst $tag; - return $self->$tag(); # try autogenerated tag -} - -sub AUTOLOAD { - my($pack,$func_name) = $AUTOLOAD=~/(.+)::([^:]+)$/; - my $sub = $AUTOLOAD; - my $self = $_[0]; - - # ignore DESTROY calls - return if $func_name eq 'DESTROY'; - - # fetch subfeatures if func_name has an initial cap -# return sort {$a->start <=> $b->start} $self->sub_SeqFeature($func_name) if $func_name =~ /^[A-Z]/; - return $self->sub_SeqFeature($func_name) if $func_name =~ /^[A-Z]/; - - # error message of last resort - $self->throw(qq(Can't locate object method "$func_name" via package "$pack")); -}#' - -=head2 adjust_bounds - - Title : adjust_bounds - Usage : $feature->adjust_bounds - Function: adjust the bounds of a feature - Returns : ($start,$stop,$strand) - Args : none - Status : Public - -This method adjusts the boundaries of the feature to enclose all its -subfeatures. It returns the new start, stop and strand of the -enclosing feature. - -=cut - -# adjust a feature so that its boundaries are synched with its subparts' boundaries. 
-# this works recursively, so subfeatures can contain other features -sub adjust_bounds { - my $self = shift; - my $shrink = shift; - my $g = $self->{group}; - - my $first = 0; - my $tfirst = 0; - if (my $subfeat = $self->{subfeatures}) { - for my $list (values %$subfeat) { - for my $feat (@$list) { - # fix up our bounds to hold largest subfeature - my($start,$stop,$strand) = $feat->adjust_bounds($shrink); - - if (defined($self->{fstrand})) { - $self->debug("Subfeature's strand ($strand) doesn't match parent strand ($self->{fstrand})\n") if $self->{fstrand} ne $strand; - } else { - $self->{fstrand} = $strand; - } - - my ($low,$high) = $start < $stop ? ($start,$stop) : ($stop,$start); - if ($shrink && !$first++) { - # first subfeature resets start & stop: - $self->{start} = $self->{fstrand} ne '-' ? $low : $high; - $self->{stop} = $self->{fstrand} ne '-' ? $high : $low; - } else { - if ($self->{fstrand} ne '-') { - $self->{start} = $low - if (!defined($self->{start})) || $low < $self->{start}; - $self->{stop} = $high - if (!defined($self->{stop})) || $high > $self->{stop}; - } else { - $self->{start} = $high - if (!defined($self->{start})) || $high > $self->{start}; - $self->{stop} = $low - if (!defined($self->{stop})) || $low < $self->{stop}; - } - } - - # fix up endpoints of targets too (for homologies only) - my $h = $feat->group; - next unless $h && $h->isa('Bio::DB::GFF::Homol'); - next unless $g && $g->isa('Bio::DB::GFF::Homol'); - - ($start,$stop) = ($h->{start},$h->{stop}); - if ($shrink && !$tfirst++) { - $g->{start} = $start; - $g->{stop} = $stop; - } else { - if ($start <= $stop) { - $g->{start} = $start if (!defined($g->{start})) || $start < $g->{start}; - $g->{stop} = $stop if (!defined($g->{stop})) || $stop > $g->{stop}; - } else { - $g->{start} = $start if (!defined($g->{start})) || $start > $g->{start}; - $g->{stop} = $stop if (!defined($g->{stop})) || $stop < $g->{stop}; - } - } - } - } - } - - ($self->{start},$self->{stop},$self->strand); -} - -=head2 sort_features - - Title : sort_features - Usage : $feature->sort_features - Function: sort features - Returns : nothing - Args : none - Status : Public - -This method sorts subfeatures in ascending order by their start -position. For reverse strand features, it sorts subfeatures in -descending order. After this is called sub_SeqFeature will return the -features in order. - -This method is called internally by merged_segments(). - -=cut - -# sort features -sub sort_features { - my $self = shift; - return if $self->{sorted}++; - my $strand = $self->strand or return; - my $subfeat = $self->{subfeatures} or return; - for my $type (keys %$subfeat) { - $subfeat->{$type} = [map { $_->[0] } - sort {$a->[1] <=> $b->[1] } - map { [$_,$_->start] } - @{$subfeat->{$type}}] if $strand > 0; - $subfeat->{$type} = [map { $_->[0] } - sort {$b->[1] <=> $a->[1]} - map { [$_,$_->start] } - @{$subfeat->{$type}}] if $strand < 0; - } -} - -=head2 asString - - Title : asString - Usage : $string = $feature->asString - Function: return human-readabled representation of feature - Returns : a string - Args : none - Status : Public - -This method returns a human-readable representation of the feature and -is called by the overloaded "" operator. 
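For example (the exact method, source and group name printed are illustrative):

    print "$feature\n";    # overloaded "" calls asString()
    # prints the type and group, e.g.  exon:curated(unc-19)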
- -=cut - -sub asString { - my $self = shift; - my $type = $self->type; - my $name = $self->group; - return "$type($name)" if $name; - return $type; -# my $type = $self->method; -# my $id = $self->group || 'unidentified'; -# return join '/',$id,$type,$self->SUPER::asString; -} - -sub name { - my $self =shift; - return $self->group || $self->SUPER::name; -} - -=head2 gff_string - - Title : gff_string - Usage : $string = $feature->gff_string - Function: return GFF2 of GFF2.5 representation of feature - Returns : a string - Args : none - Status : Public - -=cut - -sub gff_string { - my $self = shift; - my $version = $self->version; - - # gff3_string and gff_string are synonymous if the version is set to 3 - return $self->gff3_string(@_) if $version == 3; - - my ($start,$stop) = ($self->start,$self->stop); - - # the defined() tests prevent uninitialized variable warnings, when dealing with clone objects - # whose endpoints may be undefined - ($start,$stop) = ($stop,$start) if defined($start) && defined($stop) && $start > $stop; - - my ($class,$name) = ('',''); - my $strand = ('-','.','+')[$self->strand+1]; - - my @group; - - if (my $t = $self->target) { - push @group, $version == 2.5 ? $self->flatten_target($t,2.5) - : $self->flatten_target($t); - } - elsif (my $g = $self->group) { - $class = $g->class || ''; - $name = $g->name || ''; - ($name =~ /\S\s\S/)?(push @group, "$class '$name'"):(push @group,"$class $name"); - } - - # add exhaustive list of attributes - my $att = $self->attributes; - for ( keys %$att ) { - for my $v ( @{$att->{$_}} ) { - $v = qq("$v") if $v=~ /\S\s+\S/; - push @group, qq($_ $v); - } - } - - my $group_field = join ' ; ',@group; - my $ref = $self->refseq; - my $n = ref($ref) ? $ref->name : $ref; - my $phase = $self->phase; - $phase = '.' unless defined $phase; - return join("\t", - $n, - $self->source,$self->method, - (defined $start ? $start : '.'), - (defined $stop ? $stop : '.'), - (defined $self->score ? $self->score : '.'), - (defined $strand ? $strand : '.'), - $phase, - $group_field); -} - -=head2 gff3_string - - Title : gff3_string - Usage : $string = $feature->gff3_string([$recurse]) - Function: return GFF3 representation of feature - Returns : a string - Args : An optional flag, which if true, will cause the feature to recurse over - subfeatures. - Status : Public - -=cut - -sub gff3_string { - my $self = shift; - my ($recurse,$parent) = @_; - my ($start,$stop) = ($self->start,$self->stop); - - # the defined() tests prevent uninitialized variable warnings, when dealing with clone objects - # whose endpoints may be undefined - ($start,$stop) = ($stop,$start) if defined($start) && defined($stop) && $start > $stop; - - my $strand = ('-','.','+')[$self->strand+1]; - my $ref = $self->refseq; - my $n = ref($ref) ? $ref->name : $ref; - my $phase = $self->phase; - $phase = '.' 
unless defined $phase; - - my ($class,$name) = ('',''); - my @group; - if (my $g = $self->group) { - $class = $g->class || ''; - $name = $g->name || ''; - $name = "$class:$name" if defined $class; - push @group,[ID => $name] if !defined($parent) || $name ne $parent; - } - - push @group,[Parent => $parent] if defined $parent && $parent ne ''; - - if (my $t = $self->target) { - $strand = '-' if $t->stop < $t->start; - push @group, $self->flatten_target($t,3); - } - - my @attributes = $self->attributes; - while (@attributes) { - push @group,[shift(@attributes),shift(@attributes)] - } - my $group_field = join ';',map {join '=',_escape($_->[0]),_escape($_->[1])} @group; - my $string = join("\t",$n,$self->source,$self->method,$start||'.',$stop||'.', - $self->score||'.',$strand||'.',$phase,$group_field); - $string .= "\n"; - if ($recurse) { - foreach ($self->sub_SeqFeature) { - $string .= $_->gff3_string(1,$name); - } - } - $string; -} - -=head2 version - - Title : version - Usage : $feature->version() - Function: get/set the GFF version to be returned by gff_string - Returns : the GFF version (default is 2) - Args : the GFF version (2, 2.5 of 3) - Status : Public - -=cut - -sub version { - my ($self, $version) = @_; - $self->{version} = $version if $version; - return $self->{version} || 2; -} - - -sub _escape { - my $toencode = shift; - $toencode =~ s/([^a-zA-Z0-9_. :?^*\(\)\[\]@!-])/uc sprintf("%%%02x",ord($1))/eg; - $toencode =~ tr/ /+/; - $toencode; -} - -=head2 cmap_link() - - Title : cmap_link - Usage : $link = $feature->cmap_link - Function: returns a URL link to the corresponding feature in cmap - Returns : a string - Args : none - Status : Public - -If integrated cmap/gbrowse installation, it returns a link to the map otherwise -it returns a link to a feature search on the feature name. See the cmap -documentation for more information. - -This function is intended primarily to be used in gbrowse conf files. -For example: - - link = sub {my $self = shift; return $self->cmap_viewer_link(data_source);} - -=cut - - -sub cmap_viewer_link { - # Use ONLY if CMap is installed - my $self = shift; - my $data_source = shift; - my $group_id = $self->group_id; - my $factory = $self->factory; # aka adaptor - - my $link_str; - - if ($factory->can("create_cmap_viewer_link")){ - $link_str = $factory->create_cmap_viewer_link( - data_source => $data_source, - group_id => $group_id, - ); - } - my $name = $self->name(); - $link_str = '/cgi-bin/cmap/feature_search?features=' - . $name - . '&search_field=feature_name&order_by=&data_source=' - . $data_source - . '&submit=Submit' - unless $link_str; - - return $link_str; - -} - -=head1 A Note About Similarities - -The current default aggregator for GFF "similarity" features creates a -composite Bio::DB::GFF::Feature object of type "gapped_alignment". -The target() method for the feature as a whole will return a -RelSegment object that is as long as the extremes of the similarity -hit target, but will not necessarily be the same length as the query -sequence. The length of each "similarity" subfeature will be exactly -the same length as its target(). These subfeatures are essentially -the HSPs of the match. 
- -The following illustrates this: - - @similarities = $segment->feature('similarity:BLASTN'); - $sim = $similarities[0]; - - print $sim->type; # yields "gapped_similarity:BLASTN" - - $query_length = $sim->length; - $target_length = $sim->target->length; # $query_length != $target_length - - @matches = $sim->Similarity; # use autogenerated method - $query1_length = $matches[0]->length; - $target1_length = $matches[0]->target->length; # $query1_length == $target1_length - -If you merge segments by calling merged_segments(), then the length of -the query sequence segments will no longer necessarily equal the -length of the targets, because the alignment information will have -been lost. Nevertheless, the targets are adjusted so that the first -and last base pairs of the query match the first and last base pairs -of the target. - -=cut - -1; - -=head1 BUGS - -This module is still under development. - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Homol.pm b/lib/Bio/DB/GFF/Homol.pm deleted file mode 100644 index 70cd1017f..000000000 --- a/lib/Bio/DB/GFF/Homol.pm +++ /dev/null @@ -1,100 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Homol -- A segment of DNA that is homologous to another - -=head1 SYNOPSIS - -See L. - -=head1 DESCRIPTION - -Bio::DB::GFF::Homol is a named subtype of Bio::DB::GFF::Segment. It -inherits all the methods of its parent, and was created primarily to -allow for isa() queries and for compatibility with -Ace::Sequence::Homol. - -A Homol object is typically returned as the method result of the -Bio::DB::GFF::Feature-Etarget() method. - -=head1 METHODS - -=cut - -package Bio::DB::GFF::Homol; -use strict; - -use base qw(Bio::DB::GFF::Segment); - -=head2 name - - Title : name - Usage : $name = $homol->name - Function: get the ID of the homology object - Returns : a string - Args : none - Status : Public - -=cut - -sub name { shift->refseq } - -=head2 asString - - Title : asString - Usage : $name = $homol->asString - Function: same as name(), for operator overloading - Returns : a string - Args : none - Status : Public - -=cut - -sub asString { shift->name } - - -=head2 id - - Title : id - Usage : $id = $homol->id - Function: get database ID in class:id format - Returns : a string - Args : none - Status : Public - -=cut - -sub id { - my $self = shift; - return "$self->{class}:$self->{name}"; -} - -sub new_from_segment { - my $package = shift; - $package = ref $package if ref $package; - my $segment = shift; - my $new = {}; - @{$new}{qw(factory sourceseq start stop strand class ref refstart refstrand)} - = @{$segment}{qw(factory sourceseq start stop strand class ref refstart refstrand)}; - return bless $new,__PACKAGE__; -} - -=head1 BUGS - -This module is still under development. - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -1; diff --git a/lib/Bio/DB/GFF/RelSegment.pm b/lib/Bio/DB/GFF/RelSegment.pm deleted file mode 100644 index c43f89876..000000000 --- a/lib/Bio/DB/GFF/RelSegment.pm +++ /dev/null @@ -1,1168 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::RelSegment -- Sequence segment with relative coordinate support - -=head1 SYNOPSIS - -See L. 
- -=head1 DESCRIPTION - -Bio::DB::GFF::RelSegment is a stretch of sequence that can handle -relative coordinate addressing. It inherits from -Bio::DB::GFF::Segment, and is the base class for -Bio::DB::GFF::Feature. - -In addition to the source sequence, a relative segment has a -"reference sequence", which is used as the basis for its coordinate -system. The reference sequence can be changed at will, allowing you -freedom to change the "frame of reference" for features contained -within the segment. For example, by setting a segment's reference -sequence to the beginning of a gene, you can view all other features -in gene-relative coordinates. - -The reference sequence and the source sequence must be on the same -physical stretch of DNA, naturally. However, they do not have to be -on the same strand. The strandedness of the reference sequence -determines whether coordinates increase to the right or the left. - -Generally, you will not create or manipulate Bio::DB::GFF::RelSeg0ment -objects directly, but use those that are returned by the Bio::DB::GFF -module. - -=head2 An Example - -To understand how relative coordinates work, consider the following -example from the C. elegans database. First we create the appropriate -GFF accessor object (the factory): - - my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:elegans', - -adaptor=>'dbi:mysqlopt'); - -Now we fetch out a segment based on cosmid clone ZK909: - - my $seg = $db->segment('ZK909'); - -If we call the segment's refseq() method, we see that the base of the -coordinate system is the sequence "ZK154", and that its start and -stop positions are 1 and the length of the cosmid: - - print $seg->refseq; - => ZK909 - - print $seg->start,' - ',$seg->stop; - => 1 - 33782 - -As a convenience, the "" operator is overloaded in this class, to give -the reference sequence, and start and stop positions: - - print $seg; - => ZK909:1,33782 - -Internally, Bio::DB::GFF::RelSegment has looked up the absolute -coordinates of this segment and maintains the source sequence and the -absolute coordinates relative to the source sequence. We can see this -information using sourceseq() (inherited from Bio::DB::GFF::Segment) -and the abs_start() and abs_end() methods: - - print $seg->sourceseq; - => CHROMOSOME_I - - print $seg->abs_start,' - ',$seg->abs_end; - => 14839545 - 14873326 - -We can also put the segment into absolute mode, so that it behaves -like Bio::DB::Segment, and always represents coordinates on the source -sequence. This is done by passing a true value to the absolute() -method: - - $seq->absolute(1); - print $seg; - => CHROMOSOME_I:14839545,14873326 - -We can change the reference sequence at any time. One way is to call -the segment's ref() method, giving it the ID (and optionally the -class) of another landmark on the genome. For example, if we know -that cosmid ZK337 is adjacent to ZK909, then we can view ZK909 in -ZK337-relative coordinates: - - $seg->refseq('ZK337'); - print $seg; - => ZK337:-33670,111 - -We can call the segment's features() method in order to get the list -of contigs that overlap this segment (in the C. 
elegans database, -contigs have feature type "Sequence:Link"): - - @links = $seg->features('Sequence:Link'); - -We can now set the reference sequence to the first of these contigs like so: - - $seg->refseq($links[0]); - print $seg; - => Sequence:Link(LINK_Y95D11A):3997326,4031107 - -=cut - -package Bio::DB::GFF::RelSegment; - -use strict; - -use Bio::DB::GFF::Feature; -use Bio::DB::GFF::Util::Rearrange; -use Bio::RangeI; - -use base qw(Bio::DB::GFF::Segment); - -use overload '""' => 'asString', - 'bool' => sub { overload::StrVal(shift) }, - fallback=>1; - -=head1 API - -The remainder of this document describes the API for -Bio::DB::GFF::Segment. - -=cut - -=head2 new - - Title : new - Usage : $s = Bio::DB::GFF::RelSegment->new(@args) - Function: create a new relative segment - Returns : a new Bio::DB::GFF::RelSegment object - Args : see below - Status : Public - -This method creates a new Bio::DB::GFF::RelSegment object. Generally -this is called automatically by the Bio::DB::GFF module and -derivatives. - -This function uses a named-argument style: - - -factory a Bio::DB::GFF::Adaptor to use for database access - -seq ID of the source sequence - -class class of the source sequence - -start start of the desired segment relative to source sequence - -stop stop of the desired segment relative to source sequence - -ref ID of the reference sequence - -refclass class of the reference sequence - -offset 0-based offset from source sequence to start of segment - -length length of desired segment - -absolute, -force_absolute - use absolute coordinates, rather than coordinates relative - to the start of self or the reference sequence - -The -seq argument accepts the ID of any landmark in the database. The -stored source sequence becomes whatever the GFF file indicates is the -proper sequence for this landmark. A class of "Sequence" is assumed -unless otherwise specified in the -class argument. - -If the argument to -seq is a Bio::GFF::Featname object (such as -returned by the group() method), then the class is taken from that. - -The optional -start and -stop arguments specify the end points for the -retrieved segment. For those who do not like 1-based indexing, --offset and -length are provided. If both -start/-stop and --offset/-length are provided, the latter overrides the former. -Generally it is not a good idea to mix metaphors. - --ref and -refclass together indicate a sequence to be used for -relative coordinates. If not provided, the source sequence indicated -by -seq is used as the reference sequence. If the argument to -ref is -a Bio::GFF::Featname object (such as returned by the group() method), -then the class is taken from that. - --force_absolute should be used if you wish to skip the lookup of the -absolute position of the source sequence that ordinarily occurs when -you create a relative segment. In this case, the source sequence must -be a sequence that has been specified as the "source" in the GFF file. 
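A sketch of the named-argument form; in normal use Bio::DB::GFF constructs these objects for you, and the landmark and reference names below are illustrative:

    my $seg = Bio::DB::GFF::RelSegment->new( -factory => $db,
                                             -seq     => 'ZK909',
                                             -start   => 1,
                                             -stop    => 5000,
                                             -ref     => 'ZK337' );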
- -=cut - -# Create a new Bio::DB::GFF::RelSegment Object -# arguments are: -# -factory => factory and DBI interface -# -seq => $sequence_name -# -start => $start_relative_to_sequence -# -stop => $stop_relative_to_sequence -# -ref => $sequence which establishes coordinate system -# -offset => 0-based offset relative to sequence -# -length => length of segment -# -nocheck => turn off checking, force segment to be constructed -# -absolute => use absolute coordinate addressing - -sub new { - my $package = shift; - my ($factory,$name,$start,$stop,$refseq,$class,$refclass,$offset,$length,$force_absolute,$nocheck) = - rearrange([ - 'FACTORY', - [qw(NAME SEQ SEQUENCE SOURCESEQ)], - [qw(START BEGIN)], - [qw(STOP END)], - [qw(REFSEQ REF REFNAME)], - [qw(CLASS SEQCLASS)], - qw(REFCLASS), - [qw(OFFSET OFF)], - [qw(LENGTH LEN)], - [qw(ABSOLUTE)], - [qw(NOCHECK FORCE)], - ],@_); - - $package = ref $package if ref $package; - $factory or $package->throw("new(): provide a -factory argument"); - - # to allow people to use segments as sources - if (ref($name) && $name->isa('Bio::DB::GFF::Segment')) { - $start = 1 unless defined $start; - $stop = $name->length unless defined $stop; - return $name->subseq($start,$stop); - } - - my @object_results; - - # support for Featname objects - if (ref($name) && $name->can('class')) { - $class = $name->class; - $name = $name->name; - } - - # if the class of the landmark is not specified then default to 'Sequence' - $class ||= eval{$factory->default_class} || 'Sequence'; - - # confirm that indicated sequence is actually in the database! - my @abscoords; - - # abscoords() will now return an array ref, each element of which is - # ($absref,$absclass,$absstart,$absstop,$absstrand) - - if ($nocheck) { - $force_absolute++; - $start = 1; - } - -# if ($force_absolute && defined($start)) { # absolute position is given to us -# @abscoords = ([$name,$class,$start,$stop,'+']); -# } else { - my $result = $factory->abscoords($name,$class,$force_absolute ? $name : ()) or return; - @abscoords = @$result; -# } - - foreach (@abscoords) { - my ($absref,$absclass,$absstart,$absstop,$absstrand,$sname) = @$_; - $sname = $name unless defined $sname; - my ($this_start,$this_stop,$this_length) = ($start,$stop,$length); - - # partially fill in object - my $self = bless { factory => $factory },$package; - - $absstrand ||= '+'; - - if ($absstart > $absstop) { # AAARGH! DATA FORMAT ERROR! FIX. - ($absstart,$absstop) = ($absstop,$absstart); - $absstrand = $absstrand eq '+' ? 
'-' : '+'; - } - - # an explicit length overrides start and stop - if (defined $offset) { - warn "new(): bad idea to call new() with both a start and an offset" - if defined $this_start; - $this_start = $offset+1; - } - if (defined $this_length) { - warn "new(): bad idea to call new() with both a stop and a length" - if defined $this_stop; - $this_stop = $this_start + $length - 1; - } - - # this allows a SQL optimization way down deep - $self->{whole}++ if $absref eq $sname and !defined($this_start) and !defined($this_stop); - - $this_start = 1 if !defined $this_start; - $this_stop = $absstop-$absstart+1 if !defined $this_stop; - $this_length = $this_stop - $this_start + 1; - - # now offset to correct subsegment based on desired start and stop - if ($force_absolute) { -# ($this_start,$this_stop) = ($absstart,$absstop); - $self->absolute(1); - } elsif ($absstrand eq '+') { - $this_start = $absstart + $this_start - 1; - $this_stop = $this_start + $this_length - 1; - } else { - $this_start = $absstop - ($this_start - 1); - $this_stop = $absstop - ($this_stop - 1); - } - - # handle truncation in either direction - # This only happens if the segment runs off the end of - # the reference sequence - if ($factory->strict_bounds_checking && - (($this_start < $absstart) || ($this_stop > $absstop))) { - # return empty if we are completely off the end of the ref se - next unless $this_start<=$absstop && $this_stop>=$absstart; - if (my $a = $factory->abscoords($absref,'Sequence')) { - my $refstart = $a->[0][2]; - my $refstop = $a->[0][3]; - if ($this_start < $refstart) { - $this_start = $refstart; - $self->{truncated}{start}++; - } - if ($this_stop > $refstop) { - $this_stop = $absstop; - $self->{truncated}{stop}++; - } - } - } - - @{$self}{qw(sourceseq start stop strand class)} - = ($absref,$this_start,$this_stop,$absstrand,$absclass); - - # handle reference sequence - if (defined $refseq) { - $refclass = $refseq->class if $refseq->can('class'); - $refclass ||= 'Sequence'; - my ($refref,$refstart,$refstop,$refstrand) = $factory->abscoords($refseq,$refclass); - unless ($refref eq $absref) { - $self->error("reference sequence is on $refref but source sequence is on $absref"); - return; - } - $refstart = $refstop if $refstrand eq '-'; - @{$self}{qw(ref refstart refstrand)} = ($refseq,$refstart,$refstrand); - } else { - $absstart = $absstop if $absstrand eq '-'; - @{$self}{qw(ref refstart refstrand)} = ($sname,$absstart,$absstrand); - } - push @object_results,$self; - } - - return wantarray ? @object_results : $object_results[0]; -} - -# overridden methods -# start, stop, length -sub start { - my $self = shift; - return $self->strand < 0 ? $self->{stop} : $self->{start} if $self->absolute; - $self->_abs2rel($self->{start}); -} -sub end { - my $self = shift; - return $self->strand < 0 ? $self->{start} : $self->{stop} if $self->absolute; - $self->_abs2rel($self->{stop}); -} -*stop = \&end; - -sub length { - my $self = shift; - return unless defined $self->abs_end; - abs($self->abs_end - $self->abs_start) + 1; -} - -sub abs_start { - my $self = shift; - if ($self->absolute) { - my ($a,$b) = ($self->SUPER::abs_start,$self->SUPER::abs_end); - return ($a<$b) ? $a : $b; - } - else { - return $self->SUPER::abs_start(@_); - } -} -sub abs_end { - my $self = shift; - if ($self->absolute) { - my ($a,$b) = ($self->SUPER::abs_start,$self->SUPER::abs_end); - return ($a>$b) ? 
$a : $b; - } - - else { - return $self->SUPER::abs_end(@_); - } -} - -*abs_stop = \&abs_end; - -=head2 refseq - - Title : refseq - Usage : $ref = $s->refseq([$newseq] [,$newseqclass]) - Function: get/set reference sequence - Returns : current reference sequence - Args : new reference sequence and class (optional) - Status : Public - -This method will get or set the reference sequence. Called with no -arguments, it returns the current reference sequence. Called with -either a sequence ID and class, a Bio::DB::GFF::Segment object (or -subclass) or a Bio::DB::GFF::Featname object, it will set the current -reference sequence and return the previous one. - -The method will generate an exception if you attempt to set the -reference sequence to a sequence that isn't contained in the database, -or one that has a different source sequence from the segment. - -=cut - -#' -sub refseq { - my $self = shift; - my $g = $self->{ref}; - if (@_) { - my ($newref,$newclass); - if (@_ == 2) { - $newclass = shift; - $newref = shift; - } else { - $newref = shift; - $newclass = 'Sequence'; - } - - defined $newref or $self->throw('refseq() called with an undef reference sequence'); - - # support for Featname objects - $newclass = $newref->class if ref($newref) && $newref->can('class'); - - # $self->throw("Cannot define a segment's reference sequence in terms of itself!") - # if ref($newref) and overload::StrVal($newref) eq overload::StrVal($self); - - my ($refsource,undef,$refstart,$refstop,$refstrand); - if ($newref->isa('Bio::DB::GFF::RelSegment')) { - ($refsource,undef,$refstart,$refstop,$refstrand) = - ($newref->sourceseq,undef,$newref->abs_start,$newref->abs_end,$newref->abs_strand >= 0 ? '+' : '-'); - } else { - my $coords = $self->factory->abscoords($newref,$newclass); - foreach (@$coords) { # find the appropriate one - ($refsource,undef,$refstart,$refstop,$refstrand) = @$_; - last if $refsource eq $self->{sourceseq}; - } - - } - $self->throw("can't set reference sequence: $newref and $self are on different sequence segments") - unless $refsource eq $self->{sourceseq}; - - @{$self}{qw(ref refstart refstrand)} = ($newref,$refstart,$refstrand); - $self->absolute(0); - } - return $self->absolute ? $self->sourceseq : $g; -} - - -=head2 abs_low - - Title : abs_low - Usage : $s->abs_low - Function: the absolute lowest coordinate of the segment - Returns : an integer - Args : none - Status : Public - -This is for GadFly compatibility, and returns the low coordinate in -absolute coordinates; - -=cut - -sub abs_low { - my $self = shift; - my ($a,$b) = ($self->abs_start,$self->abs_end); - return ($a<$b) ? $a : $b; -} - -=head2 abs_high - - Title : abs_high - Usage : $s->abs_high - Function: the absolute highest coordinate of the segment - Returns : an integer - Args : none - Status : Public - -This is for GadFly compatibility, and returns the high coordinate in -absolute coordinates; - -=cut - -sub abs_high { - my $self = shift; - my ($a,$b) = ($self->abs_start,$self->abs_end); - return ($a>$b) ? $a : $b; -} - - -=head2 asString - - Title : asString - Usage : $s->asString - Function: human-readable representation of the segment - Returns : a string - Args : none - Status : Public - -This method will return a human-readable representation of the -segment. It is the overloaded method call for the "" operator. 
- -Currently the format is: - - refseq:start,stop - -=cut - -sub asString { - my $self = shift; - return $self->SUPER::asString if $self->absolute; - my $label = $self->{ref}; - my $start = $self->start || ''; - my $stop = $self->stop || ''; - if (ref($label) && overload::StrVal($self) eq overload::StrVal($label->ref)) { - $label = $self->abs_ref; - $start = $self->abs_start; - $stop = $self->abs_end; - } - return "$label:$start,$stop"; -} - -=head2 name - - Title : name - Usage : Alias for asString() - -=cut - -sub name { shift->asString } - -=head2 absolute - - Title : absolute - Usage : $abs = $s->absolute([$abs]) - Function: get/set absolute coordinates - Returns : a boolean flag - Args : new setting for flag (optional) - Status : Public - -Called with a boolean flag, this method controls whether to display -relative coordinates (relative to the reference sequence) or absolute -coordinates (relative to the source sequence). It will return the -previous value of the setting. - -=cut - -sub absolute { - my $self = shift; - my $g = $self->{absolute}; - $self->{absolute} = shift if @_; - $g; -} - -=head2 features - - Title : features - Usage : @features = $s->features(@args) - Function: get features that overlap this segment - Returns : a list of Bio::DB::GFF::Feature objects - Args : see below - Status : Public - -This method will find all features that overlap the segment and return -a list of Bio::DB::GFF::Feature objects. The features will use -coordinates relative to the reference sequence in effect at the time -that features() was called. - -The returned list can be limited to certain types of feature by -filtering on their method and/or source. In addition, it is possible -to obtain an iterator that will step through a large number of -features sequentially. - -Arguments can be provided positionally or using the named arguments -format. In the former case, the arguments are a list of feature types -in the format "method:source". Either method or source can be -omitted, in which case the missing component is treated as a wildcard. -If no colon is present, then the type is treated as a method name. -Multiple arguments are ORed together. - -Examples: - - @f = $s->features('exon:curated'); # all curated exons - @f = $s->features('exon:curated','intron'); # curated exons and all introns - @f = $s->features('similarity:.*EST.*'); # all similarities - # having something to do - # with ESTs - -The named parameter form gives you control over a few options: - - -types an array reference to type names in the format - "method:source" - - -merge Whether to apply aggregators to the generated features (default yes) - - -rare Turn on an optimization suitable for a relatively rare feature type, - where it will be faster to filter by feature type first - and then by position, rather than vice versa. - - -attributes a hashref containing a set of attributes to match - - -range_type One of 'overlapping', 'contains', or 'contained_in' - - -iterator Whether to return an iterator across the features. - - -binsize A true value will create a set of artificial features whose - start and stop positions indicate bins of the given size, and - whose scores are the number of features in the bin. The - class and method of the feature will be set to "bin", - its source to "method:source", and its group to "bin:method:source". - This is a handy way of generating histograms of feature density. 
- --merge is a boolean flag that controls whether the adaptor's -aggregators wll be applied to the features returned by this method. - -If -iterator is true, then the method returns a single scalar value -consisting of a Bio::SeqIO object. You can call next_seq() repeatedly -on this object to fetch each of the features in turn. If iterator is -false or absent, then all the features are returned as a list. - -The -attributes argument is a hashref containing one or more -attributes to match against: - - -attributes => { Gene => 'abc-1', - Note => 'confirmed' } - -Attribute matching is simple string matching, and multiple attributes -are ANDed together. - -=cut - -#' - -# return all features that overlap with this segment; -# optionally modified by a list of types to filter on -sub features { - my $self = shift; - my @args = $self->_process_feature_args(@_); - return $self->factory->overlapping_features(@args); -} - -=head2 get_SeqFeatures - - Title : get_SeqFeatures - Usage : - Function: returns the top level sequence features - Returns : L objects - Args : none - -Segments do not ordinarily return any subfeatures. - -=cut - -# A SEGMENT DOES NOT HAVE SUBFEATURES! -sub get_SeqFeatures { return } - -=head2 feature_count - - Title : feature_count - Usage : $seq->feature_count() - Function: Return the number of SeqFeatures attached to a sequence - Returns : integer representing the number of SeqFeatures - Args : none - -This method comes through extension of Bio::FeatureHolderI. See -L for more information. - -=cut - -sub feature_count { - my $self = shift; - my $ct = 0; - my %type_counts = $self->types(-enumerate=>1); - map { $ct += $_ } values %type_counts; - $ct; -} - -=head2 get_feature_stream - - Title : features - Usage : $stream = $s->get_feature_stream(@args) - Function: get a stream of features that overlap this segment - Returns : a Bio::SeqIO::Stream-compliant stream - Args : see below - Status : Public - -This is the same as features(), but returns a stream. Use like this: - - $stream = $s->get_feature_stream('exon'); - while (my $exon = $stream->next_seq) { - print $exon->start,"\n"; - } - -=cut - -sub get_feature_stream { - my $self = shift; - my @args = defined($_[0]) && $_[0] =~ /^-/ ? (@_,-iterator=>1) : (-types=>\@_,-iterator=>1); - $self->features(@args); -} - -=head2 get_seq_stream - - Title : get_seq_stream - Usage : $stream = $s->get_seq_stream(@args) - Function: get a stream of features that overlap this segment - Returns : a Bio::SeqIO::Stream-compliant stream - Args : see below - Status : Public - -This is the same as feature_stream(), and is provided for Bioperl -compatibility. Use like this: - - $stream = $s->get_seq_stream('exon'); - while (my $exon = $stream->next_seq) { - print $exon->start,"\n"; - } - -=cut - -*get_seq_stream = \&get_feature_stream; - - -=head2 overlapping_features - - Title : overlapping_features - Usage : @features = $s->overlapping_features(@args) - Function: get features that overlap this segment - Returns : a list of Bio::DB::GFF::Feature objects - Args : see features() - Status : Public - -This is an alias for the features() method, and takes the same -arguments. 
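Putting the named parameters above together (the feature types and attribute values are illustrative):

    # fetch curated exons restricted by an attribute, as a plain list
    my @exons = $segment->features( -types      => ['exon:curated'],
                                    -attributes => { Note => 'confirmed' } );

    # or stream a potentially large result set
    my $stream = $segment->features( -types => ['similarity:BLASTN'], -iterator => 1 );
    while ( my $hit = $stream->next_seq ) {
        print $hit->start, "\n";
    }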
- -=cut - -*overlapping_features = \&features; - -=head2 contained_features - - Title : contained_features - Usage : @features = $s->contained_features(@args) - Function: get features that are contained by this segment - Returns : a list of Bio::DB::GFF::Feature objects - Args : see features() - Status : Public - -This is identical in behavior to features() except that it returns -only those features that are completely contained within the segment, -rather than any that overlap. - -=cut - -# return all features completely contained within this segment -sub contained_features { - my $self = shift; - local $self->{whole} = 0; - my @args = $self->_process_feature_args(@_); - return $self->factory->contained_features(@args); -} - -# *contains = \&contained_features; - -=head2 contained_in - - Title : contained_in - Usage : @features = $s->contained_in(@args) - Function: get features that contain this segment - Returns : a list of Bio::DB::GFF::Feature objects - Args : see features() - Status : Public - -This is identical in behavior to features() except that it returns -only those features that completely contain the segment. - -=cut - -# return all features completely contained within this segment -sub contained_in { - my $self = shift; - local $self->{whole} = 0; - my @args = $self->_process_feature_args(@_); - return $self->factory->contained_in(@args); -} - -=head2 delete - - Title : delete - Usage : $db->delete(@args) - Function: delete features - Returns : count of features deleted -- if available - Args : numerous, see below - Status : public - -This method deletes all features that overlap the specified region or -are of a particular type. If no arguments are provided and the -force -argument is true, then deletes ALL features. - -Arguments: - - -type,-types Either a single scalar type to be deleted, or an - reference to an array of types. - - -range_type Control the range type of the deletion. One of "overlaps" (default) - "contains" or "contained_in" - -Examples: - - $segment->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats - $segment->delete(-type=>['intron','repeat:repeatMasker'] - -range_type => 'contains'); # remove all introns & repeats - # strictly contained in segment - -IMPORTANT NOTE: This method only deletes features. It does *NOT* -delete the names of groups that contain the deleted features. Group -IDs will be reused if you later load a feature with the same group -name as one that was previously deleted. - -NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the -result code from the SQL DELETE operation. Some dbd drivers return the -count of rows deleted, while others return 0E0. Caveat emptor. - -=cut - -# return all features completely contained within this segment -sub delete { - my $self = shift; - my ($type,$range_type) = - rearrange([[qw(TYPE TYPES)],'RANGE_TYPE'],@_); - my $types = $self->factory->parse_types($type); # parse out list of types - $range_type ||= 'overlaps'; - return $self->factory->_delete({ - segments => [$self], - types => $types, - range_type => $range_type - }); -} - -=head2 _process_feature_args - - Title : _process_feature_args - Usage : @args = $s->_process_feature_args(@args) - Function: preprocess arguments passed to features, - contained_features, and overlapping_features - Returns : a list of parsed arguents - Args : see feature() - Status : Internal - -This is an internal method that is used to check and format the -arguments to features() before passing them on to the adaptor. 
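The sketch below shows the kind of argument normalization this method performs; the segment and the returned values are illustrative only, modelled on the test data, and assume a sub-range segment such as $db->segment('Contig1', 1 => 1000):

    my @args = $segment->_process_feature_args('exon:curated');
    # roughly: (-ref    => 'Contig1', -class => 'Sequence',
    #           -start  => 1,         -stop  => 1000,
    #           -types  => ['exon:curated'],
    #           -parent => $segment)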
- -=cut - -sub _process_feature_args { - my $self = shift; - - my ($ref,$class,$start,$stop,$strand,$whole) - = @{$self}{qw(sourceseq class start stop strand whole)}; - - ($start,$stop) = ($stop,$start) if defined $strand && $strand eq '-'; - - my @args = (-ref=>$ref,-class=>$class); - - # indicating that we are fetching the whole segment allows certain - # SQL optimizations. - push @args,(-start=>$start,-stop=>$stop) unless $whole; - - if (@_) { - if ($_[0] =~ /^-/) { - push @args,@_; - } else { - my @types = @_; - push @args,-types=>\@types; - } - } - push @args,-parent=>$self; - @args; -} - -=head2 types - - Title : types - Usage : @types = $s->types([-enumerate=>1]) - Function: list feature types that overlap this segment - Returns : a list of Bio::DB::GFF::Typename objects or a hash - Args : see below - Status : Public - -The types() method will return a list of Bio::DB::GFF::Typename -objects, each corresponding to a feature that overlaps the segment. -If the optional -enumerate parameter is set to a true value, then the -method will return a hash in which the keys are the type names and the -values are the number of times a feature of that type is present on -the segment. For example: - - %count = $s->types(-enumerate=>1); - -=cut - -# wrapper for lower-level types() call. -sub types { - my $self = shift; - my ($ref,$class,$start,$stop,$strand) = @{$self}{qw(sourceseq class start stop strand)}; - ($start,$stop) = ($stop,$start) if $strand eq '-'; - - my @args; - if (@_ && $_[0] !~ /^-/) { - @args = (-type => \@_) - } else { - @args = @_; - } - $self->factory->types(-ref => $ref, - -start=> $start, - -stop => $stop, - @args); -} - -=head1 Internal Methods - -The following are internal methods and should not be called directly. - -=head2 new_from_segment - - Title : new_from_segment - Usage : $s = $segment->new_from_segment(@args) - Function: create a new relative segment - Returns : a new Bio::DB::GFF::RelSegment object - Args : see below - Status : Internal - -This constructor is used internally by the subseq() method. It forces -the new segment into the Bio::DB::GFF::RelSegment package, regardless -of the package that it is called from. This causes subclass-specfic -information, such as feature types, to be dropped when a subsequence -is created. - -=cut - -sub new_from_segment { - my $package = shift; - $package = ref $package if ref $package; - my $segment = shift; - my $new = {}; - @{$new}{qw(factory sourceseq start stop strand class ref refstart refstrand)} - = @{$segment}{qw(factory sourceseq start stop strand class ref refstart refstrand)}; - return bless $new,__PACKAGE__; -} - -=head2 _abs2rel - - Title : _abs2rel - Usage : @coords = $s->_abs2rel(@coords) - Function: convert absolute coordinates into relative coordinates - Returns : a list of relative coordinates - Args : a list of absolute coordinates - Status : Internal - -This is used internally to map from absolute to relative -coordinates. It does not take the offset of the reference sequence -into account, so please use abs2rel() instead. - -=cut - -sub _abs2rel { - my $self = shift; - my @result; - return unless defined $_[0]; - - if ($self->absolute) { - @result = @_; - } else { - my ($refstart,$refstrand) = @{$self}{qw(refstart refstrand)}; - @result = defined($refstrand) && $refstrand eq '-' ? map { $refstart - $_ + 1 } @_ - : map { $_ - $refstart + 1 } @_; - } - # if called with a single argument, caller will expect a single scalar reply - # not the size of the returned array! 
- return $result[0] if @result == 1 and !wantarray; - @result; -} - -=head2 rel2abs - - Title : rel2abs - Usage : @coords = $s->rel2abs(@coords) - Function: convert relative coordinates into absolute coordinates - Returns : a list of absolute coordinates - Args : a list of relative coordinates - Status : Public - -This function takes a list of positions in relative coordinates to the -segment, and converts them into absolute coordinates. - -=cut - -sub rel2abs { - my $self = shift; - my @result; - - if ($self->absolute) { - @result = @_; - } else { - my ($abs_start,$abs_strand) = ($self->abs_start,$self->abs_strand); - @result = $abs_strand < 0 ? map { $abs_start - $_ + 1 } @_ - : map { $_ + $abs_start - 1 } @_; - } - # if called with a single argument, caller will expect a single scalar reply - # not the size of the returned array! - return $result[0] if @result == 1 and !wantarray; - @result; -} - -=head2 abs2rel - - Title : abs2rel - Usage : @rel_coords = $s->abs2rel(@abs_coords) - Function: convert absolute coordinates into relative coordinates - Returns : a list of relative coordinates - Args : a list of absolute coordinates - Status : Public - -This function takes a list of positions in absolute coordinates -and returns a list expressed in relative coordinates. - -=cut - -sub abs2rel { - my $self = shift; - my @result; - - if ($self->absolute) { - @result = @_; - } else { - my ($abs_start,$abs_strand) = ($self->abs_start,$self->abs_strand); - @result = $abs_strand < 0 ? map { $abs_start - $_ + 1 } @_ - : map { $_ - $abs_start + 1 } @_; - } - # if called with a single argument, caller will expect a single scalar reply - # not the size of the returned array! - return $result[0] if @result == 1 and !wantarray; - @result; -} - -sub subseq { - my $self = shift; - my $obj = $self->SUPER::subseq(@_); - bless $obj,__PACKAGE__; # always bless into the generic RelSegment package -} - -sub strand { - my $self = shift; - if ($self->absolute) { - return _to_strand($self->{strand}); - } - my $start = $self->start; - my $stop = $self->stop; - return 0 unless defined $start and defined $stop; - return $stop <=> $start; -} - -sub _to_strand { - my $s = shift; - return -1 if $s eq '-'; - return +1 if $s eq '+'; - return 0; -} - -=head2 Bio::RangeI Methods - -The following Bio::RangeI methods are supported: - -overlaps(), contains(), equals(),intersection(),union(),overlap_extent() - -=cut - -sub intersection { - my $self = shift; - my (@ranges) = @_; - unshift @ranges,$self if ref $self; - $ranges[0]->isa('Bio::DB::GFF::RelSegment') - or return $self->SUPER::intersection(@_); - - my $ref = $ranges[0]->abs_ref; - my ($low,$high); - foreach (@ranges) { - return unless $_->can('abs_ref'); - $ref eq $_->abs_ref or return; - $low = $_->abs_low if !defined($low) or $low < $_->abs_low; - $high = $_->abs_high if !defined($high) or $high > $_->abs_high; - } - - return unless $low < $high; - return Bio::DB::GFF::RelSegment->new(-factory => $self->factory, - -seq => $ref, - -start => $low, - -stop => $high, - ); -} - -sub overlaps { - my $self = shift; - my($other,$so) = @_; - return $self->SUPER::overlaps(@_) unless $other->isa('Bio::DB::GFF::RelSegment'); - return if $self->abs_ref ne $other->abs_ref; - return if $self->abs_low > $other->abs_high; - return if $self->abs_high < $other->abs_low; - 1; -} - -sub contains { - my $self = shift; - my($other,$so) = @_; - return $self->SUPER::overlaps(@_) unless $other->isa('Bio::DB::GFF::RelSegment'); - return if $self->abs_ref ne $other->abs_ref; - return unless 
$self->abs_low <= $other->abs_low; - return unless $self->abs_high >= $other->abs_high; - 1; -} - -sub union { - my $self = shift; - my (@ranges) = @_; - unshift @ranges,$self if ref $self; - $ranges[0]->isa('Bio::DB::GFF::RelSegment') - or return $self->SUPER::union(@_); - - my $ref = $ranges[0]->abs_ref; - my ($low,$high); - foreach (@ranges) { - return unless $_->can('abs_ref'); - $ref eq $_->abs_ref or return; - $low = $_->abs_low if !defined($low) or $low > $_->abs_low; - $high = $_->abs_high if !defined($high) or $high < $_->abs_high; - } - $self->new(-factory=> $self->factory, - -seq => $ref, - -start => $low, - -stop => $high); -} - -sub version { 0 } - - -1; - -__END__ - -=head1 BUGS - -Schemas need some work. - -=head1 SEE ALSO - -L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - diff --git a/lib/Bio/DB/GFF/Segment.pm b/lib/Bio/DB/GFF/Segment.pm deleted file mode 100644 index 9521097bf..000000000 --- a/lib/Bio/DB/GFF/Segment.pm +++ /dev/null @@ -1,884 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Segment -- Simple DNA segment object - -=head1 SYNOPSIS - -See L. - -=head1 DESCRIPTION - -Bio::DB::GFF::Segment provides the basic representation of a range of -DNA contained in a GFF database. It is the base class from which the -Bio::DB::GFF::RelSegment and Bio::DB::GFF::Feature classes are -derived. - -Generally, you will not create or manipulate Bio::DB::GFF::Segment -objects directly, but use those that are returned by the Bio::DB::GFF -module. - -=cut - -package Bio::DB::GFF::Segment; - -use strict; -use Bio::Annotation::Collection; - -use base qw(Bio::Root::Root Bio::RangeI Bio::SeqI Bio::Das::SegmentI); - -use overload - '""' => 'asString', - eq => 'equals', - fallback => 1; - -=head1 API - -The remainder of this document describes the API for -Bio::DB::GFF::Segment. - -=cut - -=head2 new - - Title : new - Usage : $s = Bio::DB::GFF::Segment->new(@args) - Function: create a new segment - Returns : a new Bio::DB::GFF::Segment object - Args : see below - Status : Public - -This method creates a new Bio::DB::GFF::Segment object. Generally -this is called automatically by the Bio::DB::GFF module and -derivatives. - -There are five positional arguments: - - $factory a Bio::DB::GFF::Adaptor to use for database access - $sourceseq ID of the source sequence - $sourceclass class of the source sequence - $start start of the desired segment relative to source sequence - $stop stop of the desired segment relative to source sequence - -=cut - -sub new { - my $class = shift; - my ($factory,$segclass,$segname,$start,$stop) = @_; - $segclass = $segname->class if ref($segname) && $segname->can('class'); - $segclass ||= 'Sequence'; - - $factory or $class->throw("->new(): provide a factory argument"); - $class = ref $class if ref $class; - return bless { factory => $factory, - sourceseq => $segname, - class => $segclass, - start => $start, - stop => $stop, - strand => 0, - },$class; -} - -# read-only accessors - -=head2 factory - - Title : factory - Usage : $s->factory - Function: get the factory object - Returns : a Bio::DB::GFF::Adaptor - Args : none - Status : Public - -This is a read-only accessor for the Bio::DB::GFF::Adaptor object used -to create the segment. 
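A minimal sketch of direct construction follows; note that the code's positional order is factory, class, name, start, stop, and that $factory stands in for an already-constructed Bio::DB::GFF adaptor (normally the Bio::DB::GFF module builds segments for you):

    use Bio::DB::GFF::Segment;

    # $factory: an existing Bio::DB::GFF adaptor (assumed)
    my $segment = Bio::DB::GFF::Segment->new($factory, 'Sequence', 'Contig1', 1, 1000);
    my $adaptor = $segment->factory;   # the same adaptor that was passed in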
- -=cut - -sub factory { shift->{factory} } - -# start, stop, length - -=head2 start - - Title : start - Usage : $s->start - Function: start of segment - Returns : integer - Args : none - Status : Public - -This is a read-only accessor for the start of the segment. - -=cut - -sub start { shift->{start} } - -=head2 end - - Title : end - Usage : $s->end - Function: end of segment - Returns : integer - Args : none - Status : Public - -This is a read-only accessor for the end of the segment. - -=cut - -sub end { shift->{stop} } - -=head2 stop - - Title : stop - Usage : $s->stop - Function: stop of segment - Returns : integer - Args : none - Status : Public - -This is an alias for end(), provided for AcePerl compatibility. - -=cut - -*stop = \&end; - -=head2 length - - Title : length - Usage : $s->length - Function: length of segment - Returns : integer - Args : none - Status : Public - -Returns the length of the segment. Always a positive number. - -=cut - -sub length { abs($_[0]->{start} - $_[0]->{stop})+1 } - - -=head2 strand - - Title : strand - Usage : $s->strand - Function: strand of segment - Returns : +1,0,-1 - Args : none - Status : Public - -Returns the strand on which the segment resides, either +1, 0 or -1. - -=cut - -sub strand { - my $self = shift; - 0; -} - -=head2 low - - Title : low - Usage : $s->low - Function: return lower coordinate - Returns : lower coordinate - Args : none - Status : Public - -Returns the lower coordinate, either start or end. - -=cut - -sub low { - my $self = shift; - my ($start,$stop) = ($self->start,$self->stop); - return $start < $stop ? $start : $stop; -} -*abs_low = \&low; - -=head2 high - - Title : high - Usage : $s->high - Function: return higher coordinate - Returns : higher coordinate - Args : none - Status : Public - -Returns the higher coordinate, either start or end. - -=cut - -sub high { - my $self = shift; - my ($start,$stop) = ($self->start,$self->stop); - return $start > $stop ? $start : $stop; -} -*abs_high = \&high; - -=head2 sourceseq - - Title : sourceseq - Usage : $s->sourceseq - Function: get the segment source - Returns : a string - Args : none - Status : Public - -Returns the name of the source sequence for this segment. - -=cut - -sub sourceseq { shift->{sourceseq} } - -=head2 class - - Title : class - Usage : $s->class([$newclass]) - Function: get the source sequence class - Returns : a string - Args : new class (optional) - Status : Public - -Gets or sets the class for the source sequence for this segment. - -=cut - -sub class { - my $self = shift; - my $d = $self->{class}; - $self->{class} = shift if @_; - $d; -} - -=head2 subseq - - Title : subseq - Usage : $s->subseq($start,$stop) - Function: generate a subsequence - Returns : a Bio::DB::GFF::Segment object - Args : start and end of subsequence - Status : Public - -This method generates a new segment from the start and end positions -given in the arguments. If stop E start, then the strand is reversed. - -=cut - -sub subseq { - my $self = shift; - my ($newstart,$newstop) = @_; - my ($refseq,$start,$stop,$class) = ($self->{sourceseq}, - $self->{start},$self->{stop}, - $self->class); - - # We deliberately force subseq to return objects of type RelSegment - # Otherwise, when we get a subsequence from a Feature object, - # its method and source go along for the ride, which is incorrect. 
- my $new = $self->new_from_segment($self); - if ($start <= $stop) { - @{$new}{qw(start stop)} = ($start + $newstart - 1, $start + $newstop - 1); - } else { - @{$new}{qw(start stop)} = ($start - ($newstart - 1), $start - ($newstop - 1)), - - } - - $new; -} - -=head2 seq - - Title : seq - Usage : $s->seq - Function: get the sequence string for this segment - Returns : a Bio::PrimarySeq - Args : none - Status : Public - -Returns the sequence for this segment as a Bio::PrimarySeq. (-) -strand segments are automatically reverse complemented - -The method is called dna() return the data as a simple sequence -string. - -=cut - -sub seq { - my $self = shift; - my $dna = $self->dna; - require Bio::PrimarySeq unless Bio::PrimarySeq->can('new'); - return Bio::PrimarySeq->new(-id => $self->display_name) unless $dna; - return Bio::PrimarySeq->new(-seq => $dna, - -id => $self->display_name); -} - -=head2 dna - - Title : dna - Usage : $s->dna - Function: get the DNA string for this segment - Returns : a string - Args : none - Status : Public - -Returns the sequence for this segment as a simple string. (-) strand -segments are automatically reverse complemented - -The method is also called protein(). - -=cut - -sub dna { - my $self = shift; - my ($ref,$class,$start,$stop,$strand) - = @{$self}{qw(sourceseq class start stop strand)}; - return $self->factory->dna($ref,$start,$stop,$class); -} - -*protein = \&dna; - - -=head2 primary_seq - - Title : primary_seq - Usage : $s->primary_seq - Function: returns a Bio::PrimarySeqI compatible object - Returns : a Bio::PrimarySeqI object - Args : none - Status : Public - -This is for compatibility with BioPerl's separation of SeqI -from PrimarySeqI. It just returns itself. - -=cut - -#' - -sub primary_seq { shift } - -=head2 type - - Title : type - Usage : $s->type - Function: return the string "feature" - Returns : the string "feature" - Args : none - Status : Public - -This is for future sequence ontology-compatibility and -represents the default type of a feature on the genome - -=cut - -sub type { "feature" } - -=head2 equals - - Title : equals - Usage : $s->equals($d) - Function: segment equality - Returns : true, if two segments are equal - Args : another segment - Status : Public - -Returns true if the two segments have the same source sequence, start and stop. - -=cut - -sub equals { - my $self = shift; - my $peer = shift; - return unless defined $peer; - return $self->asString eq $peer unless ref($peer) && $peer->isa('Bio::DB::GFF::Segment'); - return $self->{start} eq $peer->{start} - && $self->{stop} eq $peer->{stop} - && $self->{sourceseq} eq $peer->{sourceseq}; -} - -=head2 asString - - Title : asString - Usage : $s->asString - Function: human-readable string for segment - Returns : a string - Args : none - Status : Public - -Returns a human-readable string representing this sequence. Format -is: - - sourceseq/start,stop - -=cut - -sub asString { - my $self = shift; - my $label = $self->refseq; - my $start = $self->start; - my $stop = $self->stop; - return "$label:$start,$stop"; -} - -=head2 clone - - Title : clone - Usage : $copy = $s->clone - Function: make a copy of this segment - Returns : a Bio::DB::GFF::Segment object - Args : none - Status : Public - -This method creates a copy of the segment and returns it. 
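For example (a sketch that assumes an existing $segment spanning Contig1:1,1000):

    my $copy = $segment->clone;
    print "$copy\n";                             # '""' overload calls asString(), e.g. "Contig1:1,1000"
    print "same\n" if $copy->equals($segment);   # compares sourceseq, start and stop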
- -=cut - -# deep copy of the thing -sub clone { - my $self = shift; - my %h = %$self; - return bless \%h,ref($self); -} - -=head2 error - - Title : error - Usage : $error = $s->error([$new_error]) - Function: get or set the last error - Returns : a string - Args : an error message (optional) - Status : Public - -In case of a fault, this method can be used to obtain the last error -message. Internally it is called to set the error message. - -=cut - -sub error { - my $self = shift; - my $g = $self->{error}; - $self->{error} = shift if @_; - $g; -} - -=head1 Relative Addressing Methods - -The following methods are provided for compatibility with -Bio::DB::GFF::RelSegment, which provides relative addressing -functions. - -=head2 abs_start - - Title : abs_start - Usage : $s->abs_start - Function: the absolute start of the segment - Returns : an integer - Args : none - Status : Public - -This is an alias to start(), and provided for API compatibility with -Bio::DB::GFF::RelSegment. - -=cut - -*abs_start = \&start; - -=head2 abs_end - - Title : abs_end - Usage : $s->abs_end - Function: the absolute stop of the segment - Returns : an integer - Args : none - Status : Public - -This is an alias to stop(), and provided for API compatibility with -Bio::DB::GFF::RelSegment. - -=cut - -*abs_stop = \&stop; -*abs_end = \&stop; - -=head2 abs_strand - - Title : abs_strand - Usage : $s->abs_strand - Function: the absolute strand of the segment - Returns : +1,0,-1 - Args : none - Status : Public - -This is an alias to strand(), and provided for API compatibility with -Bio::DB::GFF::RelSegment. - -=cut - -sub abs_strand { - my $self = shift; - return $self->abs_end <=> $self->abs_start; -} - -=head2 abs_ref - - Title : abs_ref - Usage : $s->abs_ref - Function: the reference sequence for this segment - Returns : a string - Args : none - Status : Public - -This is an alias to sourceseq(), and is here to provide API -compatibility with Bio::DB::GFF::RelSegment. - -=cut - -*abs_ref = \&sourceseq; - -=head2 refseq - - Title : refseq - Usage : $s->refseq - Function: get or set the reference sequence - Returns : a string - Args : none - Status : Public - -Examine or change the reference sequence. This is an alias to -sourceseq(), provided here for API compatibility with -Bio::DB::GFF::RelSegment. - -=cut - -*refseq = \&sourceseq; - -=head2 ref - - Title : ref - Usage : $s->refseq - Function: get or set the reference sequence - Returns : a string - Args : none - Status : Public - -An alias for refseq() - -=cut - -sub ref { shift->refseq(@_) } - -=head2 seq_id - - Title : seq_id - Usage : $ref = $s->seq_id - Function: get the reference sequence in a LocationI-compatible way - Returns : a string - Args : none - Status : Public - -An alias for refseq() but only allows reading. - -=cut - -sub seq_id { shift->refseq } -*seqname = \&seq_id; - -=head2 truncated - - Title : truncated - Usage : $truncated = $s->truncated - Function: Flag indicating that the segment was truncated during creation - Returns : A boolean flag - Args : none - Status : Public - -This indicates that the sequence was truncated during creation. The -returned flag is undef if no truncation occurred. If truncation did -occur, the flag is actually an array ref in which the first element is -true if truncation occurred on the left, and the second element -occurred if truncation occurred on the right. 
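A short sketch of checking the flag (it assumes a $segment created while strict bounds checking was enabled on the database, as exercised in the test suite):

    if (my $flag = $segment->truncated) {
        my ($left, $right) = @$flag;
        warn "segment was truncated on the left\n"  if $left;
        warn "segment was truncated on the right\n" if $right;
    }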
- -=cut - -sub truncated { - my $self = shift; - my $hash = $self->{truncated} or return; - CORE::ref($hash) eq 'HASH' or return [1,1]; # paranoia -- not that this would ever happen ;-) - return [$hash->{start},$hash->{stop}]; -} - -=head2 Bio::RangeI Methods - -The following Bio::RangeI methods are supported: - -overlaps(), contains(), equals(),intersection(),union(),overlap_extent() - -=cut - -sub overlaps { - my $self = shift; - my($other,$so) = @_; - if ($other->isa('Bio::DB::GFF::RelSegment')) { - return if $self->abs_ref ne $other->abs_ref; - } - $self->SUPER::overlaps(@_); -} - -sub contains { - my $self = shift; - my($other,$so) = @_; - if ($other->isa('Bio::DB::GFF::RelSegment')) { - return if $self->abs_ref ne $other->abs_ref; - } - $self->SUPER::contains(@_); -} -#sub equals { -# my $self = shift; -# my($other,$so) = @_; -# if ($other->isa('Bio::DB::GFF::RelSegment')) { -# return if $self->abs_ref ne $other->abs_ref; -# } -# $self->SUPER::equals(@_); -#} -sub intersection { - my $self = shift; - my($other,$so) = @_; - if ($other->isa('Bio::DB::GFF::RelSegment')) { - return if $self->abs_ref ne $other->abs_ref; - } - $self->SUPER::intersection(@_); -} -sub union { - my $self = shift; - my($other) = @_; - if ($other->isa('Bio::DB::GFF::RelSegment')) { - return if $self->abs_ref ne $other->abs_ref; - } - $self->SUPER::union(@_); -} - -sub overlap_extent { - my $self = shift; - my($other) = @_; - if ($other->isa('Bio::DB::GFF::RelSegment')) { - return if $self->abs_ref ne $other->abs_ref; - } - $self->SUPER::overlap_extent(@_); -} - - -=head2 Bio::SeqI implementation - -=cut - -=head2 primary_id - - Title : primary_id - Usage : $unique_implementation_key = $obj->primary_id; - Function: Returns the unique id for this object in this - implementation. This allows implementations to manage their - own object ids in a way the implementation can control - clients can expect one id to map to one object. - - For sequences with no accession number, this method should - return a stringified memory location. - - Returns : A string - Args : None - Status : Virtual - - -=cut - -sub primary_id { - my ($obj,$value) = @_; - - if( defined $value) { - $obj->{'primary_id'} = $value; - } - if( ! exists $obj->{'primary_id'} ) { - return "$obj"; - } - return $obj->{'primary_id'}; -} - - -=head2 display_name - - Title : display_name - Usage : $id = $obj->display_name or $obj->display_name($newid); - Function: Gets or sets the display id, also known as the common name of - the Seq object. - - The semantics of this is that it is the most likely string - to be used as an identifier of the sequence, and likely to - have "human" readability. The id is equivalent to the LOCUS - field of the GenBank/EMBL databanks and the ID field of the - Swissprot/sptrembl database. In fasta format, the >(\S+) is - presumed to be the id, though some people overload the id - to embed other information. Bioperl does not use any - embedded information in the ID field, and people are - encouraged to use other mechanisms (accession field for - example, or extending the sequence object) to solve this. - - Notice that $seq->id() maps to this function, mainly for - legacy/convenience issues. - Returns : A string - Args : None or a new id - -Note, this used to be called display_id(), and this name is preserved for -backward compatibility. The default is to return the seq_id(). 
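For example (sketch; $segment is assumed, and the name shown comes from the test data):

    my $name = $segment->display_name;   # same as $segment->seq_id, e.g. 'Contig1'
    my $id   = $segment->primary_id;     # falls back to the stringified object if never set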
- -=cut - -sub display_name { shift->seq_id } -*display_id = \&display_name; - -=head2 accession_number - - Title : accession_number - Usage : $unique_biological_key = $obj->accession_number; - Function: Returns the unique biological id for a sequence, commonly - called the accession_number. For sequences from established - databases, the implementors should try to use the correct - accession number. Notice that primary_id() provides the - unique id for the implementation, allowing multiple objects - to have the same accession number in a particular implementation. - - For sequences with no accession number, this method should return - "unknown". - Returns : A string - Args : None - - -=cut - -sub accession_number { - return 'unknown'; -} - -=head2 alphabet - - Title : alphabet - Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } - Function: Returns the type of sequence being one of - 'dna', 'rna' or 'protein'. This is case sensitive. - - This is not called because this would cause - upgrade problems from the 0.5 and earlier Seq objects. - - Returns : a string either 'dna','rna','protein'. NB - the object must - make a call of the type - if there is no type specified it - has to guess. - Args : none - Status : Virtual - - -=cut - -sub alphabet{ - return 'dna'; # no way this will be anything other than dna! -} - -=head2 desc - - Title : desc - Usage : $seqobj->desc($string) or $seqobj->desc() - Function: Sets or gets the description of the sequence - Example : - Returns : The description - Args : The description or none - - -=cut - -sub desc { shift->asString } - -*description = \&desc; - -=head2 species - - Title : species - Usage : $species = $seq->species() or $seq->species($species) - Function: Gets or sets the species - Example : - Returns : Bio::Species object - Args : None or Bio::Species object - -See L for more information - -=cut - -sub species { - my ($self, $species) = @_; - if ($species) { - $self->{'species'} = $species; - } else { - return $self->{'species'}; - } -} - -=head2 annotation - - Title : annotation - Usage : $ann = $seq->annotation or $seq->annotation($annotation) - Function: Gets or sets the annotation - Example : - Returns : Bio::Annotation object - Args : None or Bio::Annotation object - -See L for more information - -=cut - -sub annotation { - my ($obj,$value) = @_; - if( defined $value || ! defined $obj->{'annotation'} ) { - $value = Bio::Annotation::Collection->new() unless defined $value; - $obj->{'annotation'} = $value; - } - return $obj->{'annotation'}; - -} - -=head2 is_circular - - Title : is_circular - Usage : if( $obj->is_circular) { /Do Something/ } - Function: Returns true if the molecule is circular - Returns : Boolean value - Args : none - -=cut - -sub is_circular{ - return 0; -} - - -1; -__END__ - -=head1 BUGS - -Report them please. - -=head1 SEE ALSO - -L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=head1 CONTRIBUTORS - -Jason Stajich Ejason@bioperl.orgE. 
- -=cut - diff --git a/lib/Bio/DB/GFF/Typename.pm b/lib/Bio/DB/GFF/Typename.pm deleted file mode 100644 index 6768c18cb..000000000 --- a/lib/Bio/DB/GFF/Typename.pm +++ /dev/null @@ -1,188 +0,0 @@ -=head1 NAME - -Bio::DB::GFF::Typename -- The name of a feature type - -=head1 SYNOPSIS - - use Bio::DB::GFF; - - my $type = Bio::DB::GFF::Typename->new(similarity => 'BLAT_EST_GENOME'); - my $segment = $segment->features($type); - -=head1 DESCRIPTION - -Bio::DB::GFF::Typename objects encapsulate the combination of feature -method and source used by the GFF flat file format. They can be used -in the Bio::DB::GFF modules wherever a feature type is called for. - -Since there are relatively few types and many features, this module -maintains a memory cache of unique types so that two features of the -same type will share the same Bio::DB::GFF::Typename object. - -=head1 METHODS - -=cut - -package Bio::DB::GFF::Typename; - -use strict; -use overload - '""' => 'asString', - fallback => 1; - - -use base qw(Bio::Root::Root Bio::Das::FeatureTypeI); - -# cut down on the number of equivalent objects we have to create -my %OBJECT_CACHE; - -=head2 new - - Title : new - Usage : $type = Bio::DB::GFF::Typename->new($method,$source) - Function: create a new Bio::DB::GFF::Typename object - Returns : a new Bio::DB::GFF::Typename object - Args : method and source - Status : Public - -=cut - -sub new { - my $package = shift; - my ($method,$source) = @_; - $method ||= ''; - $source ||= ''; - if ($source eq '' && $method =~ /^([\w\-\.]+):([\w\-\.]*)$/) { - $method = $1; - $source = $2; - } - return $OBJECT_CACHE{"$method:$source"} ||= bless [$method,$source],$package; -} - -=head2 method - - Title : method - Usage : $method = $type->method([$newmethod]) - Function: get or set the method - Returns : a method name - Args : new method name (optional) - Status : Public - -=cut - -sub method { - my $self = shift; - my $d = $self->[0]; - $self->[0] = shift if @_; - $d; -} - - -=head2 source - - Title : source - Usage : $source = $type->source([$newsource]) - Function: get or set the source - Returns : a source name - Args : new source name (optional) - Status : Public - -=cut - -sub source { - my $self = shift; - my $d = $self->[1]; - $self->[1] = shift if @_; - $d; -} - -=head2 asString - - Title : asString - Usage : $string = $type->asString - Function: get the method and source as a string - Returns : a string in "method:source" format - Args : none - Status : Public - -This method is used by operator overloading to overload the '""' -operator. - -=cut - -sub asString { - $_[0]->[1] ? join ':',@{$_[0]} : $_[0]->[0]; -} - -=head2 clone - - Title : clone - Usage : $new_clone = $type->clone; - Function: clone this object - Returns : a new Bio::DB::GFF::Typename object - Args : none - Status : Public - -This method creates an exact copy of the object. - -=cut - -sub clone { - my $self = shift; - return bless [@$self],ref $self; -} - -=head2 match - - Title : match - Usage : $boolean = $type->match($type_or_string) - Function: fuzzy match on types - Returns : a flag indicating that the argument matches the object - Args : a Bio::DB::GFF::typename object, or a string in method:source format - Status : Public - -This match allows Sequence:Link and Sequence: to match, but not -Sequence:Link and Sequence:Genomic_canonical. 
- -=cut - -sub match { - my $self = shift; - my $target = shift; - my ($method,$source); - - if (UNIVERSAL::isa($target,'Bio::DB::GFF::Typename')) { - ($method,$source) = ($target->method,$target->source); - } else { - ($method,$source) = split /:/,$target; - } - - $source ||= ''; # quash uninit variable warnings - - return if $method ne '' && $self->method ne '' && $method ne $self->method; - return if $source ne '' && $self->source ne '' && $source ne $self->source; - 1; -} - -1; - -=head1 BUGS - -This module is still under development. - -=head1 SEE ALSO - -L, L, L - -=head1 AUTHOR - -Lincoln Stein Elstein@cshl.orgE. - -Copyright (c) 2001 Cold Spring Harbor Laboratory. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. - -=cut - -1; diff --git a/t/LocalDB/BioDBGFF.t b/t/LocalDB/BioDBGFF.t deleted file mode 100644 index 8a5e7e157..000000000 --- a/t/LocalDB/BioDBGFF.t +++ /dev/null @@ -1,454 +0,0 @@ -# -*-Perl-*- Test Harness script for Bioperl -# $Id$ - -use strict; -use Module::Build; -use Data::Dumper; - -BEGIN { - use lib '.'; - use Bio::Root::Test; - - test_begin(-tests => 275); - - use_ok('Bio::DB::GFF'); -} - -my $fasta_files = test_input_file('dbfa'); -my $gff_file1 = test_input_file('biodbgff', 'test.gff'); -my $gff_file2 = test_input_file('biodbgff', 'test.gff3'); - -my $build = Module::Build->current; -my $test_dsn = $build->notes('test_dsn'); - -my $adaptor = $test_dsn ? $test_dsn : 'memory'; -$adaptor = shift if @ARGV; - -if ($adaptor =~ /sqlite/i) { - $adaptor = 'memory'; -} - -my @args; -if ($adaptor =~ /^dbi/) { - my $cfg = {}; - $cfg->{dbd_driver} = $build->notes('dbd_driver'); - $cfg->{test_db} = $build->notes('test_db'); - $cfg->{test_host} = $build->notes('test_host'); - $cfg->{test_user} = $build->notes('test_user'); - $cfg->{test_pass} = $build->notes('test_pass'); - $cfg->{test_dsn} = $build->notes('test_dsn'); - - $adaptor = "dbi::$cfg->{dbd_driver}" if $cfg->{dbd_driver}; - @args = ( '-adaptor' => $adaptor, - '-dsn' => $cfg->{test_dsn}, - ); - push @args,('-user' => $cfg->{test_user}) if $cfg->{test_user}; - push @args,('-pass' => $cfg->{test_pass}) if $cfg->{test_pass}; -} else { - @args = ('-adaptor' => $adaptor, - '-create' => 1); -} - -push @args,('-aggregators' => ['transcript','processed_transcript']); - -SKIP: { -for my $FILE ($gff_file1,$gff_file2) { - - my $db = eval { Bio::DB::GFF->new(@args) }; - skip "DB load failed? Skipping all! 
$@", 278 if $@; - ok($db); - - $db->debug(0); - $db->gff3_name_munging(1); - - # set the preferred groups - $db->preferred_groups( [ 'transcript', 'gene', 'mRNA' ] ); - my @pg = $db->preferred_groups; - is(scalar(@pg), 3); - is($pg[1], 'gene'); - - # exercise the loader - ok($db->initialize(1)); - ok($db->load_gff($FILE)); - ok($db->load_fasta($fasta_files)); - - # exercise db->types - my @types = sort $db->types; - is(scalar @types,11); - is($types[0],'CDS:confirmed'); - is($types[-1],'transposon:tc1'); - my %types = $db->types('-enumerate'=>1); - is($types{'transposon:tc1'},2); - - # exercise segment - my $segment1 = $db->segment('Contig1'); - - ok($segment1); - is($segment1->length,37450); - is($segment1->start,1); - is($segment1->end,37450); - is($segment1->strand,1); - - my $segment2 = $db->segment('Contig1',1=>1000); - is($segment2->length,1000); - is($segment2->start,1); - is($segment2->end,1000); - is($segment2->strand,1); - - my $segment3 = $db->segment('Contig1',10=>1); - is($segment3->start,10); - is($segment3->end,1); - is($segment3->strand,-1); - - # exercise attribute fetching - my @t = $db->fetch_feature_by_name(Transcript => 'trans-1'); - my ($t) = grep {$_->type eq 'transcript:confirmed'} @t; - is($t->attributes('Note'),'function unknown'); - is(join(' ',sort $t->attributes('Gene')),'abc-1 xyz-2'); - my $att = $t->attributes; - is(scalar @{$att->{Gene}},2); - @t = sort {$a->display_name cmp $b->display_name} $db->fetch_feature_by_attribute('Gene'=>'abc-1'); - cmp_ok(@t,'>',0); - is($t[0], $t); - my $seg = $db->segment('Contig1'); - @t = $seg->features(-attributes=>{'Gene'=>'abc-1'}); - cmp_ok(@t,'>',0); - is($seg->feature_count, 17); - @t = $seg->features(-attributes=>{'Gene'=>'xyz-2',Note=>'Terribly interesting'}); - is(@t,1); - - # exercise dna() a bit - my $dna = $segment2->dna; - is(length $dna,1000); - is(substr($dna,0,10),'gcctaagcct'); - is($segment3->dna,'aggcttaggc'); - is($segment1->dna, $db->dna($segment1->ref)); - - # exercise ref() - my $segment4 = $db->segment('-name'=>'c128.1','-class'=>'Transposon'); - is($segment4->length,1000); - is($segment4->start,1); - is($segment4->end,1000); - is($segment4->ref,'c128.1'); - is($segment4->strand,1); - ok(!$segment4->absolute); - - $segment4->absolute(1); - ok($segment4->absolute); - is($segment4->ref,'Contig1'); - is($segment4->start,5001); - $segment4->absolute(0); - my $tmp = $db->segment('Contig1',5001=>6000); - is($segment4->dna,$tmp->dna); - - $segment4->ref('Contig1'); - is($segment4->ref,'Contig1'); - is($segment4->start,5001); - is($segment4->end,6000); - - my $segment5 = $db->segment('-name'=>'c128.2','-class'=>'Transposon'); - is($segment5->length,1000); - is($segment5->start,1); - is($segment5->end,1000); - is($segment5->ref,'c128.2'); - is($segment5->strand,1); - - $tmp = $db->segment('Contig1',9000,8001); - is($segment5->dna,$tmp->dna); - $segment5->absolute(1); - is($segment5->strand,-1); - - # rel/rel addressing - # first two positive strand features - $segment4 = $db->segment('-name'=>'c128.1','-class'=>'Transposon'); - my $start4 = $segment4->abs_start; - $segment5 = $db->segment('Transcript' => 'trans-1'); - my $start5 = $segment5->abs_start; - $segment4->ref($segment5); - is($segment4->strand,1); - is($segment4->start,$start4-$start5+1); - is($segment4->stop,$start4-$start5+$segment4->length); - - $segment4->ref('Transposon' => 'c128.1'); - $segment5->ref('Transcript' => 'trans-1'); - $segment5->ref($segment4); - is($segment5->start,$start5-$start4+1); - - # now a positive on a negative strand 
feature - my $segment6 = $db->segment('Transcript'=>'trans-2'); - my $start6 = $segment6->abs_start; - is($segment6->strand,1); - is($segment6->abs_strand,-1); - $segment6->ref($segment4); - is($segment6->start,$start6-$start4+1); - is($segment6->strand,-1); - - $segment4->ref($segment6); - is($segment4->start,$start6-$start4+1); - is($segment4->strand,-1); - is($segment4->ref,$segment6); - - # the reference sequence shouldn't affect the dna - $segment6 = $db->segment('Transcript'=>'trans-2'); - $dna = $segment6->dna; - $segment6->ref($segment4); - is($segment6->dna,$dna); - - # segments should refuse to accept a reference sequence on a foreign segment - undef $@; - my $result = eval { $segment6->ref('Contig2') }; - ok(!$result); - like($@, qr/are on different sequence segments/); - - # types across a segment - $segment1 = $db->segment('Contig1'); - @types = sort $segment1->types; - is(scalar @types,6); - is($types[0],'CDS:confirmed'); - is($types[-1],'transposon:tc1'); - %types = $segment1->types('-enumerate'=>1); - is($types{'similarity:est'},3); - - # features across a segment - my @features = $segment1->features('-automerge'=>0); - is(scalar @features,17); - my %types_seen; - foreach (@features) { - $types_seen{$_->type}++; - } - my $inconsistency = 0; - foreach (keys %types,keys %types_seen) { - $inconsistency++ unless $types_seen{$_} == $types{$_}; - } - ok(!$inconsistency); - - @features = sort {$a->start<=>$b->start} @features; - - # make sure that we can use features to get at dna - is($features[0]->dna,$db->segment('Contig1',$features[0]->start,$features[0]->end)->dna); - - # check three forward features and three reverse features - # (This depends on the test.gff data) - for (1..3,-3..-1) { - $segment2 = $db->segment($features[$_],50,100); - if ($features[$_]->strand >= 0) { - is($segment2->dna,$db->segment('Contig1', - $features[$_]->start+50-1, - $features[$_]->start+100-1)->dna) - } else { - is($segment2->dna,$db->segment('Contig1', - $features[$_]->start-50+1, - $features[$_]->start-100+1)->dna) - } - } - - # exercise the aggregator - my $aggregator = Bio::DB::GFF::Aggregator->new('-method' => 'aggregated_transcript', - '-main_method' => 'transcript', - '-sub_parts' => ['exon','CDS']); - $db->add_aggregator($aggregator); - $segment1 = $db->segment('Contig1'); - @features = sort $segment1->features('aggregated_transcript'); # sort so that trans-1 comes first - is(scalar @features,2); - cmp_ok($features[0]->Exon, '>', 0); - cmp_ok($features[0]->Cds,'>', 0); - - # Test that sorting is correct. The way that test.gff is set up, the lower one is - # on the + strand and the higher is on the -. 
- @features = sort {$a->start <=> $b->start} @features; - is($features[0]->strand,1); - is($features[1]->strand,-1); - - my $last = 0; - $inconsistency = 0; - foreach ($features[0]->Exon) { - $inconsistency++ if $_->start > $_->end; - $inconsistency++ if $last && $_->start < $last; - $last = $_->start; - } - ok(!$inconsistency); - - $inconsistency = $last = 0; - foreach ($features[1]->Exon) { - $inconsistency++ if $_->start < $_->end; - $inconsistency++ if $last && $_->start > $last; - $last = $_->start; - } - ok(!$inconsistency); - - # relative addressing in aggregated features - my $transcript1 = $db->segment($features[0]); - $transcript1->ref($features[0]); - my @overlap = sort {$a->start <=> $b->start } $transcript1->features; - is(scalar(@overlap),5); - is($overlap[0]->start,-999); - - $transcript1 = $db->segment('Transcript' => 'trans-1'); - @overlap = sort {$a->start <=> $b->start } $transcript1->features; - is($overlap[0]->start,-999); - - # test strandedness of features - $segment1 = $db->segment('-class' => 'Transcript', - '-name' => 'trans-3', - '-start' => 1, - '-stop' => 6000); - is($segment1->strand,1); - @overlap = sort {$a->start <=> $b->start} $segment1->features('transcript'); - is(scalar(@overlap),2); - is($overlap[0]->name,'trans-3'); - is($overlap[1]->name,'trans-4'); - is($overlap[0]->strand,1); - is($overlap[1]->strand,-1); - - # testing feature id and group_id - my $tf = $overlap[0]; - ok(defined $tf->id); - my $t1 = $db->fetch_feature_by_id($tf->id); - is($t1->id,$tf->id); - - SKIP: { - if (defined $tf->group_id) { - my $t2 = $db->fetch_feature_by_gid($tf->group_id); - is($t2->group_id,$tf->group_id); - is($t2->group_id,$t1->group_id); - } else { - skip("fetch_feature_by_gid() not implemented by this adaptor",2); - } - } - - $segment1 = $db->segment('-class' => 'Transcript', - '-name' => 'trans-4', - '-start' => 1, - '-stop' => 6000); - is($segment1->strand,1); - @overlap = sort {$a->start <=> $b->start} $segment1->features('transcript'); - is($overlap[0]->name,'trans-4'); - is($overlap[1]->name,'trans-3'); - is($overlap[0]->strand,1); - is($overlap[1]->strand,-1); - - @overlap = sort {$a->start <=> $b->start} $segment1->features('Component'); - is($overlap[0]->strand,0); - -SKIP: { - # test preferred group assignments - if ($FILE =~ /\.gff$/) { - my @gene = $db->get_feature_by_name( gene => 'gene-9' ); - my @mrna = $db->get_feature_by_name( mRNA => 'trans-9' ); - is($gene[0]->ref, 'Contig4'); - is(scalar(@gene), 2); - is(scalar(@mrna), 1); - } else { - skip('preferred groups are not supported by gff3',3); - } -} - - # test iterator across a segment - $segment1 = $db->segment('Contig1'); - my $i = $segment1->features('-automerge'=>0,'-iterator'=>1); - my %strand; - while (my $s = $i->next_feature) { - $strand{$s->strand}++; - } - is(keys %strand, 3); - - # test iterator across entire database - $i = $db->features('-automerge'=>0,'-iterator'=>1); - %strand = (); - while (my $s = $i->next_feature) { - $strand{$s->strand}++; - } - is(keys %strand, 3); - - # test iterator across a segment, limited by an attribute - $i = $seg->get_feature_stream(-attributes=>{'Gene'=>'abc-1',Note=>'function unknown'}); - my $count = 0; - while ($i->next_seq) { - $count++; - } - is($count,2); - - # test that aliases work - my $st1 = $db->segment(Transcript => 'trans-3'); - ok($st1); - my $st2 = $db->segment(Transcript => 'trans-18'); # this is an alias! 
- ok($st2); - is($st1,$st2); - my @transcripts = $st1->features('transcript'); - is(($transcripts[0]->aliases)[0],'trans-18'); - - # test truncation - $db->strict_bounds_checking(1); - my $tseg = $db->segment(-name=>'trans-1',-class=>'Transcript',-start=>1,-stop=>500); - ok(!$tseg->truncated); - $tseg = $db->segment(-name=>'trans-1',-class=>'Transcript',-start=>1,-stop=>50000); - ok($tseg->truncated); - $db->strict_bounds_checking(0); - $tseg = $db->segment(-name=>'trans-1',-class=>'Transcript',-start=>1,-stop=>50000); - ok(!$tseg->truncated); - - # test the processed_transcript aggregator - $db->clear_aggregators; - $db->add_aggregator('processed_transcript'); - my @f = $db->fetch_feature_by_name(mRNA => 'trans-8'); - is(scalar @f,1); - is($f[0]->length,35000-32000+1); - is(scalar $f[0]->CDS,3); - is(scalar $f[0]->UTR,2); - - # test deletions - # segment delete() method - my $clone = $db->segment(Clone=>'M7.3'); - my $overlapping_feature_count = $clone->features(-range_type =>'overlaps'); - my $contained_feature_count = $clone->features(-range_type =>'contains'); - is(scalar $clone->delete(-range_type=>'contains'),$contained_feature_count); - is(scalar $clone->features,$overlapping_feature_count - $contained_feature_count); - - # database delete() method - is($db->delete(-type=>['mRNA:confirmed','transposon:tc1']),4); - is($db->delete(-type=>'UTR',-ref=>'Contig29'),undef); - is($db->delete(-type=>'CDS',-ref=>'AL12345.2',-class=>'Clone'),3); - is($db->delete_features(1,2,3),3); - - SKIP: { - $result = eval { - is($db->delete_groups(1,2,3,4,5),5); - my @features = $db->get_feature_by_name(Sequence => 'Contig2'); - is($db->delete_groups(@features),1); - 1; - }; - if (!$result && $@ =~ /not implemented/i) { - skip("delete_groups() not implemented by this adaptor",2); - } - } - - SKIP: { - test_skip(-tests => 1, -excludes_os => 'mswin'); - - # test ability to pass adaptors across a fork - if (my $child = open(F,"-|")) { # parent reads from child - ok(scalar ); - close F; - } - else { # in child - $db->clone; - my @f = $db->features(); - print @f>0; - exit 0; - } - } - - ok(!defined eval{$db->delete()}); - ok($db->delete(-force=>1)); - is(scalar $db->features,0); - ok(!$db->segment('Contig1')); - -} - -} - - - -END { - unlink $fasta_files."/directory.index"; -} diff --git a/t/data/biodbgff/test.gff b/t/data/biodbgff/test.gff deleted file mode 100644 index 067425584..000000000 --- a/t/data/biodbgff/test.gff +++ /dev/null @@ -1,42 +0,0 @@ -## sequence-region Contig1 1 37450 -Contig1 confirmed transcript 1001 2000 42 + . Transcript trans-1; Gene "abc-1"; Gene "xyz-2"; Note "function unknown" -Contig1 confirmed exon 1001 1100 . + . Transcript trans-1 -Contig1 confirmed exon 1201 1300 . + . Transcript trans-1 -Contig1 confirmed exon 1401 1450 . + . Transcript trans-1 -Contig1 confirmed CDS 1051 1100 . + 0 Transcript trans-1 -Contig1 confirmed CDS 1201 1300 . + 2 Transcript trans-1 -Contig1 confirmed CDS 1401 1440 . + 0 Transcript trans-1 -Contig1 est similarity 1001 1100 96 . . Target "EST:CEESC13F" 1 100 -Contig1 est similarity 1201 1300 99 . . Target "EST:CEESC13F" 101 200 -Contig1 est similarity 1401 1450 99 . . Target "EST:CEESC13F" 201 250 -Contig1 tc1 transposon 5001 6000 . + . Transposon c128.1 -Contig1 tc1 transposon 8001 9000 . - . Transposon c128.2 -Contig1 confirmed transcript 30001 31000 . - . Transcript trans-2; Gene "xyz-2"; Note "Terribly interesting" -Contig1 confirmed exon 30001 30100 . - . 
Transcript trans-2; Gene "abc-1"; Note "function unknown" -Contig1 confirmed exon 30701 30800 . - . Transcript trans-2 -Contig1 confirmed exon 30801 31000 . - . Transcript trans-2 - -## sequence-region Contig2 1 37450 -Contig2 clone Component 1 2000 . . . Target "Clone:AL12345.1" 1 2000; Note "Terribly interesting" -Contig2 clone Component 2001 5000 . . . Target "Clone:AL11111.1" 6000 3001 -Contig2 clone Component 5001 20000 . . . Target "Clone:AC13221.2" 1 15000 -Contig2 clone Component 2001 37450 . . . Target "Clone:M7.3" 1001 36450 -Contig2 predicted transcript 2501 4500 . + . Transcript trans-3 ; Alias trans-18 -Contig2 predicted transcript 5001 8001 . - . Transcript trans-4 - - -#processed_transcript -Contig3 clone Component 1 50000 . . . Clone AL12345.2 -Contig3 confirmed mRNA 32000 35000 . + . mRNA trans-8 -Contig3 confirmed UTR 32000 32100 . + . mRNA trans-8 -Contig3 confirmed CDS 32101 33000 . + . mRNA trans-8 -Contig3 confirmed CDS 34000 34500 . + . mRNA trans-8 -Contig3 confirmed CDS 34600 34900 . + . mRNA trans-8 -Contig3 confirmed UTR 34901 35000 . + . mRNA trans-8 - -## preferred group assignments -Contig4 clone Component 1 50000 . . . Clone ABC123 -Contig4 confirmed gene 32000 35000 . + . Misc thing1 ; gene gene-9 -Contig4 confirmed mRNA 32000 35000 . + . Misc thing2 ; mRNA trans-9 ; gene gene-9 -Contig4 confirmed CDS 32000 35000 . + . Misc thing3 ; mRNA trans-9 - diff --git a/t/data/biodbgff/test.gff3 b/t/data/biodbgff/test.gff3 deleted file mode 100644 index 16d0a68be..000000000 --- a/t/data/biodbgff/test.gff3 +++ /dev/null @@ -1,44 +0,0 @@ -##gff-version 3 -##sequence-region Contig1 1 37450 -#Contig1 reference Component 1 37450 . . . Name=Contig1 -Contig1 confirmed transcript 1001 2000 42 + . ID=Transcript:trans-1;Gene=abc-1;Gene=xyz-2;Note=function+unknown -Contig1 confirmed exon 1001 1100 . + . ID=Transcript:trans-1 -Contig1 confirmed exon 1201 1300 . + . ID=Transcript:trans-1 -Contig1 confirmed exon 1401 1450 . + . ID=Transcript:trans-1 -Contig1 confirmed CDS 1051 1100 . + 0 ID=Transcript:trans-1 -Contig1 confirmed CDS 1201 1300 . + 2 ID=Transcript:trans-1 -Contig1 confirmed CDS 1401 1440 . + 0 ID=Transcript:trans-1 -Contig1 est similarity 1001 1100 96 . . Target=EST:CEESC13F 1 100 + -Contig1 est similarity 1201 1300 99 . . Target=EST:CEESC13F 101 200 + -Contig1 est similarity 1401 1450 99 . . Target=EST:CEESC13F 201 250 + -Contig1 tc1 transposon 5001 6000 . + . ID=Transposon:c128.1 -Contig1 tc1 transposon 8001 9000 . - . ID=Transposon:c128.2 -Contig1 confirmed transcript 30001 31000 . - . ID=Transcript:trans-2;Gene=xyz-2;Note=Terribly+interesting -Contig1 confirmed exon 30001 30100 . - . ID=Transcript:trans-2;Gene=abc-1;Note=function+unknown -Contig1 confirmed exon 30701 30800 . - . ID=Transcript:trans-2 -Contig1 confirmed exon 30801 31000 . - . ID=Transcript:trans-2 - -##sequence-region Contig2 1 37450 -Contig2 clone Component 1 2000 . . . Target=Clone:AL12345.1 1 2000 +;Note=Terribly+interesting -Contig2 clone Component 2001 5000 . . . Target=Clone:AL11111.1 6000 3001 + -Contig2 clone Component 5001 20000 . . . Target=Clone:AC13221.2 1 15000 + -Contig2 clone Component 2001 37450 . . . Target=Clone:M7.3 1001 36450 + -Contig2 predicted transcript 2501 4500 . + . ID=Transcript:trans-3;Alias=trans-18 -Contig2 predicted transcript 5001 8001 . - . ID=Transcript:trans-4 - - -#processed_transcript -Contig3 clone Component 1 50000 . . . ID=Clone:AL12345.2 -Contig3 confirmed mRNA 32000 35000 . + . ID=mRNA:trans-8 -Contig3 confirmed UTR 32000 32100 . + . 
ID=mRNA:trans-8 -Contig3 confirmed CDS 32101 33000 . + . ID=mRNA:trans-8 -Contig3 confirmed CDS 34000 34500 . + . ID=mRNA:trans-8 -Contig3 confirmed CDS 34600 34900 . + . ID=mRNA:trans-8 -Contig3 confirmed UTR 34901 35000 . + . ID=mRNA:trans-8 - -## preferred group assignments -Contig4 clone Component 1 50000 . . . ID=Clone:ABC123 -Contig4 confirmed gene 32000 35000 . + . ID=Misc:thing1;gene=gene-9 -Contig4 confirmed mRNA 32000 35000 . + . ID=Misc:thing2;mRNA=trans-9;gene=gene-9 -Contig4 confirmed CDS 32000 35000 . + . ID=Misc:thing3;mRNA=trans-9 - -- 2.11.4.GIT