maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / DB / GFF / Adaptor / dbi.pm
blob25ac3b12aaa3255dfb8a478fa2b27d203bb436ae
2 =head1 NAME
4 Bio::DB::GFF::Adaptor::dbi -- Database adaptor for DBI (SQL) databases
6 =head1 SYNOPSIS
8 See L<Bio::DB::GFF>
10 =head1 DESCRIPTION
12 This is the base class for DBI-based adaptors. It does everything
13 except generating the text of the queries to be used. See the section
14 QUERIES TO IMPLEMENT for the list of methods that must be implemented.
16 =cut
18 package Bio::DB::GFF::Adaptor::dbi;
20 # base class for dbi-based implementations
21 use strict;
23 use DBI;
24 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
25 use Bio::DB::GFF::Util::Binning;
26 use Bio::DB::GFF::Adaptor::dbi::iterator;
27 use Bio::DB::GFF::Adaptor::dbi::caching_handle;
29 use base qw(Bio::DB::GFF);
31 # constants for choosing
33 use constant MAX_SEGMENT => 1_000_000_000; # the largest a segment can get
35 # this is the largest that any reference sequence can be (100 megabases)
36 use constant MAX_BIN => 1_000_000_000;
38 # this is the smallest bin (1 K)
39 use constant MIN_BIN => 1000;
41 # size of range over which it is faster to force the database to use the range for indexing
42 use constant STRAIGHT_JOIN_LIMIT => 200_000;
44 # this is the size to which DNA should be shredded
45 use constant DNA_CHUNK_SIZE => 2000;
47 # size of summary bins for interval coverage statistics
48 use constant SUMMARY_BIN_SIZE => 1000;
50 # for debugging fbin optimization
51 use constant EPSILON => 1e-7; # set to zero if you trust mysql's floating point comparisons
52 use constant OPTIMIZE => 1; # set to zero to turn off optimization completely
54 ##############################################################################
57 =head2 new
59 Title : new
60 Usage : $db = Bio::DB::GFF->new(@args)
61 Function: create a new adaptor
62 Returns : a Bio::DB::GFF object
63 Args : see below
64 Status : Public
66 This is the constructor for the adaptor. It is called automatically
67 by Bio::DB::GFF-E<gt>new. In addition to arguments that are common among
68 all adaptors, the following class-specific arguments are recgonized:
70 Argument Description
71 -------- -----------
73 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040'
75 -user username for authentication
77 -pass the password for authentication
79 =cut
81 # Create a new Bio::DB::GFF::Adaptor::dbi object
82 sub new {
83 my $class = shift;
84 my ($features_db,$username,$auth,$other) = rearrange([
85 [qw(FEATUREDB DB DSN)],
86 [qw(USERNAME USER)],
87 [qw(PASSWORD PASSWD PASS)],
88 ],@_);
90 $features_db || $class->throw("new(): Provide a data source or DBI database");
92 if (!ref($features_db)) {
93 my $dsn = $features_db;
94 my @args;
95 push @args,$username if defined $username;
96 push @args,$auth if defined $auth;
97 $features_db = Bio::DB::GFF::Adaptor::dbi::caching_handle->new($dsn,@args)
98 || $class->throw("new(): Failed to connect to $dsn: "
99 . Bio::DB::GFF::Adaptor::dbi::caching_handle->errstr);
100 } else {
101 $features_db->isa('DBI::db')
102 || $class->throw("new(): $features_db is not a DBI handle");
105 # fill in object
106 return bless {
107 features_db => $features_db
108 },$class;
111 sub debug {
112 my $self = shift;
113 $self->features_db->debug(@_);
114 $self->SUPER::debug(@_);
117 =head2 features_db
119 Title : features_db
120 Usage : $dbh = $db->features_db
121 Function: get database handle
122 Returns : a DBI handle
123 Args : none
124 Status : Public
126 Note: what is returned is not really a DBI::db handle, but a
127 subclass of one. This means that you cannot manipulate the
128 handle's attributes directly. Instead call the attribute
129 method:
131 my $dbh = $db->features_db;
132 $dbh->attribute(AutoCommit=>0);
134 =cut
136 sub features_db { shift->{features_db} }
137 sub dbh { shift->{features_db} }
139 =head2 get_dna
141 Title : get_dna
142 Usage : $string = $db->get_dna($name,$start,$stop,$class)
143 Function: get DNA string
144 Returns : a string
145 Args : name, class, start and stop of desired segment
146 Status : Public
148 This method performs the low-level fetch of a DNA substring given its
149 name, class and the desired range. It is actually a front end to the
150 abstract method make_dna_query(), which it calls after some argument
151 consistency checking.
153 =cut
155 sub get_dna {
156 my $self = shift;
157 my ($ref,$start,$stop,$class) = @_;
159 my ($offset_start,$offset_stop);
161 my $has_start = defined $start;
162 my $has_stop = defined $stop;
164 my $reversed;
165 if ($has_start && $has_stop && $start > $stop) {
166 $reversed++;
167 ($start,$stop) = ($stop,$start);
170 # turn start and stop into 0-based offsets
171 my $cs = $self->dna_chunk_size;
172 $start -= 1; $stop -= 1;
173 $offset_start = int($start/$cs)*$cs;
174 $offset_stop = int($stop/$cs)*$cs;
176 my $sth;
177 # special case, get it all
178 if (!($has_start || $has_stop)) {
179 $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? order by foffset',$ref);
182 elsif (!$has_stop) {
183 $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? and foffset>=? order by foffset',
184 $ref,$offset_start);
187 else { # both start and stop defined
188 $sth = $self->dbh->do_query('select fdna,foffset from fdna where fref=? and foffset>=? and foffset<=? order by foffset',
189 $ref,$offset_start,$offset_stop);
192 my $dna = '';
193 while (my($frag,$offset) = $sth->fetchrow_array) {
194 substr($frag,0,$start-$offset) = '' if $has_start && $start > $offset;
195 $dna .= $frag;
197 substr($dna,$stop-$start+1) = '' if $has_stop && $stop-$start+1 < length($dna);
198 if ($reversed) {
199 $dna = reverse $dna;
200 $dna =~ tr/gatcGATC/ctagCTAG/;
203 $sth->finish;
204 $dna;
208 =head2 get_abscoords
210 Title : get_abscoords
211 Usage : ($refseq,$refclass,$start,$stop,$strand) = $db->get_abscoords($name,$class)
212 Function: get absolute coordinates for landmark
213 Returns : an array ref -- see below
214 Args : name and class of desired landmark
215 Status : Public
217 This method performs the low-level resolution of a landmark into a
218 reference sequence and position.
220 The result is an array ref, each element of which is a five-element
221 list containing reference sequence name, class, start, stop and strand.
223 =cut
225 sub get_abscoords {
226 my $self = shift;
227 my ($name,$class,$refseq) = @_;
229 my $sth = $self->make_abscoord_query($name,$class,$refseq);
231 my @result;
232 while (my @row = $sth->fetchrow_array) {
233 push @result,\@row
235 $sth->finish;
237 if (@result == 0) {
238 #$self->error("$name not found in database");
239 my $sth2 = $self->make_aliasabscoord_query($name,$class);
241 while (my @row2 = $sth2->fetchrow_array) {
242 push @result,\@row2
244 $sth->finish;
246 if (@result == 0){
247 $self->error("$name not found in database");
248 return;
251 return \@result;
255 =head2 get_features
257 Title : get_features
258 Usage : $db->get_features($search,$options,$callback)
259 Function: retrieve features from the database
260 Returns : number of features retrieved
261 Args : see below
262 Status : Public
264 This is the low-level method that is called to retrieve GFF lines from
265 the database. It is responsible for retrieving features that satisfy
266 range and feature type criteria, and passing the GFF fields to a
267 callback subroutine.
269 See the manual page for Bio::DB::GFF for the interpretation of the
270 arguments and how the information retrieved by get_features is passed
271 to the callback for processing.
273 Internally, get_features() is a front end for range_query(). The
274 latter method constructs the query and executes it. get_features()
275 calls fetchrow_array() to recover the fields and passes them to the
276 callback.
278 =cut
280 # Given sequence name, range, and optional filter, retrieve list of
281 # all features. Passes features through callback.
282 sub get_features {
283 my $self = shift;
284 my ($search,$options,$callback) = @_;
285 $callback || $self->throw('must provide a callback argument');
287 my $sth = $self->range_query(@{$search}{qw(rangetype
288 refseq
289 refclass
290 start
291 stop
292 types) },
293 @{$options}{qw(
294 sparse
295 sort_by_group
296 ATTRIBUTES
297 BINSIZE)}) or return;
299 my $count = 0;
300 while (my @row = $sth->fetchrow_array) {
301 $callback->(@row);
302 $count++;
304 $sth->finish;
305 return $count;
308 =head2 classes
310 Title : classes
311 Usage : $db->classes
312 Function: return list of landmark classes in database
313 Returns : a list of classes
314 Args : none
315 Status : public
317 This routine returns the list of reference classes known to the
318 database, or empty if classes are not used by the database. Classes
319 are distinct from types, being essentially qualifiers on the reference
320 namespaces.
322 NOTE: In the current mysql-based schema, this query takes a while to
323 run due to the classes not being normalized.
325 =cut
327 sub classes {
328 my $self = shift;
329 my ($query,@args) = $self->make_classes_query or return;
330 my $sth = $self->dbh->do_query($query,@args);
331 my @classes;
332 while (my ($c) = $sth->fetchrow_array) {
333 push @classes,$c;
335 @classes;
338 =head2 make_classes_query
340 Title : make_classes_query
341 Usage : ($query,@args) = $db->make_classes_query
342 Function: return query fragment for generating list of reference classes
343 Returns : a query and args
344 Args : none
345 Status : public
347 =cut
349 sub make_classes_query {
350 my $self = shift;
351 return;
354 =head2 _feature_by_name
356 Title : _feature_by_name
357 Usage : $db->get_features_by_name($name,$class,$callback)
358 Function: get a list of features by name and class
359 Returns : count of number of features retrieved
360 Args : name of feature, class of feature, and a callback
361 Status : protected
363 This method is used internally. The callback arguments are those used
364 by make_feature(). Internally, it invokes the following abstract procedures:
366 make_features_select_part
367 make_features_from_part
368 make_features_by_name_where_part
369 make_features_by_alias_where_part (for aliases)
370 make_features_join_part
372 =cut
374 sub _feature_by_name {
375 my $self = shift;
376 my ($class,$name,$location,$callback) = @_;
377 $callback || $self->throw('must provide a callback argument');
379 my $select = $self->make_features_select_part;
380 my $from = $self->make_features_from_part(undef,{sparse_groups=>1});
381 my ($where,@args) = $self->make_features_by_name_where_part($class,$name);
382 my $join = $self->make_features_join_part;
383 my $range = $self->make_features_by_range_where_part('overlaps',
384 {refseq=>$location->[0],
385 class =>'',
386 start=>$location->[1],
387 stop =>$location->[2]}) if $location;
388 # group query
389 my $query1 = "SELECT $select FROM $from WHERE $where AND $join";
390 $query1 .= " AND $range" if $range;
392 # alias query
393 $from = $self->make_features_from_part(undef,{attributes=>1});
394 ($where,@args) = $self->make_features_by_alias_where_part($class,$name); # potential bug - @args1==@args2?
396 my $query2 = "SELECT $select FROM $from WHERE $where AND $join";
397 $query2 .= " AND $range" if $range;
399 my $count = 0;
401 for my $query ($query1,$query2) {
402 my $sth = $self->dbh->do_query($query,@args);
403 while (my @row = $sth->fetchrow_array) {
404 $callback->(@row);
405 $count++;
407 $sth->finish;
410 return $count;
413 =head2 _feature_by_id
415 Title : _feature_by_id
416 Usage : $db->_feature_by_id($ids,$type,$callback)
417 Function: get a list of features by ID
418 Returns : count of number of features retrieved
419 Args : arrayref containing list of IDs to fetch and a callback
420 Status : protected
422 This method is used internally. The $type selector is one of
423 "feature" or "group". The callback arguments are those used by
424 make_feature(). Internally, it invokes the following abstract
425 procedures:
427 make_features_select_part
428 make_features_from_part
429 make_features_by_id_where_part
430 make_features_join_part
432 =cut
434 sub _feature_by_id {
435 my $self = shift;
436 my ($ids,$type,$callback) = @_;
437 $callback || $self->throw('must provide a callback argument');
439 my $select = $self->make_features_select_part;
440 my $from = $self->make_features_from_part;
441 my ($where,@args) = $type eq 'feature' ? $self->make_features_by_id_where_part($ids)
442 : $self->make_features_by_gid_where_part($ids);
443 my $join = $self->make_features_join_part;
444 my $query = "SELECT $select FROM $from WHERE $where AND $join";
445 my $sth = $self->dbh->do_query($query,@args);
447 my $count = 0;
448 while (my @row = $sth->fetchrow_array) {
449 $callback->(@row);
450 $count++;
452 $sth->finish;
453 return $count;
456 sub _feature_by_attribute {
457 my $self = shift;
458 my ($attributes,$callback) = @_;
459 $callback || $self->throw('must provide a callback argument');
461 my $select = $self->make_features_select_part;
462 my $from = $self->make_features_from_part(undef,{attributes=>$attributes});
463 my ($where,@args) = $self->make_features_by_range_where_part('',{attributes=>$attributes});
464 my $join = $self->make_features_join_part({attributes=>$attributes});
465 my $query = "SELECT $select FROM $from WHERE $where AND $join";
466 my $sth = $self->dbh->do_query($query,@args);
468 my $count = 0;
469 while (my @row = $sth->fetchrow_array) {
470 $callback->(@row);
471 $count++;
473 $sth->finish;
474 return $count;
477 =head2 get_types
479 Title : get_types
480 Usage : $db->get_types($refseq,$refclass,$start,$stop,$count)
481 Function: get list of types
482 Returns : a list of Bio::DB::GFF::Typename objects
483 Args : see below
484 Status : Public
486 This method is responsible for fetching the list of feature type names
487 from the database. The query may be limited to a particular range, in
488 which case the range is indicated by a landmark sequence name and
489 class and its subrange, if any. These arguments may be undef if it is
490 desired to retrieve all feature types in the database (which may be a
491 slow operation in some implementations).
493 If the $count flag is false, the method returns a simple list of
494 vBio::DB::GFF::Typename objects. If $count is true, the method returns
495 a list of $name=E<gt>$count pairs, where $count indicates the number of
496 times this feature occurs in the range.
498 Internally, this method calls upon the following functions to generate
499 the SQL and its bind variables:
501 ($q1,@args) = make_types_select_part(@args);
502 ($q2,@args) = make_types_from_part(@args);
503 ($q3,@args) = make_types_where_part(@args);
504 ($q4,@args) = make_types_join_part(@args);
505 ($q5,@args) = make_types_group_part(@args);
507 The components are then combined as follows:
509 $query = "SELECT $q1 FROM $q2 WHERE $q3 AND $q4 GROUP BY $q5";
511 If any of the query fragments contain the ? bind variable, then the
512 same number of bind arguments must be provided in @args. The
513 fragment-generating functions are described below.
515 =cut
517 sub get_types {
518 my $self = shift;
519 my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_;
520 my $straight = $self->do_straight_join($srcseq,$start,$stop,[]) ? 'straight_join' : '';
521 my ($select,@args1) = $self->make_types_select_part($srcseq,$start,$stop,$want_count,$typelist);
522 my ($from,@args2) = $self->make_types_from_part($srcseq,$start,$stop,$want_count,$typelist);
523 my ($join,@args3) = $self->make_types_join_part($srcseq,$start,$stop,$want_count,$typelist);
524 my ($where,@args4) = $self->make_types_where_part($srcseq,$start,$stop,$want_count,$typelist);
525 my ($group,@args5) = $self->make_types_group_part($srcseq,$start,$stop,$want_count,$typelist);
527 my $query = "SELECT $straight $select FROM $from WHERE $join AND $where";
528 $query .= " GROUP BY $group" if $group;
529 my @args = (@args1,@args2,@args3,@args4,@args5);
530 my $sth = $self->dbh->do_query($query,@args) or return;
532 my (%result,%obj);
533 while (my ($method,$source,$count) = $sth->fetchrow_array) {
534 my $type = Bio::DB::GFF::Typename->new($method,$source);
535 $result{$type} = $count;
536 $obj{$type} = $type;
538 return $want_count ? %result : values %obj;
541 =head2 range_query
543 Title : range_query
544 Usage : $db->range_query($range_type,$refseq,$refclass,$start,$stop,$types,$order_by_group,$attributes,$binsize)
545 Function: create statement handle for range/overlap queries
546 Returns : a DBI statement handle
547 Args : see below
548 Status : Protected
550 This method constructs the statement handle for this module's central
551 query: given a range and/or a list of feature types, fetch their GFF
552 records.
554 The positional arguments are as follows:
556 Argument Description
558 $isrange A flag indicating that this is a range.
559 query. Otherwise an overlap query is
560 assumed.
562 $refseq The reference sequence name (undef if no range).
564 $refclass The reference sequence class (undef if no range).
566 $start The start of the range (undef if none).
568 $stop The stop of the range (undef if none).
570 $types Array ref containing zero or feature types in the
571 format [method,source].
573 $order_by_group A flag indicating that statement handler should group
574 the features by group id (handy for iterative fetches)
576 $attributes A hash containing select attributes.
578 $binsize A bin size for generating tables of feature density.
580 If successful, this method returns a statement handle. The handle is
581 expected to return the fields described for get_features().
583 Internally, range_query() makes calls to the following methods,
584 each of which is expected to be overridden in subclasses:
586 $select = $self->make_features_select_part;
587 $from = $self->make_features_from_part;
588 $join = $self->make_features_join_part;
589 ($where,@args) = $self->make_features_by_range_where_part($isrange,$srcseq,$class,
590 $start,$stop,$types,$class);
592 The query that is constructed looks like this:
594 SELECT $select FROM $from WHERE $join AND $where
596 The arguments that are returned from make_features_by_range_where_part() are
597 passed to the statement handler's execute() method.
599 range_query() also calls a do_straight_join() method, described
600 below. If this method returns true, then the keyword "straight_join"
601 is inserted right after SELECT.
603 =cut
605 sub range_query {
606 my $self = shift;
607 my($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$bin) = @_;
609 my $dbh = $self->features_db;
611 # NOTE: straight_join is necessary in some database to force the right index to be used.
612 my %a = (refseq=>$refseq,class=>$class,start=>$start,stop=>$stop,types=>$types,attributes=>$attributes,bin_width=>$bin);
613 my $straight = $self->do_straight_join(\%a) ? 'straight_join' : '';
614 my $select = $self->make_features_select_part(\%a);
615 my $from = $self->make_features_from_part($sparse,\%a);
616 my $join = $self->make_features_join_part(\%a);
617 my ($where,@args) = $self->make_features_by_range_where_part($rangetype,\%a);
618 my ($group_by,@more_args) = $self->make_features_group_by_part(\%a);
619 my $order_by = $self->make_features_order_by_part(\%a) if $order_by_group;
621 my $query = "SELECT $straight $select FROM $from WHERE $join";
622 $query .= " AND $where" if $where;
623 if ($group_by) {
624 $query .= " GROUP BY $group_by";
625 push @args,@more_args;
627 $query .= " ORDER BY $order_by" if $order_by;
629 my $sth = $self->dbh->do_query($query,@args);
630 $sth;
633 =head2 make_features_by_range_where_part
635 Title : make_features_by_range_where_part
636 Usage : ($string,@args) =
637 $db->make_features_select_part($isrange,$refseq,$class,$start,$stop,$types)
638 Function: make where part of the features query
639 Returns : the list ($query,@bind_args)
640 Args : see below
641 Status : Protected
643 This method creates the part of the features query that immediately
644 follows the WHERE keyword and is ANDed with the string returned by
645 make_features_join_part().
647 The six positional arguments are a flag indicating whether to perform
648 a range search or an overlap search, the reference sequence, class,
649 start and stop, all of which define an optional range to search in,
650 and an array reference containing a list [$method,$souce] pairs.
652 The method result is a multi-element list containing the query string
653 and the list of runtime arguments to bind to it with the execute()
654 method.
656 This method's job is to clean up arguments and perform consistency
657 checking. The real work is done by the following abstract methods:
659 Method Description
661 refseq_query() Return the query string needed to match the reference
662 sequence.
664 range_query() Return the query string needed to find all features contained
665 within a range.
667 overlap_query() Return the query string needed to find all features that overlap
668 a range.
670 See Bio::DB::Adaptor::dbi::mysql for an example of how this works.
672 =cut
676 sub make_features_by_range_where_part {
677 my $self = shift;
678 my ($rangetype,$options) = @_;
679 $options ||= {};
680 my ($refseq,$class,$start,$stop,$types,$attributes) =
681 @{$options}{qw(refseq class start stop types attributes)};
683 my (@query,@args);
685 if ($refseq) {
686 my ($q,@a) = $self->refseq_query($refseq,$class);
687 push @query,$q;
688 push @args,@a;
691 if (defined $start or defined $stop) {
692 $start = 0 unless defined($start);
693 $stop = MAX_SEGMENT unless defined($stop);
695 my ($range_query,@range_args) =
696 $rangetype eq 'overlaps' ? $self->overlap_query($start,$stop)
697 : $rangetype eq 'contains' ? $self->contains_query($start,$stop)
698 : $rangetype eq 'contained_in' ? $self->contained_in_query($start,$stop)
699 : ();
701 push @query,$range_query;
702 push @args,@range_args;
705 if (defined $types && @$types) {
706 my ($type_query,@type_args) = $self->types_query($types);
707 push @query,$type_query;
708 push @args,@type_args;
711 if ($attributes) {
712 my ($attribute_query,@attribute_args) = $self->make_features_by_attribute_where_part($attributes);
713 push @query,"($attribute_query)";
714 push @args,@attribute_args;
717 my $query = join "\n\tAND ",@query;
718 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
721 =head2 do_straight_join
723 Title : do_straight_join
724 Usage : $boolean = $db->do_straight_join($refseq,$class,$start,$stop,$types)
725 Function: optimization flag
726 Returns : a flag
727 Args : see range_query()
728 Status : Protected
730 This subroutine, called by range_query() returns a boolean flag.
731 If true, range_query() will perform a straight join, which can be
732 used to optimize certain SQL queries. The four arguments correspond
733 to similarly-named arguments passed to range_query().
735 =cut
737 sub do_straight_join { 0 } # false by default
739 =head2 string_match
741 Title : string_match
742 Usage : $string = $db->string_match($field,$value)
743 Function: create a SQL fragment for performing exact or regexp string matching
744 Returns : query string
745 Args : the table field and match value
746 Status : public
748 This method examines the passed value for meta characters. If so it
749 produces a SQL fragment that performs a regular expression match.
750 Otherwise, it produces a fragment that performs an exact string match.
752 This method is not used in the module, but is available for use by
753 subclasses.
755 =cut
757 sub string_match {
758 my $self = shift;
759 my ($field,$value) = @_;
760 return qq($field = ?) if $value =~ /^[!@%&a-zA-Z0-9_\'\" ~-]+$/;
761 return qq($field REGEXP ?);
764 =head2 exact_match
766 Title : exact_match
767 Usage : $string = $db->exact_match($field,$value)
768 Function: create a SQL fragment for performing exact string matching
769 Returns : query string
770 Args : the table field and match value
771 Status : public
773 This method produces the SQL fragment for matching a field name to a
774 constant string value.
776 =cut
778 sub exact_match {
779 my $self = shift;
780 my ($field,$value) = @_;
781 return qq($field = ?);
784 =head2 search_notes
786 Title : search_notes
787 Usage : @search_results = $db->search_notes("full text search string",$limit)
788 Function: Search the notes for a text string, using mysql full-text search
789 Returns : array of results
790 Args : full text search string, and an optional row limit
791 Status : public
793 This is a mysql-specific method. Given a search string, it performs a
794 full-text search of the notes table and returns an array of results.
795 Each row of the returned array is a arrayref containing the following fields:
797 column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment()
798 column 2 The text of the note
799 column 3 A relevance score.
800 column 4 A Bio::DB::GFF::Typename object
802 =cut
804 sub search_notes {
805 my $self = shift;
806 my ($search_string,$limit) = @_;
808 $search_string =~ tr/*?//d;
810 my @words = $search_string =~ /(\w+)/g;
811 my $regex = join '|',@words;
812 my @searches = map {"fattribute_value LIKE '%${_}%'"} @words;
813 my $search = join(' OR ',@searches);
815 my $query = <<END;
816 SELECT distinct gclass,gname,fattribute_value,fmethod,fsource
817 FROM fgroup,fattribute_to_feature,fdata,ftype
818 WHERE fgroup.gid=fdata.gid
819 AND fdata.fid=fattribute_to_feature.fid
820 AND fdata.ftypeid=ftype.ftypeid
821 AND ($search)
825 my $sth = $self->dbh->do_query($query);
826 my @results;
827 while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) {
828 next unless $class && $name; # sorry, ignore NULL objects
829 my @matches = $note =~ /($regex)/g;
830 my $relevance = 10*@matches;
831 my $featname = Bio::DB::GFF::Featname->new($class=>$name);
832 my $type = Bio::DB::GFF::Typename->new($method,$source);
833 push @results,[$featname,$note,$relevance,$type];
834 last if $limit && @results >= $limit;
836 @results;
840 =head2 meta
842 Title : meta
843 Usage : $value = $db->meta($name [,$newval])
844 Function: get or set a meta variable
845 Returns : a string
846 Args : meta variable name and optionally value
847 Status : public
849 Get or set a named metavariable for the database. Metavariables can
850 be used for database-specific settings. This method calls two
851 class-specific methods which must be implemented:
853 make_meta_get_query() Returns a sql fragment which given a meta
854 parameter name, returns its value. One bind
855 variable.
856 make_meta_set_query() Returns a sql fragment which takes two bind
857 arguments, the parameter name and its value
860 Don't make changes unless you know what you're doing! It will affect the
861 persistent database.
863 =cut
865 sub meta {
866 my $self = shift;
867 my $param_name = uc shift;
869 # getting
870 if (@_) {
871 my $value = shift;
872 my $sql = $self->make_meta_set_query() or return;
873 my $sth = $self->dbh->prepare_delayed($sql)
874 or $self->error("Can't prepare $sql: ",$self->dbh->errstr), return;
875 $sth->execute($param_name,$value)
876 or $self->error("Can't execute $sql: ",$self->dbh->errstr), return;
877 $sth->finish;
878 return $self->{meta}{$param_name} = $value;
881 elsif (exists $self->{meta}{$param_name}) {
882 return $self->{meta}{$param_name};
885 else {
886 undef $self->{meta}{$param_name}; # so that we don't check again
887 my $sql = $self->make_meta_get_query() or return;
888 my $sth = $self->dbh->prepare_delayed($sql)
889 or $self->error("Can't prepare $sql: ",$self->dbh->errstr), return;
890 $sth->execute($param_name)
891 or $self->error("Can't execute $sql: ",$sth->errstr),return;
892 my ($value) = $sth->fetchrow_array;
893 $sth->finish;
894 return $self->{meta}{$param_name} = $value;
899 =head2 make_meta_get_query
901 Title : make_meta_get_query
902 Usage : $sql = $db->make_meta_get_query
903 Function: return SQL fragment for getting a meta parameter
904 Returns : SQL fragment
905 Args : none
906 Status : public
908 By default this does nothing; meta parameters are not stored or
909 retrieved.
911 =cut
913 sub make_meta_get_query {
914 return 'SELECT fvalue FROM fmeta WHERE fname=?';
918 sub dna_chunk_size {
919 my $self = shift;
920 $self->meta('chunk_size') || DNA_CHUNK_SIZE;
923 =head2 make_meta_set_query
925 Title : make_meta_set_query
926 Usage : $sql = $db->make_meta_set_query
927 Function: return SQL fragment for setting a meta parameter
928 Returns : SQL fragment
929 Args : none
930 Status : public
932 By default this does nothing; meta parameters are not stored or
933 retrieved.
935 =cut
937 sub make_meta_set_query {
938 return;
941 =head2 default_meta_values
943 Title : default_meta_values
944 Usage : %values = $db->default_meta_values
945 Function: empty the database
946 Returns : a list of tag=>value pairs
947 Args : none
948 Status : protected
950 This method returns a list of tag=E<gt>value pairs that contain default
951 meta information about the database. It is invoked by initialize() to
952 write out the default meta values. The base class version returns an
953 empty list.
955 For things to work properly, meta value names must be UPPERCASE.
957 =cut
959 sub default_meta_values {
960 my $self = shift;
961 my @values = $self->SUPER::default_meta_values;
962 return (
963 @values,
964 max_bin => MAX_BIN,
965 min_bin => MIN_BIN,
966 straight_join_limit => STRAIGHT_JOIN_LIMIT,
967 chunk_size => DNA_CHUNK_SIZE,
971 sub min_bin {
972 my $self = shift;
973 return $self->meta('min_bin') || MIN_BIN;
975 sub max_bin {
976 my $self = shift;
977 return $self->meta('max_bin') || MAX_BIN;
980 sub straight_join_limit {
981 my $self = shift;
982 return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT;
985 =head2 get_features_iterator
987 Title : get_features_iterator
988 Usage : $iterator = $db->get_features_iterator($search,$options,$callback)
989 Function: create an iterator on a features() query
990 Returns : A Bio::DB::GFF::Adaptor::dbi::iterator object
991 Args : see get_features()
992 Status : public
994 This method is similar to get_features(), except that it returns an
995 iterator across the query. See
996 L<Bio::DB::GFF::Adaptor::dbi::iterator>.
998 =cut
1000 sub get_features_iterator {
1001 my $self = shift;
1002 my ($search,$options,$callback) = @_;
1003 $callback || $self->throw('must provide a callback argument');
1004 my $sth = $self->range_query(@{$search}{qw(rangetype
1005 refseq
1006 refclass
1007 start
1008 stop
1009 types)},
1010 @{$options}{qw(
1011 sparse
1012 sort_by_group
1013 ATTRIBUTES
1014 BINSIZE)}) or return;
1015 return Bio::DB::GFF::Adaptor::dbi::iterator->new($sth,$callback);
1018 ########################## loading and initialization #####################
1020 =head2 do_initialize
1022 Title : do_initialize
1023 Usage : $success = $db->do_initialize($drop_all)
1024 Function: initialize the database
1025 Returns : a boolean indicating the success of the operation
1026 Args : a boolean indicating whether to delete existing data
1027 Status : protected
1029 This method will load the schema into the database. If $drop_all is
1030 true, then any existing data in the tables known to the schema will be
1031 deleted.
1033 Internally, this method calls schema() to get the schema data.
1035 =cut
1037 # Create the schema from scratch.
1038 # You will need create privileges for this.
1039 sub do_initialize {
1040 #shift->throw("do_initialize(): must be implemented by subclass");
1041 my $self = shift;
1042 my $erase = shift;
1043 $self->drop_all if $erase;
1045 my $dbh = $self->features_db;
1046 my $schema = $self->schema;
1047 foreach my $table_name ($self->tables) {
1048 my $create_table_stmt = $schema->{$table_name}{table} ;
1049 $dbh->do($create_table_stmt) || warn $dbh->errstr;
1050 $self->create_other_schema_objects(\%{$schema->{$table_name}});
1056 =head2 finish_load
1058 Title : finish_load
1059 Usage : $db->finish_load
1060 Function: called after load_gff_line()
1061 Returns : number of records loaded
1062 Args : none
1063 Status : protected
1065 This method performs schema-specific cleanup after loading a set of
1066 GFF records. It finishes each of the statement handlers prepared by
1067 setup_load().
1069 =cut
1071 sub finish_load {
1072 my $self = shift;
1074 my $dbh = $self->features_db or return;
1075 $dbh->do('UNLOCK TABLES') if $self->lock_on_load;
1077 foreach (keys %{$self->{load_stuff}{sth}}) {
1078 $self->{load_stuff}{sth}{$_}->finish;
1081 my $counter = $self->{load_stuff}{counter};
1082 delete $self->{load_stuff};
1083 return $counter;
1087 =head2 create_other_schema_objects
1089 Title : create_other_schema_objects
1090 Usage : $self->create_other_schema_objects($table_name)
1091 Function: create other schema objects like : indexes, sequences, triggers
1092 Returns :
1093 Args :
1094 Status : Abstract
1096 =cut
1098 sub create_other_schema_objects{
1099 #shift->throw("create_other_schema_objects(): must be implemented by subclass");
1100 my $self = shift ;
1101 my $table_schema = shift ;
1102 my $dbh = $self->features_db;
1103 foreach my $object_type(keys %$table_schema){
1104 if ($object_type !~ /table/) {
1105 foreach my $object_name(keys %{$table_schema->{$object_type}}){
1106 my $create_object_stmt = $table_schema->{$object_type}{$object_name};
1107 $dbh->do($create_object_stmt) || warn $dbh->errstr;
1114 =head2 drop_all
1116 Title : drop_all
1117 Usage : $db->drop_all
1118 Function: empty the database
1119 Returns : void
1120 Args : none
1121 Status : protected
1123 This method drops the tables known to this module. Internally it
1124 calls the abstract tables() method.
1126 =cut
1128 # Drop all the GFF tables -- dangerous!
1129 sub drop_all {
1130 #shift->throw("drop_all(): must be implemented by subclass");
1131 my $self = shift;
1132 my $dbh = $self->features_db;
1133 my $schema = $self->schema;
1135 local $dbh->{PrintError} = 0;
1136 foreach ($self->tables) {
1137 $dbh->do("drop table $_") || warn $dbh->errstr;
1139 #when dropping a table - the indexes and triggers are being dropped automatically
1140 # sequences needs to be dropped - if there are any (Oracle, PostgreSQL)
1141 if ($schema->{$_}{sequence}){
1142 foreach my $sequence_name(keys %{$schema->{$_}{sequence}}) {
1143 $dbh->do("drop sequence $sequence_name");
1147 #$self->drop_other_schema_objects($_);
1152 =head2 clone
1154 The clone() method should be used when you want to pass the
1155 Bio::DB::GFF object to a child process across a fork(). The child must
1156 call clone() before making any queries.
1158 This method does two things: (1) it sets the underlying database
1159 handle's InactiveDestroy parameter to 1, thereby preventing the
1160 database connection from being destroyed in the parent when the dbh's
1161 destructor is called; (2) it replaces the dbh with the result of
1162 dbh-E<gt>clone(), so that we now have an independent handle.
1164 =cut
1166 sub clone {
1167 my $self = shift;
1168 $self->features_db->clone;
1172 =head1 QUERIES TO IMPLEMENT
1174 The following astract methods either return DBI statement handles or
1175 fragments of SQL. They must be implemented by subclasses of this
1176 module. See Bio::DB::GFF::Adaptor::dbi::mysql for examples.
1181 =head2 drop_other_schema_objects
1183 Title : drop_other_schema_objects
1184 Usage : $self->create_other_schema_objects($table_name)
1185 Function: create other schema objects like : indexes, sequences, triggers
1186 Returns :
1187 Args :
1188 Status : Abstract
1191 =cut
1193 sub drop_other_schema_objects{
1194 #shift->throw("drop_other_schema_objects(): must be implemented by subclass");
1198 =head2 make_features_select_part
1200 Title : make_features_select_part
1201 Usage : $string = $db->make_features_select_part()
1202 Function: make select part of the features query
1203 Returns : a string
1204 Args : none
1205 Status : Abstract
1207 This abstract method creates the part of the features query that
1208 immediately follows the SELECT keyword.
1210 =cut
1212 sub make_features_select_part {
1213 shift->throw("make_features_select_part(): must be implemented by subclass");
1216 =head2 tables
1218 Title : tables
1219 Usage : @tables = $db->tables
1220 Function: return list of tables that belong to this module
1221 Returns : list of tables
1222 Args : none
1223 Status : protected
1225 This method lists the tables known to the module.
1227 =cut
1229 # return list of tables that "belong" to us.
1230 sub tables {
1231 my $schema = shift->schema;
1232 return keys %$schema;
1235 =head2 schema
1237 Title : schema
1238 Usage : $schema = $db->schema
1239 Function: return the CREATE script for the schema
1240 Returns : a hashref
1241 Args : none
1242 Status : abstract
1244 This method returns an array ref containing the various CREATE
1245 statements needed to initialize the database tables. The keys are the
1246 table names, and the values are strings containing the appropriate
1247 CREATE statement.
1249 =cut
1251 sub schema {
1252 shift->throw("The schema() method must be implemented by subclass");
1255 =head2 DESTROY
1257 Title : DESTROY
1258 Usage : $db->DESTROY
1259 Function: disconnect database at destruct time
1260 Returns : void
1261 Args : none
1262 Status : protected
1264 This is the destructor for the class.
1266 =cut
1268 sub DESTROY {
1269 my $self = shift;
1270 $self->features_db->disconnect if defined $self->features_db;
1273 ################## query cache ##################
1276 #########################################
1277 ## Moved from mysql.pm and mysqlopt.pm ##
1278 #########################################
1280 =head2 make_features_by_name_where_part
1282 Title : make_features_by_name_where_part
1283 Usage : $db->make_features_by_name_where_part
1284 Function: create the SQL fragment needed to select a feature by its group name & class
1285 Returns : a SQL fragment and bind arguments
1286 Args : see below
1287 Status : Protected
1289 =cut
1291 sub make_features_by_name_where_part {
1292 my $self = shift;
1293 my ($class,$name) = @_;
1294 if ($name =~ /\*/) {
1295 $name =~ s/%/\\%/g;
1296 $name =~ s/_/\\_/g;
1297 $name =~ tr/*/%/;
1298 return ("fgroup.gclass=? AND fgroup.gname LIKE ?",$class,$name);
1299 } else {
1300 return ("fgroup.gclass=? AND fgroup.gname=?",$class,$name);
1304 sub make_features_by_alias_where_part {
1305 my $self = shift;
1306 my ($class,$name) = @_;
1307 if ($name =~ /\*/) {
1308 $name =~ tr/*/%/;
1309 $name =~ s/_/\\_/g;
1310 return ("fgroup.gclass=? AND fattribute_to_feature.fattribute_value LIKE ? AND fgroup.gid=fdata.gid AND fattribute.fattribute_name in ('Alias','Name') AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id AND fattribute_to_feature.fid=fdata.fid AND ftype.ftypeid=fdata.ftypeid",$class,$name)
1311 } else {
1312 return ("fgroup.gclass=? AND fattribute_to_feature.fattribute_value=? AND fgroup.gid=fdata.gid AND fattribute.fattribute_name in ('Alias','Name') AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id AND fattribute_to_feature.fid=fdata.fid AND ftype.ftypeid=fdata.ftypeid",$class,$name);
1317 sub make_features_by_attribute_where_part {
1318 my $self = shift;
1319 my $attributes = shift;
1320 my @args;
1321 my @sql;
1322 foreach (keys %$attributes) {
1323 push @sql,"(fattribute.fattribute_name=? AND fattribute_to_feature.fattribute_value=?)";
1324 push @args,($_,$attributes->{$_});
1326 return (join(' OR ',@sql),@args);
1329 =head2 make_features_by_id_where_part
1331 Title : make_features_by_id_where_part
1332 Usage : $db->make_features_by_id_where_part($ids)
1333 Function: create the SQL fragment needed to select a set of features by their ids
1334 Returns : a SQL fragment and bind arguments
1335 Args : arrayref of IDs
1336 Status : Protected
1338 =cut
1340 sub make_features_by_id_where_part {
1341 my $self = shift;
1342 my $ids = shift;
1343 my $set = join ",",@$ids;
1344 return ("fdata.fid IN ($set)");
1347 =head2 make_features_by_gid_where_part
1349 Title : make_features_by_id_where_part
1350 Usage : $db->make_features_by_gid_where_part($ids)
1351 Function: create the SQL fragment needed to select a set of features by their ids
1352 Returns : a SQL fragment and bind arguments
1353 Args : arrayref of IDs
1354 Status : Protected
1356 =cut
1358 sub make_features_by_gid_where_part {
1359 my $self = shift;
1360 my $ids = shift;
1361 my $set = join ",",@$ids;
1362 return ("fgroup.gid IN ($set)");
1366 =head2 make_features_from_part
1368 Title : make_features_from_part
1369 Usage : $string = $db->make_features_from_part()
1370 Function: make from part of the features query
1371 Returns : a string
1372 Args : none
1373 Status : protected
1375 This method creates the part of the features query that immediately
1376 follows the FROM keyword.
1378 =cut
1380 sub make_features_from_part {
1381 my $self = shift;
1382 my $sparse = shift;
1383 my $options = shift || {};
1384 return $options->{attributes} ? "fdata,ftype,fgroup,fattribute,fattribute_to_feature\n"
1385 : "fdata,ftype,fgroup\n";
1389 =head2 make_features_join_part
1391 Title : make_features_join_part
1392 Usage : $string = $db->make_features_join_part()
1393 Function: make join part of the features query
1394 Returns : a string
1395 Args : none
1396 Status : protected
1398 This method creates the part of the features query that immediately
1399 follows the WHERE keyword.
1401 =cut
1403 sub make_features_join_part {
1404 my $self = shift;
1405 my $options = shift || {};
1406 return !$options->{attributes} ? <<END1 : <<END2;
1407 fgroup.gid = fdata.gid
1408 AND ftype.ftypeid = fdata.ftypeid
1409 END1
1410 fgroup.gid = fdata.gid
1411 AND ftype.ftypeid = fdata.ftypeid
1412 AND fattribute.fattribute_id=fattribute_to_feature.fattribute_id
1413 AND fdata.fid=fattribute_to_feature.fid
1414 END2
1417 =head2 make_features_order_by_part
1419 Title : make_features_order_by_part
1420 Usage : ($query,@args) = $db->make_features_order_by_part()
1421 Function: make the ORDER BY part of the features() query
1422 Returns : a SQL fragment and bind arguments, if any
1423 Args : none
1424 Status : protected
1426 This method creates the part of the features query that immediately
1427 follows the ORDER BY part of the query issued by features() and
1428 related methods.
1430 =cut
1432 sub make_features_order_by_part {
1433 my $self = shift;
1434 my $options = shift || {};
1435 return "fgroup.gname";
1438 =head2 make_features_group_by_part
1440 Title : make_features_group_by_part
1441 Usage : ($query,@args) = $db->make_features_group_by_part()
1442 Function: make the GROUP BY part of the features() query
1443 Returns : a SQL fragment and bind arguments, if any
1444 Args : none
1445 Status : protected
1447 This method creates the part of the features query that immediately
1448 follows the GROUP BY part of the query issued by features() and
1449 related methods.
1451 =cut
1453 sub make_features_group_by_part {
1454 my $self = shift;
1455 my $options = shift || {};
1456 if (my $att = $options->{attributes}) {
1457 my $key_count = keys %$att;
1458 return unless $key_count > 1;
1459 return ("fdata.fid,fref,fstart,fstop,fsource,
1460 fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start,
1461 ftarget_stop,fdata.gid
1462 HAVING count(fdata.fid) > ?",$key_count-1);
1464 elsif (my $b = $options->{bin_width}) {
1465 return "fref,fstart,fdata.ftypeid";
1470 =head2 refseq_query
1472 Title : refseq_query
1473 Usage : ($query,@args) = $db->refseq_query($name,$class)
1474 Function: create SQL fragment that selects the desired reference sequence
1475 Returns : a list containing the query and bind arguments
1476 Args : reference sequence name and class
1477 Status : protected
1479 This method is called by make_features_by_range_where_part() to
1480 construct the part of the select WHERE section that selects a
1481 particular reference sequence. It returns a mult-element list in
1482 which the first element is the SQL fragment and subsequent elements
1483 are bind values.
1485 For example:
1487 sub refseq_query {
1488 my ($name,$class) = @_;
1489 return ('gff.refseq=? AND gff.refclass=?',
1490 $name,$class);
1493 The current schema does not distinguish among different classes of
1494 reference sequence.
1496 =cut
1498 # IMPORTANT NOTE: THE MYSQL SCHEMA IGNORES THE SEQUENCE CLASS
1499 # THIS SHOULD BE FIXED
1500 sub refseq_query {
1501 my $self = shift;
1502 my ($refseq,$refclass) = @_;
1503 my $query = "fdata.fref=?";
1504 return wantarray ? ($query,$refseq) : $self->dbh->dbi_quote($query,$refseq);
1507 =head2 attributes
1509 Title : attributes
1510 Usage : @attributes = $db->attributes($id,$name)
1511 Function: get the attributes on a particular feature
1512 Returns : an array of string
1513 Args : feature ID
1514 Status : public
1516 Some GFF version 2 files use the groups column to store a series of
1517 attribute/value pairs. In this interpretation of GFF, the first such
1518 pair is treated as the primary group for the feature; subsequent pairs
1519 are treated as attributes. Two attributes have special meaning:
1520 "Note" is for backward compatibility and is used for unstructured text
1521 remarks. "Alias" is considered as a synonym for the feature name.
1523 If no name is provided, then attributes() returns a flattened hash, of
1524 attribute=E<gt>value pairs. This lets you do:
1526 %attributes = $db->attributes($id);
1528 Normally, attributes() will be called by the feature:
1530 @notes = $feature->attributes('Note');
1532 =cut
1534 sub do_attributes {
1535 my $self = shift;
1536 my ($id,$tag) = @_;
1537 my $sth;
1538 if ($id) {
1539 my $from = 'fattribute_to_feature,fattribute';
1540 my $join = 'fattribute.fattribute_id=fattribute_to_feature.fattribute_id';
1541 my $where1 = 'fid=? AND fattribute_name=?';
1542 my $where2 = 'fid=?';
1543 $sth = defined($tag) ? $self->dbh->do_query("SELECT fattribute_value FROM $from WHERE $where1 AND $join",$id,$tag)
1544 : $self->dbh->do_query("SELECT fattribute_name,fattribute_value FROM $from WHERE $where2 AND $join",$id);
1546 else {
1547 $sth = $self->dbh->do_query("SELECT fattribute_name FROM fattribute");
1549 my @result;
1550 while (my @stuff = $sth->fetchrow_array) {
1551 push @result,@stuff;
1553 $sth->finish;
1554 return @result;
1559 =head2 overlap_query_nobin
1561 Title : overlap_query
1562 Usage : ($query,@args) = $db->overlap_query($start,$stop)
1563 Function: create SQL fragment that selects the desired features by range
1564 Returns : a list containing the query and bind arguments
1565 Args : the start and stop of a range, inclusive
1566 Status : protected
1568 This method is called by make_features_byrange_where_part() to construct the
1569 part of the select WHERE section that selects a set of features that
1570 overlap a range. It returns a multi-element list in which the first
1571 element is the SQL fragment and subsequent elements are bind values.
1574 sub overlap_query_nobin {
1575 my ($start,$stop) = @_;
1576 return ('gff.stopE<gt>=? AND gff.startE<lt>=?',
1577 $start,$stop);
1579 =cut
1581 # find features that overlap a given range
1582 sub overlap_query_nobin {
1583 my $self = shift;
1584 my ($start,$stop) = @_;
1586 my $query = qq(fdata.fstop>=? AND fdata.fstart<=?);
1587 return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop);
1590 =head2 contains_query_nobin
1592 Title : contains_query
1593 Usage : ($query,@args) = $db->contains_query_nobin($start,$stop)
1594 Function: create SQL fragment that selects the desired features by range
1595 Returns : a list containing the query and bind arguments
1596 Args : the start and stop of a range, inclusive
1597 Status : protected
1599 This method is called by make_features_byrange_where_part() to construct the
1600 part of the select WHERE section that selects a set of features
1601 entirely enclosed by a range. It returns a multi-element list in which
1602 the first element is the SQL fragment and subsequent elements are bind
1603 values. For example:
1605 sub contains_query_nobin {
1606 my ($start,$stop) = @_;
1607 return ('gff.start>=? AND gff.stop<=?',
1608 $start,$stop);
1610 =cut
1612 # find features that are completely contained within a range
1613 sub contains_query_nobin {
1614 my $self = shift;
1615 my ($start,$stop) = @_;
1616 my $query = qq(fdata.fstart>=? AND fdata.fstop<=?);
1617 return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop);
1620 =head2 contained_in_query_nobin
1622 Title : contained_in_query_nobin
1623 Usage : ($query,@args) = $db->contained_in_query($start,$stop)
1624 Function: create SQL fragment that selects the desired features by range
1625 Returns : a list containing the query and bind arguments
1626 Args : the start and stop of a range, inclusive
1627 Status : protected
1629 This method is called by make_features_byrange_where_part() to construct the
1630 part of the select WHERE section that selects a set of features
1631 entirely enclosed by a range. It returns a multi-element list in which
1632 the first element is the SQL fragment and subsequent elements are bind
1633 values.For example:
1635 sub contained_in_query_nobin {
1636 my ($start,$stop) = @_;
1637 return ('gff.start<=? AND gff.stop>=?',
1638 $start,$stop);
1641 =cut
1643 # find features that are completely contained within a range
1644 sub contained_in_query_nobin {
1645 my $self = shift;
1646 my ($start,$stop) = @_;
1647 my $query = qq(fdata.fstart<=? AND fdata.fstop>=?);
1648 return wantarray ? ($query,$start,$stop) : $self->dbh->dbi_quote($query,$start,$stop);
1651 =head2 types_query
1653 Title : types_query
1654 Usage : ($query,@args) = $db->types_query($types)
1655 Function: create SQL fragment that selects the desired features by type
1656 Returns : a list containing the query and bind arguments
1657 Args : an array reference containing the types
1658 Status : protected
1660 This method is called by make_features_byrange_where_part() to construct the
1661 part of the select WHERE section that selects a set of features based
1662 on their type. It returns a multi-element list in which the first
1663 element is the SQL fragment and subsequent elements are bind values.
1664 The argument is an array reference containing zero or more
1665 [$method,$source] pairs.
1667 =cut
1669 # generate the fragment of SQL responsible for searching for
1670 # features with particular types and methods
1671 sub types_query {
1672 my $self = shift;
1673 my $types = shift;
1675 my @method_queries;
1676 my @args;
1677 for my $type (@$types) {
1678 my ($method,$source) = @$type;
1679 my ($mlike, $slike) = (0, 0);
1680 if ($method && $method =~ m/\.\*/) {
1681 $method =~ s/%/\\%/g;
1682 $method =~ s/_/\\_/g;
1683 $method =~ s/\.\*\??/%/g;
1684 $mlike++;
1686 if ($source && $source =~ m/\.\*/) {
1687 $source =~ s/%/\\%/g;
1688 $source =~ s/_/\\_/g;
1689 $source =~ s/\.\*\??/%/g;
1690 $slike++;
1692 my @pair;
1693 if (defined $method && length $method) {
1694 push @pair, $mlike ? qq(fmethod LIKE ?) : qq(fmethod = ?);
1695 push @args, $method;
1697 if (defined $source && length $source) {
1698 push @pair, $slike ? qq(fsource LIKE ?) : qq(fsource = ?);
1699 push @args, $source;
1701 push @method_queries,"(" . join(' AND ',@pair) .")" if @pair;
1703 my $query = " (".join(' OR ',@method_queries).")\n" if @method_queries;
1704 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
1707 =head2 make_types_select_part
1709 Title : make_types_select_part
1710 Usage : ($string,@args) = $db->make_types_select_part(@args)
1711 Function: create the select portion of the SQL for fetching features type list
1712 Returns : query string and bind arguments
1713 Args : see below
1714 Status : protected
1716 This method is called by get_types() to generate the query fragment
1717 and bind arguments for the SELECT part of the query that retrieves
1718 lists of feature types. The four positional arguments are as follows:
1720 $refseq reference sequence name
1721 $start start of region
1722 $stop end of region
1723 $want_count true to return the count of this feature type
1725 If $want_count is false, the SQL fragment returned must produce a list
1726 of feature types in the format (method, source).
1728 If $want_count is true, the returned fragment must produce a list of
1729 feature types in the format (method, source, count).
1731 =cut
1733 #------------------------- support for the types() query ------------------------
1734 sub make_types_select_part {
1735 my $self = shift;
1736 my ($srcseq,$start,$stop,$want_count) = @_;
1737 my $query = $want_count ? 'ftype.fmethod,ftype.fsource,count(fdata.ftypeid)'
1738 : 'fmethod,fsource';
1739 return $query;
1742 =head2 make_types_from_part
1744 Title : make_types_from_part
1745 Usage : ($string,@args) = $db->make_types_from_part(@args)
1746 Function: create the FROM portion of the SQL for fetching features type lists
1747 Returns : query string and bind arguments
1748 Args : see below
1749 Status : protected
1751 This method is called by get_types() to generate the query fragment
1752 and bind arguments for the FROM part of the query that retrieves lists
1753 of feature types. The four positional arguments are as follows:
1755 $refseq reference sequence name
1756 $start start of region
1757 $stop end of region
1758 $want_count true to return the count of this feature type
1760 If $want_count is false, the SQL fragment returned must produce a list
1761 of feature types in the format (method, source).
1763 If $want_count is true, the returned fragment must produce a list of
1764 feature types in the format (method, source, count).
1766 =cut
1768 sub make_types_from_part {
1769 my $self = shift;
1770 my ($srcseq,$start,$stop,$want_count) = @_;
1771 my $query = defined($srcseq) || $want_count ? 'fdata,ftype' : 'ftype';
1772 return $query;
1775 =head2 make_types_join_part
1777 Title : make_types_join_part
1778 Usage : ($string,@args) = $db->make_types_join_part(@args)
1779 Function: create the JOIN portion of the SQL for fetching features type lists
1780 Returns : query string and bind arguments
1781 Args : see below
1782 Status : protected
1784 This method is called by get_types() to generate the query fragment
1785 and bind arguments for the JOIN part of the query that retrieves lists
1786 of feature types. The four positional arguments are as follows:
1788 $refseq reference sequence name
1789 $start start of region
1790 $stop end of region
1791 $want_count true to return the count of this feature type
1793 =cut
1795 sub make_types_join_part {
1796 my $self = shift;
1797 my ($srcseq,$start,$stop,$want_count) = @_;
1798 my $query = defined($srcseq) || $want_count ? 'fdata.ftypeid=ftype.ftypeid'
1799 : '';
1800 return $query || '1=1';
1803 =head2 make_types_where_part
1805 Title : make_types_where_part
1806 Usage : ($string,@args) = $db->make_types_where_part(@args)
1807 Function: create the WHERE portion of the SQL for fetching features type lists
1808 Returns : query string and bind arguments
1809 Args : see below
1810 Status : protected
1812 This method is called by get_types() to generate the query fragment
1813 and bind arguments for the WHERE part of the query that retrieves
1814 lists of feature types. The four positional arguments are as follows:
1816 $refseq reference sequence name
1817 $start start of region
1818 $stop end of region
1819 $want_count true to return the count of this feature type
1821 =cut
1823 sub make_types_where_part {
1824 my $self = shift;
1825 my ($srcseq,$start,$stop,$want_count,$typelist) = @_;
1826 my (@query,@args);
1827 if (defined($srcseq)) {
1828 push @query,'fdata.fref=?';
1829 push @args,$srcseq;
1830 if (defined $start or defined $stop) {
1831 $start = 1 unless defined $start;
1832 $stop = MAX_SEGMENT unless defined $stop;
1833 my ($q,@a) = $self->overlap_query($start,$stop);
1834 push @query,"($q)";
1835 push @args,@a;
1838 if (defined $typelist && @$typelist) {
1839 my ($q,@a) = $self->types_query($typelist);
1840 push @query,($q);
1841 push @args,@a;
1843 my $query = @query ? join(' AND ',@query) : '1=1';
1844 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
1847 =head2 make_types_group_part
1849 Title : make_types_group_part
1850 Usage : ($string,@args) = $db->make_types_group_part(@args)
1851 Function: create the GROUP BY portion of the SQL for fetching features type lists
1852 Returns : query string and bind arguments
1853 Args : see below
1854 Status : protected
1856 This method is called by get_types() to generate the query fragment
1857 and bind arguments for the GROUP BY part of the query that retrieves
1858 lists of feature types. The four positional arguments are as follows:
1860 $refseq reference sequence name
1861 $start start of region
1862 $stop end of region
1863 $want_count true to return the count of this feature type
1865 =cut
1867 sub make_types_group_part {
1868 my $self = shift;
1869 my ($srcseq,$start,$stop,$want_count) = @_;
1870 return unless $srcseq or $want_count;
1871 return 'ftype.ftypeid,ftype.fmethod,ftype.fsource';
1875 =head2 get_feature_id
1877 Title : get_feature_id
1878 Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid)
1879 Function: get the ID of a feature
1880 Returns : an integer ID or undef
1881 Args : none
1882 Status : private
1884 This internal method is called by load_gff_line to look up the integer
1885 ID of an existing feature. It is ony needed when replacing a feature
1886 with new information.
1888 =cut
1890 # this method is called when needed to look up a feature's ID
1891 sub get_feature_id {
1892 my $self = shift;
1893 my ($ref,$start,$stop,$typeid,$groupid) = @_;
1894 my $s = $self->{load_stuff};
1895 unless ($s->{get_feature_id}) {
1896 my $dbh = $self->features_db;
1897 $s->{get_feature_id} =
1898 $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? AND gid=?');
1900 my $sth = $s->{get_feature_id} or return;
1901 $sth->execute($ref,$start,$stop,$typeid,$groupid) or return;
1902 my ($fid) = $sth->fetchrow_array;
1903 return $fid;
1908 =head2 make_abscoord_query
1910 Title : make_abscoord_query
1911 Usage : $sth = $db->make_abscoord_query($name,$class);
1912 Function: create query that finds the reference sequence coordinates given a landmark & classa
1913 Returns : a DBI statement handle
1914 Args : name and class of landmark
1915 Status : protected
1917 The statement handler should return rows containing five fields:
1919 1. reference sequence name
1920 2. reference sequence class
1921 3. start position
1922 4. stop position
1923 5. strand ("+" or "-")
1925 This query always returns "Sequence" as the class of the reference
1926 sequence.
1928 =cut
1930 # given sequence name, return (reference,start,stop,strand)
1931 sub make_abscoord_query {
1932 my $self = shift;
1933 my ($name,$class,$refseq) = @_;
1934 #my $query = GETSEQCOORDS;
1935 my $query = $self->getseqcoords_query();
1936 my $getforcedseqcoords = $self->getforcedseqcoords_query() ;
1937 if ($name =~ /\*/) {
1938 $name =~ s/%/\\%/g;
1939 $name =~ s/_/\\_/g;
1940 $name =~ tr/*/%/;
1941 $query =~ s/gname=\?/gname LIKE ?/;
1943 defined $refseq ? $self->dbh->do_query($getforcedseqcoords,$name,$class,$refseq)
1944 : $self->dbh->do_query($query,$name,$class);
1947 sub make_aliasabscoord_query {
1948 my $self = shift;
1949 my ($name,$class) = @_;
1950 #my $query = GETALIASCOORDS;
1951 my $query = $self->getaliascoords_query();
1952 if ($name =~ /\*/) {
1953 $name =~ s/%/\\%/g;
1954 $name =~ s/_/\\_/g;
1955 $name =~ tr/*/%/;
1956 $query =~ s/gname=\?/gname LIKE ?/;
1958 $self->dbh->do_query($query,$name,$class);
1961 sub getseqcoords_query {
1962 shift->throw("getseqcoords_query(): must be implemented by a subclass");
1965 sub getaliascoords_query {
1966 shift->throw("getaliascoords_query(): must be implemented by a subclass");
1969 sub bin_query {
1970 my $self = shift;
1971 my ($start,$stop,$minbin,$maxbin) = @_;
1972 if ($start && $start < 0 && $stop > 0) { # split the queries
1973 my ($lower_query,@lower_args) = $self->_bin_query($start,0,$minbin,$maxbin);
1974 my ($upper_query,@upper_args) = $self->_bin_query(0,$stop,$minbin,$maxbin);
1975 my $query = "$lower_query\n\t OR $upper_query";
1976 my @args = (@lower_args,@upper_args);
1977 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
1978 } else {
1979 return $self->_bin_query($start,$stop,$minbin,$maxbin);
1983 sub _bin_query {
1984 my $self = shift;
1985 my ($start,$stop,$minbin,$maxbin) = @_;
1986 my ($query,@args);
1988 $start = 0 unless defined($start);
1989 $stop = $self->meta('max_bin') unless defined($stop);
1991 my @bins;
1992 $minbin = defined $minbin ? $minbin : $self->min_bin;
1993 $maxbin = defined $maxbin ? $maxbin : $self->max_bin;
1994 my $tier = $maxbin;
1995 while ($tier >= $minbin) {
1996 my ($tier_start,$tier_stop) = (bin_bot($tier,$start)-EPSILON(),bin_top($tier,$stop)+EPSILON());
1997 ($tier_start,$tier_stop) = ($tier_stop,$tier_start) if $tier_start > $tier_stop; # can happen when working with negative coordinates
1998 if ($tier_start == $tier_stop) {
1999 push @bins,'fbin=?';
2000 push @args,$tier_start;
2001 } else {
2002 push @bins,'fbin between ? and ?';
2003 push @args,($tier_start,$tier_stop);
2005 $tier /= 10;
2007 $query = join("\n\t OR ",@bins);
2008 return wantarray ? ($query,@args)
2009 : $self->dbh->dbi_quote($query,@args);
2012 # find features that overlap a given range
2013 sub overlap_query {
2014 my $self = shift;
2015 my ($start,$stop) = @_;
2017 my ($query,@args);
2018 my ($iq,@iargs) = $self->overlap_query_nobin($start,$stop);
2019 if (OPTIMIZE) {
2020 my ($bq,@bargs) = $self->bin_query($start,$stop);
2021 $query = "($bq)\n\tAND $iq";
2022 @args = (@bargs,@iargs);
2024 else {
2025 $query = $iq;
2026 @args = @iargs;
2029 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
2032 # find features that are completely contained within a ranged
2033 sub contains_query {
2034 my $self = shift;
2035 my ($start,$stop) = @_;
2036 my ($bq,@bargs) = $self->bin_query($start,$stop,undef,bin($start,$stop,$self->min_bin));
2037 my ($iq,@iargs) = $self->contains_query_nobin($start,$stop);
2038 my $query = "($bq)\n\tAND $iq";
2039 my @args = (@bargs,@iargs);
2040 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
2043 # find features that are completely contained within a range
2044 sub contained_in_query {
2045 my $self = shift;
2046 my ($start,$stop) = @_;
2047 my ($bq,@bargs) = $self->bin_query($start,$stop,abs($stop-$start)+1,undef);
2048 my ($iq,@iargs) = $self->contained_in_query_nobin($start,$stop);
2049 my $query = "($bq)\n\tAND $iq";
2050 my @args = (@bargs,@iargs);
2051 return wantarray ? ($query,@args) : $self->dbh->dbi_quote($query,@args);
2054 # implement the _delete_fattribute_to_feature() method
2055 sub _delete_fattribute_to_feature {
2056 my $self = shift;
2057 my @feature_ids = @_;
2058 my $dbh = $self->features_db;
2059 my $fields = join ',',map{$dbh->quote($_)} @feature_ids;
2061 my $query = "delete from fattribute_to_feature where fid in ($fields)";
2062 warn "$query\n" if $self->debug;
2063 my $result = $dbh->do($query);
2064 defined $result or $self->throw($dbh->errstr);
2065 $result;
2068 # implement the _delete_features() method
2069 sub _delete_features {
2070 my $self = shift;
2071 my @feature_ids = @_;
2072 my $dbh = $self->features_db;
2073 my $fields = join ',',map{$dbh->quote($_)} @feature_ids;
2075 # delete from fattribute_to_feature
2076 $self->_delete_fattribute_to_feature(@feature_ids);
2078 my $query = "delete from fdata where fid in ($fields)";
2079 warn "$query\n" if $self->debug;
2080 my $result = $dbh->do($query);
2081 defined $result or $self->throw($dbh->errstr);
2082 $result;
2085 # implement the _delete_groups() method
2086 sub _delete_groups {
2087 my $self = shift;
2088 my @group_ids = @_;
2089 my $dbh = $self->features_db;
2090 my $fields = join ',',map{$dbh->quote($_)} @group_ids;
2092 foreach my $gid (@group_ids){
2093 my @features = $self->get_feature_by_gid($gid);
2094 $self->delete_features(@features);
2097 my $query = "delete from fgroup where gid in ($fields)";
2098 warn "$query\n" if $self->debug;
2099 my $result = $dbh->do($query);
2100 defined $result or $self->throw($dbh->errstr);
2101 $result;
2104 # implement the _delete() method
2105 sub _delete {
2106 my $self = shift;
2107 my $delete_spec = shift;
2108 my $ranges = $delete_spec->{segments} || [];
2109 my $types = $delete_spec->{types} || [];
2110 my $force = $delete_spec->{force};
2111 my $range_type = $delete_spec->{range_type};
2112 my $dbh = $self->features_db;
2114 my $query = 'delete from fdata';
2115 my @where;
2117 my @range_part;
2118 for my $segment (@$ranges) {
2119 my $ref = $dbh->quote($segment->abs_ref);
2120 my $start = $segment->abs_start;
2121 my $stop = $segment->abs_stop;
2122 my $range = $range_type eq 'overlaps' ? $self->overlap_query($start,$stop)
2123 : $range_type eq 'contains' ? $self->contains_query($start,$stop)
2124 : $range_type eq 'contained_in' ? $self->contained_in_query($start,$stop)
2125 : $self->throw("Invalid range type '$range_type'");
2126 push @range_part,"(fref=$ref AND $range)";
2128 push @where,'('. join(' OR ',@range_part).')' if @range_part;
2130 # get all the types
2131 if (@$types) {
2132 my $types_where = $self->types_query($types);
2133 my $types_query = "select ftypeid from ftype where $types_where";
2134 my $result = $dbh->selectall_arrayref($types_query);
2135 my @typeids = map {$_->[0]} @$result;
2136 my $typelist = join ',',map{$dbh->quote($_)} @typeids;
2137 $typelist ||= "0"; # don't cause DBI to die with invalid SQL when
2138 # unknown feature types were requested.
2139 push @where,"(ftypeid in ($typelist))";
2141 $self->throw("This operation would delete all feature data and -force not specified")
2142 unless @where || $force;
2143 $query .= " where ".join(' and ',@where) if @where;
2144 warn "$query\n" if $self->debug;
2145 my $result = $dbh->do($query);
2147 defined $result or $self->throw($dbh->errstr);
2148 $result;
2152 =head2 feature_summary
2154 Title : feature_summary
2155 Usage : $summary = $db->feature_summary(@args)
2156 Function: returns a coverage summary across indicated region/type
2157 Returns : a Bio::SeqFeatureI object containing the "coverage" tag
2158 Args : see below
2159 Status : public
2161 This method is used to get coverage density information across a
2162 region of interest. You provide it with a region of interest, optional
2163 a list of feature types, and a count of the number of bins over which
2164 you want to calculate the coverage density. An object is returned
2165 corresponding to the requested region. It contains a tag called
2166 "coverage" that will return an array ref of "bins" length. Each
2167 element of the array describes the number of features that overlap the
2168 bin at this position.
2170 Arguments:
2172 Argument Description
2173 -------- -----------
2175 -seq_id Sequence ID for the region
2176 -start Start of region
2177 -end End of region
2178 -type/-types Feature type of interest or array ref of types
2179 -bins Number of bins across region. Defaults to 1000.
2180 -iterator Return an iterator across the region
2182 Note that this method uses an approximate algorithm that is only
2183 accurate to 500 bp, so when dealing with bins that are smaller than
2184 1000 bp, you may see some shifting of counts between adjacent bins.
2186 Although an -iterator option is provided, the method only ever returns
2187 a single feature, so this is fairly useless.
2189 =cut
2192 sub feature_summary {
2193 my $self = shift;
2194 my ($seq_name,$start,$end,$types,$bins,$iterator) =
2195 rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
2196 ['TYPES','TYPE','PRIMARY_TAG'],
2197 'BINS',
2198 'ITERATOR',
2199 ],@_);
2200 my ($coverage,$tag) = $self->coverage_array(-seqid=> $seq_name,
2201 -start=> $start,
2202 -end => $end,
2203 -type => $types,
2204 -bins => $bins) or return;
2205 my $score = 0;
2206 for (@$coverage) { $score += $_ }
2207 $score /= @$coverage;
2209 my $feature = Bio::SeqFeature::Lite->new(-seq_id => $seq_name,
2210 -start => $start,
2211 -end => $end,
2212 -type => $tag,
2213 -score => $score,
2214 -attributes =>
2215 { coverage => [$coverage] });
2216 return $iterator
2217 ? Bio::DB::GFF::FeatureIterator->new($feature)
2218 : $feature;
2221 =head2 coverage_array
2223 Title : coverage_array
2224 Usage : $arrayref = $db->coverage_array(@args)
2225 Function: returns a coverage summary across indicated region/type
2226 Returns : an array reference
2227 Args : see below
2228 Status : public
2230 This method is used to get coverage density information across a
2231 region of interest. The arguments are identical to feature_summary,
2232 except that instead of returning a Bio::SeqFeatureI object, it returns
2233 an array reference of the desired number of bins. The value of each
2234 element corresponds to the number of features in the bin.
2236 Arguments:
2238 Argument Description
2239 -------- -----------
2241 -seq_id Sequence ID for the region
2242 -start Start of region
2243 -end End of region
2244 -type/-types Feature type of interest or array ref of types
2245 -bins Number of bins across region. Defaults to 1000.
2247 Note that this method uses an approximate algorithm that is only
2248 accurate to 500 bp, so when dealing with bins that are smaller than
2249 1000 bp, you may see some shifting of counts between adjacent bins.
2251 =cut
2253 sub coverage_array {
2254 my $self = shift;
2255 my ($seq_name,$start,$end,$types,$bins) =
2256 rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
2257 ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);
2259 $types = $self->parse_types($types);
2260 my $dbh = $self->features_db;
2262 $bins ||= 1000;
2263 $start ||= 1;
2264 unless ($end) {
2265 my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
2266 $end = $segment->end;
2269 my $binsize = ($end-$start+1)/$bins;
2270 my $seqid = $seq_name;
2272 return [] unless $seqid;
2274 # where each bin starts
2275 my @his_bin_array = map {$start + $binsize * $_} (0..$bins);
2276 my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;
2278 my $interval_stats_table = 'finterval_stats';
2280 # pick up the type ids
2281 my ($type_from,@a) = $self->types_query($types);
2282 my $query = "select ftypeid,fmethod,fsource from ftype where $type_from";
2283 my $sth = $dbh->prepare_delayed($query);
2284 my (@t,$report_tag);
2285 $sth->execute(@a);
2286 while (my ($t,$method,$source) = $sth->fetchrow_array) {
2287 $report_tag ||= "$method:$source";
2288 push @t,$t;
2292 my %bins;
2293 my $sql = <<END;
2294 SELECT fbin,fcum_count
2295 FROM $interval_stats_table
2296 WHERE ftypeid=?
2297 AND fref=? AND fbin >= ?
2298 LIMIT 1
2301 $sth = $dbh->prepare_delayed($sql) or warn $dbh->errstr;
2302 eval {
2303 for my $typeid (@t) {
2305 for (my $i=0;$i<@sum_bin_array;$i++) {
2307 my @args = ($typeid,$seqid,$sum_bin_array[$i]);
2308 $self->_print_query($sql,@args) if $self->debug;
2310 $sth->execute(@args) or $self->throw($sth->errstr);
2311 my ($bin,$cum_count) = $sth->fetchrow_array;
2312 push @{$bins{$typeid}},[$bin,$cum_count];
2316 return unless %bins;
2318 my @merged_bins;
2319 my $firstbin = int(($start-1)/$binsize);
2320 for my $type (keys %bins) {
2321 my $arry = $bins{$type};
2322 my $last_count = $arry->[0][1];
2323 my $last_bin = -1;
2324 my $i = 0;
2325 my $delta;
2326 for my $b (@$arry) {
2327 my ($bin,$count) = @$b;
2328 $delta = $count - $last_count if $bin > $last_bin;
2329 $merged_bins[$i++] = $delta;
2330 $last_count = $count;
2331 $last_bin = $bin;
2335 return wantarray ? (\@merged_bins,$report_tag) : \@merged_bins;
2339 =head2 build_summary_statistics
2341 Title : build_summary_statistics
2342 Usage : $db->build_summary_statistics
2343 Function: prepares the table needed to call feature_summary()
2344 Returns : nothing
2345 Args : none
2346 Status : public
2348 This method is used to build the summary statistics table that is used
2349 by the feature_summary() and coverage_array() methods. It needs to be
2350 called whenever the database is updated.
2352 =cut
2354 sub build_summary_statistics {
2355 my $self = shift;
2356 my $interval_stats_table = 'finterval_stats';
2357 my $dbh = $self->dbh;
2358 $dbh->begin_work;
2360 my $sbs = SUMMARY_BIN_SIZE;
2362 my $result = eval {
2363 $self->_add_interval_stats_table;
2364 $self->disable_keys($interval_stats_table);
2365 $dbh->do("DELETE FROM $interval_stats_table");
2367 my $insert = $dbh->prepare(<<END) or $self->throw($dbh->errstr);
2368 INSERT INTO $interval_stats_table
2369 (ftypeid,fref,fbin,fcum_count)
2370 VALUES (?,?,?,?)
2375 my $sql = 'select ftypeid,fref,fstart,fstop from fdata order by ftypeid,fref,fstart';
2376 my $select = $dbh->prepare($sql) or $self->throw($dbh->errstr);
2378 my $current_bin = -1;
2379 my ($current_type,$current_seqid,$count);
2380 my $cum_count = 0;
2381 my (%residuals,$last_bin);
2383 my $le = -t \*STDERR ? "\r" : "\n";
2385 $select->execute;
2387 while (my($typeid,$seqid,$start,$end) = $select->fetchrow_array) {
2388 print STDERR $count," features processed$le" if ++$count % 1000 == 0;
2390 my $bin = int($start/$sbs);
2391 $current_type ||= $typeid;
2392 $current_seqid ||= $seqid;
2394 # because the input is sorted by start, no more features will contribute to the
2395 # current bin so we can dispose of it
2396 if ($bin != $current_bin) {
2397 if ($seqid != $current_seqid or $typeid != $current_type) {
2398 # load all bins left over
2399 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
2400 %residuals = () ;
2401 $cum_count = 0;
2402 } else {
2403 # load all up to current one
2404 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin);
2408 $last_bin = $current_bin;
2409 ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin);
2411 # summarize across entire spanned region
2412 my $last_bin = int(($end-1)/$sbs);
2413 for (my $b=$bin;$b<=$last_bin;$b++) {
2414 $residuals{$b}++;
2417 # handle tail case
2418 # load all bins left over
2419 $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
2420 $self->enable_keys($interval_stats_table);
2424 if ($result) { $dbh->commit } else { warn "Can't build summary statistics: $@"; $dbh->rollback };
2425 print STDERR "\n";
2428 sub _load_bins {
2429 my $self = shift;
2430 my ($insert,$residuals,$cum_count,$type,$seqid,$stop_after) = @_;
2431 for my $b (sort {$a<=>$b} keys %$residuals) {
2432 last if defined $stop_after and $b > $stop_after;
2433 $$cum_count += $residuals->{$b};
2434 my @args = ($type,$seqid,$b,$$cum_count);
2435 $insert->execute(@args) or warn $insert->errstr;
2436 delete $residuals->{$b}; # no longer needed
2440 sub _add_interval_stats_table {
2441 my $self = shift;
2442 my $schema = $self->schema;
2443 my $create_table_stmt = $schema->{'finterval_stats'}{'table'};
2444 my $dbh = $self->features_db;
2445 $dbh->do("drop table finterval_stats");
2446 $dbh->do($create_table_stmt) || warn $dbh->errstr;
2449 sub disable_keys { } # noop
2450 sub enable_keys { } # noop
2454 __END__
2456 =head1 BUGS
2458 Schemas need work to support multiple hierarchical groups.
2460 =head1 SEE ALSO
2462 L<Bio::DB::GFF>, L<bioperl>
2464 =head1 AUTHOR
2466 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
2468 Copyright (c) 2001 Cold Spring Harbor Laboratory.
2470 This library is free software; you can redistribute it and/or modify
2471 it under the same terms as Perl itself.
2473 =cut