1 package Bio
::DB
::GFF
::Adaptor
::dbi
::mysql
;
5 Bio::DB::GFF::Adaptor::dbi::mysql -- Database adaptor for a specific mysql schema
13 # a simple mysql adaptor
15 use Bio
::DB
::GFF
::Util
::Rearrange
; # for rearrange()
16 use Bio
::DB
::GFF
::Util
::Binning
;
17 use base
qw(Bio::DB::GFF::Adaptor::dbi);
19 use constant MAX_SEGMENT
=> 100_000_000
; # the largest a segment can get
21 use constant GETSEQCOORDS
=><<END;
23 IF(ISNULL(gclass),'Sequence',gclass),
31 AND fgroup.gid=fdata.gid
32 GROUP BY fref,fstrand,gname
36 use constant GETALIASCOORDS
=><<END;
38 IF(ISNULL(gclass),'Sequence',gclass),
43 FROM fdata,fgroup,fattribute,fattribute_to_feature
44 WHERE fattribute_to_feature.fattribute_value=?
46 AND fgroup.gid=fdata.gid
47 AND fattribute.fattribute_name='Alias'
48 AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
49 AND fattribute_to_feature.fid=fdata.fid
50 GROUP BY fref,fstrand,gname
54 use constant GETALIASLIKE
=><<END;
56 IF(ISNULL(gclass),'Sequence',gclass),
61 FROM fdata,fgroup,fattribute,fattribute_to_feature
62 WHERE fattribute_to_feature.fattribute_value LIKE ?
64 AND fgroup.gid=fdata.gid
65 AND fattribute.fattribute_name='Alias'
66 AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
67 AND fattribute_to_feature.fid=fdata.fid
68 GROUP BY fref,fstrand,gname
72 use constant GETFORCEDSEQCOORDS
=><<END;
74 IF(ISNULL(gclass),'Sequence',gclass),
82 AND fgroup.gid=fdata.gid
87 use constant FULLTEXTSEARCH
=> <<END;
88 SELECT distinct gclass,gname,fattribute_value,MATCH(fattribute_value) AGAINST (?) as score,fmethod,fsource
89 FROM fgroup,fattribute_to_feature,fdata,ftype
90 WHERE fgroup.gid=fdata.gid
91 AND fdata.fid=fattribute_to_feature.fid
92 AND fdata.ftypeid=ftype.ftypeid
93 AND MATCH(fattribute_value) AGAINST (?)
99 This adaptor implements a specific mysql database schema that is
100 compatible with Bio::DB::GFF. It inherits from
101 Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF.
103 The schema uses several tables:
109 This is the feature data table. Its columns are:
111 fid feature ID (integer)
112 fref reference sequence name (string)
113 fstart start position relative to reference (integer)
114 fstop stop position relative to reference (integer)
115 ftypeid feature type ID (integer)
116 fscore feature score (float); may be null
117 fstrand strand; one of "+" or "-"; may be null
118 fphase phase; one of 0, 1 or 2; may be null
119 gid group ID (integer)
120 ftarget_start for similarity features, the target start position (integer)
121 ftarget_stop for similarity features, the target stop position (integer)
123 Note that it would be desirable to normalize the reference sequence
124 name, since there are usually many features that share the same
125 reference feature. However, in the current schema, query performance
126 suffers dramatically when this additional join is added.
130 This is the group table. There is one row for each group. Columns:
132 gid the group ID (integer)
133 gclass the class of the group (string)
134 gname the name of the group (string)
136 The group table serves multiple purposes. As you might expect, it is
137 used to cluster features that logically belong together, such as the
138 multiple exons of the same transcript. It is also used to assign a
139 name and class to a singleton feature. Finally, the group table is
140 used to identify the target of a similarity hit. This is consistent
141 with the way in which the group field is used in the GFF version 2
144 The fgroup.gid field joins with the fdata.gid field.
148 mysql> select * from fgroup where gname='sjj_2L52.1';
149 +-------+-------------+------------+
150 | gid | gclass | gname |
151 +-------+-------------+------------+
152 | 69736 | PCR_product | sjj_2L52.1 |
153 +-------+-------------+------------+
154 1 row in set (0.70 sec)
156 mysql> select fref,fstart,fstop from fdata,fgroup
157 where gclass='PCR_product' and gname = 'sjj_2L52.1'
158 and fdata.gid=fgroup.gid;
159 +---------------+--------+-------+
160 | fref | fstart | fstop |
161 +---------------+--------+-------+
162 | CHROMOSOME_II | 1586 | 2355 |
163 +---------------+--------+-------+
164 1 row in set (0.03 sec)
168 This table contains the feature types, one per row. Columns are:
170 ftypeid the feature type ID (integer)
171 fmethod the feature type method name (string)
172 fsource the feature type source name (string)
174 The ftype.ftypeid field joins with the fdata.ftypeid field. Example:
176 mysql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype
177 where gclass='PCR_product'
178 and gname = 'sjj_2L52.1'
179 and fdata.gid=fgroup.gid
180 and fdata.ftypeid=ftype.ftypeid;
181 +---------------+--------+-------+-------------+-----------+
182 | fref | fstart | fstop | fmethod | fsource |
183 +---------------+--------+-------+-------------+-----------+
184 | CHROMOSOME_II | 1586 | 2355 | PCR_product | GenePairs |
185 +---------------+--------+-------+-------------+-----------+
186 1 row in set (0.08 sec)
190 This table holds the raw DNA of the reference sequences. It has three
193 fref reference sequence name (string)
194 foffset offset of this sequence
195 fdna the DNA sequence (longblob)
197 To overcome problems loading large blobs, DNA is automatically
198 fragmented into multiple segments when loading, and the position of
199 each segment is stored in foffset. The fragment size is controlled by
200 the -clump_size argument during initialization.
202 =item fattribute_to_feature
204 This table holds "attributes", which are tag/value pairs stuffed into
205 the GFF line. The first tag/value pair is treated as the group, and
206 anything else is treated as an attribute (weird, huh?).
208 CHR_I assembly_tag Finished 2032 2036 . + . Note "Right: cTel33B"
209 CHR_I assembly_tag Polymorphism 668 668 . + . Note "A->C in cTel33B"
211 The columns of this table are:
213 fid feature ID (integer)
214 fattribute_id ID of the attribute (integer)
215 fattribute_value text of the attribute (text)
217 The fdata.fid column joins with fattribute_to_feature.fid.
221 This table holds the normalized names of the attributes. Fields are:
223 fattribute_id ID of the attribute (integer)
224 fattribute_name Name of the attribute (varchar)
228 =head2 Data Loading Methods
230 In addition to implementing the abstract SQL-generating methods of
231 Bio::DB::GFF::Adaptor::dbi, this module also implements the data
232 loading functionality of Bio::DB::GFF.
240 Usage : $db = Bio::DB::GFF->new(@args)
241 Function: create a new adaptor
242 Returns : a Bio::DB::GFF object
246 The new constructor is identical to the "dbi" adaptor's new() method,
247 except that the prefix "dbi:mysql" is added to the database DSN identifier
248 automatically if it is not there already.
253 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' or "ens0040"
255 -user username for authentication
257 -pass the password for authentication
265 my ($dsn,$other) = rearrange
([
266 [qw(FEATUREDB DB DSN)],
268 $dsn = "dbi:mysql:$dsn" if !ref($dsn) && $dsn !~ /^(?:dbi|DBI):/;
269 my $self = $class->SUPER::new
(-dsn
=>$dsn,%$other);
276 Usage : $string = $db->get_dna($name,$start,$stop,$class)
277 Function: get DNA string
279 Args : name, class, start and stop of desired segment
282 This method performs the low-level fetch of a DNA substring given its
283 name, class and the desired range. This should probably be moved to
288 sub getseqcoords_query
{
290 return GETSEQCOORDS
;
293 sub getaliascoords_query
{
295 return GETALIASCOORDS
;
299 sub getforcedseqcoords_query
{
301 return GETFORCEDSEQCOORDS
;
305 sub getaliaslike_query
{
307 return GETALIASLIKE
;
312 sub get_abscoords_bkup
{
314 my ($name,$class,$refseq) = @_;
316 my $result = $self->SUPER::get_abscoords
(@_);
317 return $result if $result;
320 if ($name =~ s/\*/%/g) {
321 $sth = $self->dbh->do_query(GETALIASLIKE
,$name,$class);
323 $sth = $self->dbh->do_query(GETALIASCOORDS
,$name,$class);
326 while (my @row = $sth->fetchrow_array) { push @result,\
@row }
330 $self->error("$name not found in database");
340 sub make_features_select_part
{
342 my $options = shift || {};
344 if (my $b = $options->{bin_width
}) {
348 1+$b*floor(fstart/$b) as fstart,
349 $b*(1+floor(fstart/$b)) as fstop,
350 IF(ISNULL(fsource),fmethod,concat(fmethod,':',fsource)),'bin',
353 IF(ISNULL(fsource),concat(fref,':',fmethod),concat(fref,':',fmethod,':',fsource)),
359 fref,fstart,fstop,fsource,fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start,ftarget_stop,fdata.fid,fdata.gid
363 $s .= ",count(fdata.fid)" if $options->{attributes
} && keys %{$options->{attributes
}}>1;
369 # WHETHER OR NOT THIS WORKS IS CRITICALLY DEPENDENT ON THE RELATIVE MAGNITUDE OF THE
370 sub make_features_from_part
{
372 my $sparse_types = shift;
373 my $options = shift || {};
374 my $sparse_groups = $options->{sparse_groups
};
375 my $index = $sparse_groups ?
' USE INDEX(gid)'
376 : $sparse_types ?
' USE INDEX(ftypeid)'
378 return $options->{attributes
} ?
"fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n"
379 : "fdata${index},ftype,fgroup\n";
385 Usage : @search_results = $db->search_notes("full text search string",$limit)
386 Function: Search the notes for a text string, using mysql full-text search
387 Returns : array of results
388 Args : full text search string, and an optional row limit
391 This is a mysql-specific method. Given a search string, it performs a
392 full-text search of the notes table and returns an array of results.
393 Each row of the returned array is a arrayref containing the following fields:
395 column 1 A Bio::DB::GFF::Featname object, suitable for passing to segment()
396 column 2 The text of the note
397 column 3 A relevance score.
403 my ($search_string,$limit) = @_;
405 $search_string =~ tr/*?//d;
407 my $query = FULLTEXTSEARCH
;
408 $query .= " limit $limit" if defined $limit;
409 my $sth = $self->dbh->do_query($query,$search_string,$search_string);
411 while (my ($class,$name,$note,$relevance,$method,$source) = $sth->fetchrow_array) {
412 next unless $class && $name; # sorry, ignore NULL objects
413 $relevance = sprintf("%.2f",$relevance); # trim long floats
414 my $featname = Bio
::DB
::GFF
::Featname
->new($class=>$name);
415 my $type = Bio
::DB
::GFF
::Typename
->new($method,$source);
416 push @results,[$featname,$note,$relevance,$type];
419 #added result filtering so that this method returns the expected results
420 #this section of code used to be in GBrowse's do_keyword_search method
422 my $match_sub = 'sub {';
423 foreach (split /\s+/,$search_string) {
424 $match_sub .= "return unless \$_[0] =~ /\Q$_\E/i; ";
427 my $match = eval $match_sub;
429 my @matches = grep { $match->($_->[1]) } @results;
436 ################################ loading and initialization ##################################
441 Usage : $schema = $db->schema
442 Function: return the CREATE script for the schema
443 Returns : a list of CREATE statemetns
447 This method returns a list containing the various CREATE statements
448 needed to initialize the database tables.
454 my $dbh = $self->dbh;
455 my ($version) = $dbh->selectrow_array('select version()');
456 my ($major, $minor) = split /\./, $version;
457 $version = "$major.$minor";
458 my $engine = $version >= 4.1 ?
'ENGINE' : 'TYPE';
463 fid
int not null auto_increment
,
464 fref varchar
(100) not null
,
467 fbin double precision
,
468 ftypeid
int not null
,
470 fstrand enum
('+','-'),
471 fphase enum
('0','1','2'),
476 unique
index(fref
,fbin
,fstart
,fstop
,ftypeid
,gid
),
485 create table fgroup
(
486 gid
int not null auto_increment
,
498 ftypeid
int not null auto_increment
,
499 fmethod varchar
(100) not null
,
500 fsource varchar
(100),
501 primary key
(ftypeid
),
504 unique ftype
(fmethod
,fsource
)
512 fref varchar
(100) not null
,
513 foffset
int(10) unsigned
not null
,
515 primary key
(fref
,foffset
)
523 fname varchar
(255) not null
,
524 fvalue varchar
(255) not null
,
532 create table fattribute
(
533 fattribute_id
int(10) unsigned
not null auto_increment
,
534 fattribute_name varchar
(255) not null
,
535 primary key
(fattribute_id
)
540 fattribute_to_feature
=> {
542 create table fattribute_to_feature
(
543 fid
int(10) not null
,
544 fattribute_id
int(10) not null
,
545 fattribute_value text
,
546 key
(fid
,fattribute_id
),
547 key
(fattribute_value
(48)),
548 fulltext
(fattribute_value
)
550 } # fattribute_to_feature table
551 },# fattribute_to_feature
555 create table finterval_stats
(
556 ftypeid integer
not null
,
557 fref varchar
(100) not null
,
558 fbin integer
not null
,
559 fcum_count integer
not null
,
560 primary key
(ftypeid
,fref
,fbin
)
562 } # finterval_stats table
571 =head2 make_classes_query
573 Title : make_classes_query
574 Usage : ($query,@args) = $db->make_classes_query
575 Function: return query fragment for generating list of reference classes
576 Returns : a query and args
582 sub make_classes_query
{
584 return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT ISNULL(gclass)';
588 =head2 make_meta_set_query
590 Title : make_meta_set_query
591 Usage : $sql = $db->make_meta_set_query
592 Function: return SQL fragment for setting a meta parameter
593 Returns : SQL fragment
597 By default this does nothing; meta parameters are not stored or
602 sub make_meta_set_query
{
603 return 'REPLACE INTO fmeta VALUES (?,?)';
609 Usage : $db->setup_load
610 Function: called before load_gff_line()
615 This method performs schema-specific initialization prior to loading a
616 set of GFF records. It prepares a set of DBI statement handlers to be
617 used in loading the data.
624 my $dbh = $self->features_db;
626 if ($self->lock_on_load) {
627 my @tables = map { "$_ WRITE"} $self->tables;
628 my $tables = join ', ',@tables;
629 $dbh->do("LOCK TABLES $tables");
631 # for my $table (qw(fdata)) {
632 # $dbh->do("alter table $table disable keys");
635 my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?');
636 my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)');
638 my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE gname=? AND gclass=?');
639 my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)');
641 my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?');
642 my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)');
643 my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)');
645 my $insert_data = $dbh->prepare_delayed(<<END);
646 INSERT INTO fdata (fref,fstart,fstop,fbin,ftypeid,fscore,
647 fstrand,fphase,gid,ftarget_start,ftarget_stop)
648 VALUES(?,?,?,?,?,?,?,?,?,?,?)
653 $self->{load_stuff
}{sth
}{lookup_ftype
} = $lookup_type;
654 $self->{load_stuff
}{sth
}{insert_ftype
} = $insert_type;
655 $self->{load_stuff
}{sth
}{lookup_fgroup
} = $lookup_group;
656 $self->{load_stuff
}{sth
}{insert_fgroup
} = $insert_group;
657 $self->{load_stuff
}{sth
}{insert_fdata
} = $insert_data;
658 $self->{load_stuff
}{sth
}{lookup_fattribute
} = $lookup_attribute;
659 $self->{load_stuff
}{sth
}{insert_fattribute
} = $insert_attribute;
660 $self->{load_stuff
}{sth
}{insert_fattribute_value
} = $insert_attribute_value;
661 $self->{load_stuff
}{types
} = {};
662 $self->{load_stuff
}{groups
} = {};
663 $self->{load_stuff
}{counter
} = 0;
668 Title : load_gff_line
669 Usage : $db->load_gff_line($fields)
670 Function: called to load one parsed line of GFF
671 Returns : true if successfully inserted
672 Args : hashref containing GFF fields
675 This method is called once per line of the GFF and passed a series of
676 parsed data items that are stored into the hashref $fields. The keys are:
678 ref reference sequence
679 source annotation source
680 method annotation method
681 start annotation start
683 score annotation score (may be undef)
684 strand annotation strand (may be undef)
685 phase annotation phase (may be undef)
686 group_class class of annotation's group (may be undef)
687 group_name ID of annotation's group (may be undef)
688 target_start start of target of a similarity hit
689 target_stop stop of target of a similarity hit
690 attributes array reference of attributes, each of which is a [tag=>value] array ref
698 my $s = $self->{load_stuff
};
699 my $dbh = $self->features_db;
700 local $dbh->{PrintError
} = 0;
702 defined(my $typeid = $self->get_table_id('ftype', $gff->{method
} => $gff->{source
})) or return;
703 defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname
} => $gff->{gclass
})) or return;
705 if ($gff->{stop
}-$gff->{start
}+1 > $self->max_bin) {
706 warn "$gff->{gclass}:$gff->{gname} is ",$gff->{stop
}-$gff->{start
}+1,
707 " bp long, but the maximum indexable feature is set to ",$self->max_bin," bp.\n";
708 warn "Please set the maxbin value to a length at least as large as the largest feature you wish to store.\n";
709 warn "\n* You will need to reinitialize the database from scratch.\n";
710 warn "* With the Perl API you do this using the -max_bin argument to \$db->initialize().\n";
711 warn "* With the command-line tools you do with this with --maxfeature option.\n";
714 my $bin = bin
($gff->{start
},$gff->{stop
},$self->min_bin);
715 my $result = $s->{sth
}{insert_fdata
}->execute($gff->{ref},
716 $gff->{start
},$gff->{stop
},$bin,
718 $gff->{score
},$gff->{strand
},$gff->{phase
},
720 $gff->{tstart
},$gff->{tstop
});
722 warn $dbh->errstr,"\n" && return unless $result;
724 my $fid = $dbh->{mysql_insertid
}
725 || $self->get_feature_id($gff->{ref},$gff->{start
},$gff->{stop
},$typeid,$groupid);
729 foreach (@
{$gff->{attributes
}}) {
730 defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return;
731 $s->{sth
}{insert_fattribute_value
}->execute($fid,$attribute_id,$_->[1]);
734 if ( (++$s->{counter
} % 1000) == 0) {
735 print STDERR
"$s->{counter} records loaded...";
736 print STDERR
-t STDOUT
&& !$ENV{EMACS
} ?
"\r" : "\n";
744 my $dbh = $self->features_db;
745 local $dbh->{PrintError
} = 0;
746 # for my $table (qw(fdata)) {
747 # $dbh->do("alter table $table enable keys");
749 $self->SUPER::finish_load
;
753 sub insert_sequence
{
755 my($id,$offset,$seq) = @_;
756 my $sth = $self->{_insert_sequence
}
757 ||= $self->dbh->prepare_delayed('replace into fdna values (?,?,?)');
758 $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr);
765 Usage : $integer = $db->get_table_id($table,@ids)
766 Function: get the ID of a group or type
767 Returns : an integer ID or undef
771 This internal method is called by load_gff_line to look up the integer
772 ID of an existing feature type or group. The arguments are the name
773 of the table, and two string identifiers. For feature types, the
774 identifiers are the method and source. For groups, the identifiers
775 are group name and class.
777 This method requires that a statement handler named I<lookup_$table>,
778 have been created previously by setup_load(). It is here to overcome
779 deficiencies in mysql's INSERT syntax.
784 # get the object ID from a named table
790 # irritating warning for null id
794 $id_key = join ':',@ids;
797 my $s = $self->{load_stuff
};
799 my $dbh = $self->features_db;
801 unless (defined($s->{$table}{$id_key})) {
803 #########################################
804 # retrieval of the last inserted id is now located at the adaptor and not in caching_handle
805 #######################################
806 if ( (my $result = $sth->{"lookup_$table"}->execute(@ids)) > 0) {
807 $s->{$table}{$id_key} = ($sth->{"lookup_$table"}->fetchrow_array)[0];
809 $sth->{"insert_$table"}->execute(@ids)
810 && ($s->{$table}{$id_key} = $self->insertid($sth->{"insert_$table"}));
811 #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}{sth}{mysql_insertid});
812 #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid);
816 my $id = $s->{$table}{$id_key};
817 unless (defined $id) {
818 warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n";
827 $s->{mysql_insertid
};
831 =head2 get_feature_id
833 Title : get_feature_id
834 Usage : $integer = $db->get_feature_id($ref,$start,$stop,$typeid,$groupid)
835 Function: get the ID of a feature
836 Returns : an integer ID or undef
840 This internal method is called by load_gff_line to look up the integer
841 ID of an existing feature. It is ony needed when replacing a feature
842 with new information.
846 # this method is called when needed to look up a feature's ID
849 my ($ref,$start,$stop,$typeid,$groupid) = @_;
850 my $s = $self->{load_stuff
};
851 unless ($s->{get_feature_id
}) {
852 my $dbh = $self->features_db;
853 $s->{get_feature_id
} =
854 $dbh->prepare_delayed('SELECT fid FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND ftypeid=? AND gid=?');
856 my $sth = $s->{get_feature_id
} or return;
857 $sth->execute($ref,$start,$stop,$typeid,$groupid) or return;
858 my ($fid) = $sth->fetchrow_array;
862 sub _add_interval_stats_table
{
864 my $schema = $self->schema;
865 my $create_table_stmt = $schema->{'finterval_stats'}{'table'};
866 $create_table_stmt =~ s/create table/create table if not exists/i;
867 my $dbh = $self->features_db;
868 $dbh->do($create_table_stmt) || warn $dbh->errstr;
874 my $dbh = $self->dbh;
875 $dbh->do("alter table $table disable keys");
880 my $dbh = $self->dbh;
881 $dbh->do("alter table $table enable keys");
897 L<Bio::DB::GFF>, L<bioperl>
901 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
903 Copyright (c) 2002 Cold Spring Harbor Laboratory.
905 This library is free software; you can redistribute it and/or modify
906 it under the same terms as Perl itself.