fix spelling errors, fixes #3228
[bioperl-live.git] / Bio / DB / GFF / Adaptor / dbi / pg_fts.pm
blob5640d43c1f9af98dccba484473710608d6872a84
1 package Bio::DB::GFF::Adaptor::dbi::pg_fts;
4 =head1 NAME
6 Bio::DB::GFF::Adaptor::dbi::pg_fts -- Database adaptor for a specific postgres schema with a TSearch2 implementation
8 =head1 SYNOPSIS
10 #create new GFF database connection
11 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::pg_fts',
12 -dsn => 'dbi:Pg:dbname=worm');
14 #add full text indexing 'stuff'
15 #assumes that TSearch2 is available to PostgreSQL
16 #this will take a VERY long time for a reasonably large database
17 $db->install_TSearch2();
19 ...some time later...
20 #we don't like full text searching...
21 $db->remove_TSearch2();
23 =head1 DESCRIPTION
25 This adaptor is based on Bio::DB::GFF::Adaptor::dbi::pg but it implements
26 the TSearch2 PostgreSQL contrib module for fast full text searching. To
27 use this module with your PostgreSQL GFF database, you need to make
28 TSearch2 available in the database.
30 To use this adaptor, follow these steps:
32 =over
34 =item Install TSearch2 contrib module for Pg
36 Can be as easy as `sudo yum install postgresql-contrib`, or you may
37 need to recompile PostgreSQL to include it. See
38 L<http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/tsearch-V2-intro.html>
39 for more details
41 =item Load the TSearch2 functions to you database
43 % cat tsearch2.sql | psql <your database>
45 =item Load your data using the pg adaptor:
47 % bp_pg_bulk_load_gff.pl -c -d yeast saccharomyces_cerevisiae.gff
51 % bp_load_gff.pl -c -d yeast -a dbi::pg saccharomyces_cerevisiae.gff
53 =item Add GFF/TSearch2 specific modifications
55 Execute a perl script like this one:
57 #!/usr/bin/perl -w
58 use strict;
60 use Bio::DB::GFF;
62 my $db = Bio::DB::GFF->new(
63 -adaptor => 'dbi::pg_fts',
64 -dsn => 'dbi:Pg:dbname=yeast',
65 -user => 'scott',
68 print "Installing TSearch2 columns...\n";
70 $db->install_TSearch2();
72 print "Done\n";
74 =back
76 Note that this last step will take a long time. For a S. cerevisiae
77 database with 15K rows, it took over an hour on my laptop, and
78 with a C. elegans database (~10 million rows) it took well over a day.
80 If at some point you add more data you your database, you need to run
81 a similar script to the one above, only executing the update_TSearch2()
82 method. Finally, if you want to remove the TSearch2 columns from your
83 database and go back to using the pg adaptor, you can execute a script
84 like the one above, only executing the remove_TSearch2() method.
86 =head1 NOTES ABOUT TSearch2 SEARCHING
88 You should know a few things about how searching with TSearch2 works in
89 the GBrowse environment:
91 =over
93 =item 1
95 TSearch2 does not do wild cards, so you should encourage your users not
96 to use them. If wild cards are used, the adaptor will fall back on
97 an ILIKE search, which will be much slower.
99 =item 2
101 However, TSearch2 does do 'word stemming'. That is, if you search
102 for 'copy', it will find 'copy', 'copies', and 'copied'.
104 =item 3
106 TSearch2 does not do phrase searching; all of the terms in the
107 search string are ANDed together.
109 =back
111 =head1 ACKNOWLEDGEMENTS
113 Special thanks to Russell Smithies and Paul Smale at AgResearch in
114 New Zealand for giving me their recipe for doing full text indexing
115 in a GFF database.
117 =head1 BUGS
119 Please report bugs to the BioPerl and/or GBrowse mailing lists
120 (L<mailto:bioperl-l@lists.open-bio.org> and L<mailto:gmod-gbrowse@lists.sourceforge.net>
121 respectively).
123 =head1 SEE ALSO
125 Please see L<Bio::DB::GFF::Adaptor::dbi::pg> for more information
126 about tuning your PostgreSQL server for GFF data, and for general
127 information about GFF database access, see L<Bio::DB::GFF>.
129 =head1 AUTHOR
131 Scott Cain, cain@cshl.edu
133 =head1 APPENDIX
135 =cut
137 # a simple postgres adaptor
138 use strict;
139 use Bio::DB::GFF::Adaptor::dbi;
140 use base qw(Bio::DB::GFF::Adaptor::dbi::pg);
142 use constant FULLTEXTSEARCH => <<END;
143 SELECT distinct gclass,gname,fattribute_value,fmethod,fsource
144 FROM fgroup,fattribute_to_feature,fdata,ftype
145 WHERE fgroup.gid=fdata.gid
146 AND fdata.fid=fattribute_to_feature.fid
147 AND fdata.ftypeid=ftype.ftypeid
148 AND (fattribute_to_feature.idxfti @@ to_tsquery('default', ?))
152 use constant FULLTEXTWILDCARD => <<END;
153 SELECT distinct gclass,gname,fattribute_value,fmethod,fsource
154 FROM fgroup,fattribute_to_feature,fdata,ftype
155 WHERE fgroup.gid=fdata.gid
156 AND fdata.fid=fattribute_to_feature.fid
157 AND fdata.ftypeid=ftype.ftypeid
158 AND lower(fattribute_to_feature.fattribute_value) LIKE lower(?)
162 sub new {
163 my $class = shift;
164 my $self = $class->SUPER::new(@_);
165 return $self;
168 =head2 search_notes
170 Title : search_notes
171 Usage : @search_results = $db->search_notes("full text string",$limit)
172 Function: Search the notes for a text string, using PostgreSQL TSearch2
173 Returns : array of results
174 Args : full text search string, and an optional row limit
175 Status : public
177 This is based on the mysql-specific method that makes use of the TSearch2
178 functionality in PosgreSQL's contrib directory. Given a search string,
179 it performs a full-text search of the notes table and returns an array
180 of results. Each row of the returned array is a arrayref containing
181 the following fields:
183 column 1 A Bio::DB::GFF::Featname object, for passing to segment()
184 column 2 The text of the note
185 column 3 A relevance score.
187 =cut
189 sub search_notes {
190 my $self = shift;
191 my ($search_string,$limit) = @_;
193 my @terms = split /\s+/, $search_string;
195 my $sth;
196 if ($search_string =~ /\*/) {
197 $search_string =~ tr/*/%/s;
198 my $query = FULLTEXTWILDCARD;
199 $query .= " limit $limit" if defined $limit;
200 $sth = $self->dbh->do_query($query,$search_string);
202 elsif (@terms == 1) {
203 my $query = FULLTEXTSEARCH;
204 $query .= " limit $limit" if defined $limit;
205 $sth = $self->dbh->do_query($query,$search_string);
207 else {
208 my $query = FULLTEXTSEARCH;
209 my $andstring = join (' & ', @terms);
210 # $query .= qq{ AND (fattribute_to_feature.fattribute_value ILIKE '\%$search_string%')};
211 $query .= " LIMIT $limit" if defined $limit;
212 $sth = $self->dbh->do_query($query,$andstring);
215 my @results;
216 while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) {
218 next unless $class && $name; # sorry, ignore NULL objects
219 my $featname = Bio::DB::GFF::Featname->new($class=>$name);
220 my $type = Bio::DB::GFF::Typename->new($method,$source);
221 push @results,[$featname,$note,0,$type]; #gbrowse expects a score, but
222 #pg doesn't give one, thus the 0
225 return @results;
228 =head2 make_features_by_name_where_part
230 Title : make_features_by_name_where_part
231 Function: constructs a TSearch2-compliant WHERE clause for a name search
232 Status : protected
234 =cut
236 #need a make_features_by_name_where_part method to override pg
237 sub make_features_by_name_where_part {
238 my $self = shift;
239 my ($class,$name) = @_;
241 my @terms = split /\s+/, $name;
243 if ($name =~ /\*/) {
244 $name =~ tr/*/%/s;
245 return ("fgroup.gclass=? AND lower(fgroup.gname) LIKE lower(?)",$class,$name);
247 else {
248 my $where_str = "fgroup.gclass=? AND (fgroup.idxfti @@ to_tsquery('default', ?)) ";
249 if (@terms == 1) {
250 return ($where_str,$class,$name);
252 else {
253 my $andstring = join (' & ', @terms);
254 # $where_str .= qq{ AND (fgroup.gname ILIKE '\%$name%')};
255 return ($where_str,$class,$andstring);
260 =head2 install_TSearch2
262 Title : install_TSearch2
263 Function: installs schema modifications for use with TSearch2
264 Usage : $db->install_TSearch2
265 Status : public
267 =cut
270 #needs method for installing TSearch2 (does that mean that the SQL for
271 #creating the tables and functions should go in here? That would be
272 #the safest and easiest thing to do
273 sub install_TSearch2 {
274 my $self = shift;
276 my $dbh = $self->features_db;
278 $dbh->do('ALTER TABLE fattribute_to_feature ADD COLUMN idxFTI tsvector')
279 or $self->throw('adding FTI column to f_to_f failed');
281 $dbh->do('ALTER TABLE fgroup ADD COLUMN idxFTI tsvector')
282 or $self->throw('adding FTI column to fgroup failed');
284 $self->update_TSearch2();
286 return;
289 =head2 update_TSearch2
291 Title : update_TSearch2
292 Function: Updates TSearch2 columns
293 Usage : $db->update_TSearch2
294 Status : public
296 =cut
299 sub update_TSearch2 {
300 my $self = shift;
302 my $dbh = $self->features_db;
304 $self->warn('updating full text column; this may take a very long time...');
305 $dbh->do("UPDATE fattribute_to_feature "
306 ."SET idxFTI= to_tsvector('default', fattribute_value) "
307 ."WHERE idxFTI IS NULL")
308 or $self->throw('updating fti column failed');
309 $dbh->do("UPDATE fgroup "
310 ."SET idxFTI= to_tsvector('default', gname) "
311 ."WHERE idxFTI IS NULL")
312 or $self->throw('updating fgroup fti column failed');
314 $self->warn('Preliminary optimization of database; this may also take a long time...');
315 $dbh->do('VACUUM FULL ANALYZE')
316 or $self->throw('vacuum failed');
318 $self->warn('Updating full text index; again, this may take a long time');
319 $dbh->do('CREATE INDEX idxFTI_idx ON fattribute_to_feature '
320 .'USING gist(idxFTI)')
321 or $self->warn('creating full text index failed');
322 $dbh->do('CREATE INDEX fgroup_idxFTI_idx ON fgroup '
323 .'USING gist(idxFTI)')
324 or $self->warn('creating fgroup full text index failed');
326 $self->warn('Optimizing database; hopefully, this will not take as long as other steps');
327 $dbh->do('VACUUM FULL ANALYZE');
328 $dbh->do("SELECT set_curcfg('default')");
330 return;
333 =head2 remove_TSearch2
335 Title : remove_TSearch2
336 Function: Removes TSearch2 columns
337 Usage : $db->remove_TSearch2
338 Status : public
340 =cut
342 sub remove_TSearch2 {
343 my $self = shift;
345 my $dbh = $self->features_db;
347 $self->warn('Removing full text search capabilities');
348 $dbh->do('DROP INDEX idxFTI_idx')
349 or $self->throw('dropping full text index failed');
350 $dbh->do('DROP INDEX fgroup_idxFTI_idx')
351 or $self->throw('dropping full text index failed');
353 $dbh->do('ALTER TABLE fattribute_to_feature DROP COLUMN idxFTI')
354 or $self->throw('dropping full text column failed');
355 $dbh->do('ALTER TABLE fgroup DROP COLUMN idxFTI')
356 or $self->throw('dropping full text column failed');
359 return;