tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / DB / GFF / Adaptor / dbi / pg_fts.pm
blobeaa220392f52a2fff3829685b12557452b7a6e93
1 package Bio::DB::GFF::Adaptor::dbi::pg_fts;
3 # $Id$
5 =head1 NAME
7 Bio::DB::GFF::Adaptor::dbi::pg_fts -- Database adaptor for a specific postgres schema with a TSearch2 implementation
9 =head1 SYNOPSIS
11 #create new GFF database connection
12 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::pg_fts',
13 -dsn => 'dbi:Pg:dbname=worm');
15 #add full text indexing 'stuff'
16 #assumes that TSearch2 is available to PostgreSQL
17 #this will take a VERY long time for a reasonably large database
18 $db->install_TSearch2();
20 ...some time later...
21 #we don't like full text searching...
22 $db->remove_TSearch2();
24 =head1 DESCRIPTION
26 This adaptor is based on Bio::DB::GFF::Adaptor::dbi::pg but it implements
27 the TSearch2 PostgreSQL contrib module for fast full text searching. To
28 use this module with your PostgreSQL GFF database, you need to make
29 TSearch2 available in the database.
31 To use this adaptor, follow these steps:
33 =over
35 =item Install TSearch2 contrib module for Pg
37 Can be as easy as `sudo yum install postgresql-contrib`, or you may
38 need to recompile PostgreSQL to include it. See
39 L<http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/tsearch-V2-intro.html>
40 for more details
42 =item Load the TSearch2 functions to you database
44 % cat tsearch2.sql | psql <your database>
46 =item Load your data using the pg adaptor:
48 % bp_pg_bulk_load_gff.pl -c -d yeast saccharomyces_cerevisiae.gff
52 % bp_load_gff.pl -c -d yeast -a dbi::pg saccharomyces_cerevisiae.gff
54 =item Add GFF/TSearch2 specific modifications
56 Execute a perl script like this one:
58 #!/usr/bin/perl -w
59 use strict;
61 use Bio::DB::GFF;
63 my $db = Bio::DB::GFF->new(
64 -adaptor => 'dbi::pg_fts',
65 -dsn => 'dbi:Pg:dbname=yeast',
66 -user => 'scott',
69 print "Installing TSearch2 columns...\n";
71 $db->install_TSearch2();
73 print "Done\n";
75 =back
77 Note that this last step will take a long time. For a S. cerevisiae
78 database with 15K rows, it took over an hour on my laptop, and
79 with a C. elegans database (~10 million rows) it took well over a day.
81 If at some point you add more data you your database, you need to run
82 a similar script to the one above, only executing the update_TSearch2()
83 method. Finally, if you want to remove the TSearch2 columns from your
84 database and go back to using the pg adaptor, you can execute a script
85 like the one above, only executing the remove_TSearch2() method.
87 =head1 NOTES ABOUT TSearch2 SEARCHING
89 You should know a few things about how searching with TSearch2 works in
90 the GBrowse enviroment:
92 =over
94 =item 1
96 TSearch2 does not do wild cards, so you should encourage your users not
97 to use them. If wild cards are used, the adaptor will fall back on
98 an ILIKE search, which will be much slower.
100 =item 2
102 However, TSearch2 does do 'word stemming'. That is, if you search
103 for 'copy', it will find 'copy', 'copies', and 'copied'.
105 =item 3
107 TSearch2 does not do phrase searching; all of the terms in the
108 search string are ANDed together.
110 =back
112 =head1 ACKNOWLEDGEMENTS
114 Special thanks to Russell Smithies and Paul Smale at AgResearch in
115 New Zealand for giving me their recipe for doing full text indexing
116 in a GFF database.
118 =head1 BUGS
120 Please report bugs to the BioPerl and/or GBrowse mailing lists
121 (L<mailto:bioperl-l@lists.open-bio.org> and L<mailto:gmod-gbrowse@lists.sourceforge.net>
122 respectively).
124 =head1 SEE ALSO
126 Please see L<Bio::DB::GFF::Adaptor::dbi::pg> for more information
127 about tuning your PostgreSQL server for GFF data, and for general
128 information about GFF database access, see L<Bio::DB::GFF>.
130 =head1 AUTHOR
132 Scott Cain, cain@cshl.edu
134 =head1 APPENDIX
136 =cut
138 # a simple postgres adaptor
139 use strict;
140 use Bio::DB::GFF::Adaptor::dbi;
141 use base qw(Bio::DB::GFF::Adaptor::dbi::pg);
143 use constant FULLTEXTSEARCH => <<END;
144 SELECT distinct gclass,gname,fattribute_value,fmethod,fsource
145 FROM fgroup,fattribute_to_feature,fdata,ftype
146 WHERE fgroup.gid=fdata.gid
147 AND fdata.fid=fattribute_to_feature.fid
148 AND fdata.ftypeid=ftype.ftypeid
149 AND (fattribute_to_feature.idxfti @@ to_tsquery('default', ?))
153 use constant FULLTEXTWILDCARD => <<END;
154 SELECT distinct gclass,gname,fattribute_value,fmethod,fsource
155 FROM fgroup,fattribute_to_feature,fdata,ftype
156 WHERE fgroup.gid=fdata.gid
157 AND fdata.fid=fattribute_to_feature.fid
158 AND fdata.ftypeid=ftype.ftypeid
159 AND lower(fattribute_to_feature.fattribute_value) LIKE lower(?)
163 sub new {
164 my $class = shift;
165 my $self = $class->SUPER::new(@_);
166 return $self;
169 =head2 search_notes
171 Title : search_notes
172 Usage : @search_results = $db->search_notes("full text string",$limit)
173 Function: Search the notes for a text string, using PostgreSQL TSearch2
174 Returns : array of results
175 Args : full text search string, and an optional row limit
176 Status : public
178 This is based on the mysql-specific method that makes use of the TSearch2
179 functionality in PosgreSQL's contrib directory. Given a search string,
180 it performs a full-text search of the notes table and returns an array
181 of results. Each row of the returned array is a arrayref containing
182 the following fields:
184 column 1 A Bio::DB::GFF::Featname object, for passing to segment()
185 column 2 The text of the note
186 column 3 A relevance score.
188 =cut
190 sub search_notes {
191 my $self = shift;
192 my ($search_string,$limit) = @_;
194 my @terms = split /\s+/, $search_string;
196 my $sth;
197 if ($search_string =~ /\*/) {
198 $search_string =~ tr/*/%/s;
199 my $query = FULLTEXTWILDCARD;
200 $query .= " limit $limit" if defined $limit;
201 $sth = $self->dbh->do_query($query,$search_string);
203 elsif (@terms == 1) {
204 my $query = FULLTEXTSEARCH;
205 $query .= " limit $limit" if defined $limit;
206 $sth = $self->dbh->do_query($query,$search_string);
208 else {
209 my $query = FULLTEXTSEARCH;
210 my $andstring = join (' & ', @terms);
211 # $query .= qq{ AND (fattribute_to_feature.fattribute_value ILIKE '\%$search_string%')};
212 $query .= " LIMIT $limit" if defined $limit;
213 $sth = $self->dbh->do_query($query,$andstring);
216 my @results;
217 while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) {
219 next unless $class && $name; # sorry, ignore NULL objects
220 my $featname = Bio::DB::GFF::Featname->new($class=>$name);
221 my $type = Bio::DB::GFF::Typename->new($method,$source);
222 push @results,[$featname,$note,0,$type]; #gbrowse expects a score, but
223 #pg doesn't give one, thus the 0
226 return @results;
229 =head2 make_features_by_name_where_part
231 Title : make_features_by_name_where_part
232 Function: constructs a TSearch2-compliant WHERE clause for a name search
233 Status : protected
235 =cut
237 #need a make_features_by_name_where_part method to override pg
238 sub make_features_by_name_where_part {
239 my $self = shift;
240 my ($class,$name) = @_;
242 my @terms = split /\s+/, $name;
244 if ($name =~ /\*/) {
245 $name =~ tr/*/%/s;
246 return ("fgroup.gclass=? AND lower(fgroup.gname) LIKE lower(?)",$class,$name);
248 else {
249 my $where_str = "fgroup.gclass=? AND (fgroup.idxfti @@ to_tsquery('default', ?)) ";
250 if (@terms == 1) {
251 return ($where_str,$class,$name);
253 else {
254 my $andstring = join (' & ', @terms);
255 # $where_str .= qq{ AND (fgroup.gname ILIKE '\%$name%')};
256 return ($where_str,$class,$andstring);
261 =head2 install_TSearch2
263 Title : install_TSearch2
264 Function: installs schema modifications for use with TSearch2
265 Usage : $db->install_TSearch2
266 Status : public
268 =cut
271 #needs method for installing TSearch2 (does that mean that the SQL for
272 #creating the tables and functions should go in here? That would be
273 #the safest and easiest thing to do
274 sub install_TSearch2 {
275 my $self = shift;
277 my $dbh = $self->features_db;
279 $dbh->do('ALTER TABLE fattribute_to_feature ADD COLUMN idxFTI tsvector')
280 or $self->throw('adding FTI column to f_to_f failed');
282 $dbh->do('ALTER TABLE fgroup ADD COLUMN idxFTI tsvector')
283 or $self->throw('adding FTI column to fgroup failed');
285 $self->update_TSearch2();
287 return;
290 =head2 update_TSearch2
292 Title : update_TSearch2
293 Function: Updates TSearch2 columns
294 Usage : $db->update_TSearch2
295 Status : public
297 =cut
300 sub update_TSearch2 {
301 my $self = shift;
303 my $dbh = $self->features_db;
305 $self->warn('updating full text column; this may take a very long time...');
306 $dbh->do("UPDATE fattribute_to_feature "
307 ."SET idxFTI= to_tsvector('default', fattribute_value) "
308 ."WHERE idxFTI IS NULL")
309 or $self->throw('updating fti column failed');
310 $dbh->do("UPDATE fgroup "
311 ."SET idxFTI= to_tsvector('default', gname) "
312 ."WHERE idxFTI IS NULL")
313 or $self->throw('updating fgroup fti column failed');
315 $self->warn('Preliminary optimization of database; this may also take a long time...');
316 $dbh->do('VACUUM FULL ANALYZE')
317 or $self->throw('vacuum failed');
319 $self->warn('Updating full text index; again, this may take a long time');
320 $dbh->do('CREATE INDEX idxFTI_idx ON fattribute_to_feature '
321 .'USING gist(idxFTI)')
322 or $self->warn('creating full text index failed');
323 $dbh->do('CREATE INDEX fgroup_idxFTI_idx ON fgroup '
324 .'USING gist(idxFTI)')
325 or $self->warn('creating fgroup full text index failed');
327 $self->warn('Optimizing database; hopefully, this will not take as long as other steps');
328 $dbh->do('VACUUM FULL ANALYZE');
329 $dbh->do("SELECT set_curcfg('default')");
331 return;
334 =head2 remove_TSearch2
336 Title : remove_TSearch2
337 Function: Removes TSearch2 columns
338 Usage : $db->remove_TSearch2
339 Status : public
341 =cut
343 sub remove_TSearch2 {
344 my $self = shift;
346 my $dbh = $self->features_db;
348 $self->warn('Removing full text search capabilities');
349 $dbh->do('DROP INDEX idxFTI_idx')
350 or $self->throw('dropping full text index failed');
351 $dbh->do('DROP INDEX fgroup_idxFTI_idx')
352 or $self->throw('dropping full text index failed');
354 $dbh->do('ALTER TABLE fattribute_to_feature DROP COLUMN idxFTI')
355 or $self->throw('dropping full text column failed');
356 $dbh->do('ALTER TABLE fgroup DROP COLUMN idxFTI')
357 or $self->throw('dropping full text column failed');
360 return;