lib/Bio/DB/GFF/Adaptor/dbi/oracle.pm

   1 package Bio::DB::GFF::Adaptor::dbi::oracle;
   2
   3 =head1 NAME
   4
   5 Bio::DB::GFF::Adaptor::dbi::oracle -- Database adaptor for a specific oracle schema
   6
   7 =head1 SYNOPSIS
   8
   9 See L<Bio::DB::GFF>
  10
  11 =cut
  12
  13 # a simple oracle adaptor
  14 use strict;
  15 #use Bio::DB::GFF::Adaptor::dbi::mysql;
  16 #use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
  17 use Bio::DB::GFF::Util::Binning;
  18 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
  19 use base qw(Bio::DB::GFF::Adaptor::dbi);
  20
  21 use constant MAX_SEGMENT => 100_000_000;  # the largest a segment can get
  22 use constant DEFAULT_CHUNK => 2000;
  23
  24 use constant GETSEQCOORDS =><<END;
  25 SELECT fref,
  26        NVL(gclass,'Sequence'),
  27        min(fstart),
  28        max(fstop),
  29        fstrand,
  30        gname
  31   FROM fdata,fgroup
  32   WHERE fgroup.gname=?
  33     AND fgroup.gclass=?
  34     AND fgroup.gid=fdata.gid
  35     GROUP BY fref,fstrand,gclass,gname
  36 END
  37 ;
  38
  39 use constant GETALIASCOORDS =><<END;
  40 SELECT fref,
  41        NVL(gclass,'Sequence'),
  42        min(fstart),
  43        max(fstop),
  44        fstrand,
  45        gname
  46   FROM fdata,fgroup,fattribute,fattribute_to_feature
  47   WHERE fattribute_to_feature.fattribute_value=?
  48     AND fgroup.gclass=?
  49     AND fgroup.gid=fdata.gid
  50     AND fattribute.fattribute_name='Alias'
  51     AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
  52     AND fattribute_to_feature.fid=fdata.fid
  53     GROUP BY fref,fstrand,gclass,gname
  54 END
  55 ;
  56
  57 use constant GETALIASLIKE =><<END;
  58 SELECT fref,
  59        NVL(gclass,'Sequence'),
  60        min(fstart),
  61        max(fstop),
  62        fstrand,
  63        gname
  64   FROM fdata,fgroup,fattribute,fattribute_to_feature
  65   WHERE fattribute_to_feature.fattribute_value LIKE ?
  66     AND fgroup.gclass=?
  67     AND fgroup.gid=fdata.gid
  68     AND fattribute.fattribute_name='Alias'
  69     AND fattribute_to_feature.fattribute_id=fattribute.fattribute_id
  70     AND fattribute_to_feature.fid=fdata.fid
  71     GROUP BY fref,fstrand,gname
  72 END
  73 ;
  74
  75
  76 use constant GETFORCEDSEQCOORDS =><<END;
  77 SELECT fref,
  78        NVL(gclass,'Sequence'),
  79        min(fstart),
  80        max(fstop),
  81        fstrand
  82   FROM fdata,fgroup
  83   WHERE fgroup.gname=?
  84     AND fgroup.gclass=?
  85     AND fdata.fref=?
  86     AND fgroup.gid=fdata.gid
  87     GROUP BY fref,fstrand,gclass
  88 END
  89 ;
  90
  91 ########################
  92 # moved from mysqlopt.pm
  93 ########################
  94
  95 # this is the largest that any reference sequence can be (100 megabases)
  96 use constant MAX_BIN    => 100_000_000;
  97
  98 # this is the smallest bin (1 K)
  99 use constant MIN_BIN    => 1000;
 100
 101 # size of range over which it is faster to force mysql to use the range for indexing
 102 use constant STRAIGHT_JOIN_LIMIT => 200_000;
 103
 104 ##############################################################################
 105
 106 =head1 DESCRIPTION
 107
 108 This adaptor implements a specific oracle database schema that is
 109 compatible with Bio::DB::GFF.  It inherits from
 110 Bio::DB::GFF::Adaptor::dbi, which itself inherits from Bio::DB::GFF.
 111
 112 The schema uses several tables:
 113
 114 =over 4
 115
 116 =item fdata
 117
 118 This is the feature data table.  Its columns are:
 119
 120     fid            feature ID (integer)
 121     fref           reference sequence name (string)
 122     fstart         start position relative to reference (integer)
 123     fstop          stop position relative to reference (integer)
 124     ftypeid        feature type ID (integer)
 125     fscore         feature score (float); may be null
 126     fstrand        strand; one of "+" or "-"; may be null
 127     fphase         phase; one of 0, 1 or 2; may be null
 128     gid            group ID (integer)
 129     ftarget_start  for similarity features, the target start position (integer)
 130     ftarget_stop   for similarity features, the target stop position (integer)
 131
 132 Note that it would be desirable to normalize the reference sequence
 133 name, since there are usually many features that share the same
 134 reference feature.  However, in the current schema, query performance
 135 suffers dramatically when this additional join is added.
 136
 137 =item fgroup
 138
 139 This is the group table. There is one row for each group.  Columns:
 140
 141     gid       the group ID (integer)
 142     gclass    the class of the group (string)
 143     gname     the name of the group (string)
 144
 145 The group table serves multiple purposes.  As you might expect, it is
 146 used to cluster features that logically belong together, such as the
 147 multiple exons of the same transcript.  It is also used to assign a
 148 name and class to a singleton feature.  Finally, the group table is
 149 used to identify the target of a similarity hit.  This is consistent
 150 with the way in which the group field is used in the GFF version 2
 151 format.
 152
 153 The fgroup.gid field joins with the fdata.gid field.
 154
 155 Examples:
 156
 157   sql> select * from fgroup where gname='sjj_2L52.1';
 158   +-------+-------------+------------+
 159   | gid   | gclass      | gname      |
 160   +-------+-------------+------------+
 161   | 69736 | PCR_product | sjj_2L52.1 |
 162   +-------+-------------+------------+
 163   1 row in set (0.70 sec)
 164
 165   sql> select fref,fstart,fstop from fdata,fgroup
 166             where gclass='PCR_product' and gname = 'sjj_2L52.1'
 167                   and fdata.gid=fgroup.gid;
 168   +---------------+--------+-------+
 169   | fref          | fstart | fstop |
 170   +---------------+--------+-------+
 171   | CHROMOSOME_II |   1586 |  2355 |
 172   +---------------+--------+-------+
 173   1 row in set (0.03 sec)
 174
 175 =item ftype
 176
 177 This table contains the feature types, one per row.  Columns are:
 178
 179     ftypeid      the feature type ID (integer)
 180     fmethod      the feature type method name (string)
 181     fsource      the feature type source name (string)
 182
 183 The ftype.ftypeid field joins with the fdata.ftypeid field.  Example:
 184
 185   sql> select fref,fstart,fstop,fmethod,fsource from fdata,fgroup,ftype
 186          where gclass='PCR_product'
 187                and gname = 'sjj_2L52.1'
 188                and fdata.gid=fgroup.gid
 189                and fdata.ftypeid=ftype.ftypeid;
 190   +---------------+--------+-------+-------------+-----------+
 191   | fref          | fstart | fstop | fmethod     | fsource   |
 192   +---------------+--------+-------+-------------+-----------+
 193   | CHROMOSOME_II |   1586 |  2355 | PCR_product | GenePairs |
 194   +---------------+--------+-------+-------------+-----------+
 195   1 row in set (0.08 sec)
 196
 197 =item fdna
 198
 199 This table holds the raw DNA of the reference sequences.  It has three
 200 columns:
 201
 202     fref          reference sequence name (string)
 203     foffset       offset of this sequence
 204     fdna          the DNA sequence (longblob)
 205
 206 To overcome problems loading large blobs, DNA is automatically
 207 fragmented into multiple segments when loading, and the position of
 208 each segment is stored in foffset.  The fragment size is controlled by
 209 the -clump_size argument during initialization.
 210
 211 =item fattribute_to_feature
 212
 213 This table holds "attributes", which are tag/value pairs stuffed into
 214 the GFF line.  The first tag/value pair is treated as the group, and
 215 anything else is treated as an attribute (weird, huh?).
 216
 217  CHR_I assembly_tag Finished     2032 2036 . + . Note "Right: cTel33B"
 218  CHR_I assembly_tag Polymorphism 668  668  . + . Note "A->C in cTel33B"
 219
 220 The columns of this table are:
 221
 222     fid                 feature ID (integer)
 223     fattribute_id       ID of the attribute (integer)
 224     fattribute_value    text of the attribute (text)
 225
 226 The fdata.fid column joins with fattribute_to_feature.fid.
 227
 228 =item fattribute
 229
 230 This table holds the normalized names of the attributes.  Fields are:
 231
 232   fattribute_id      ID of the attribute (integer)
 233   fattribute_name    Name of the attribute (varchar)
 234
 235 =back
 236
 237 =head2 Data Loading Methods
 238
 239 In addition to implementing the abstract SQL-generating methods of
 240 Bio::DB::GFF::Adaptor::dbi, this module also implements the data
 241 loading functionality of Bio::DB::GFF.
 242
 243 =cut
 244
 245
 246 =head2 new
 247
 248  Title   : new
 249  Usage   : $db = Bio::DB::GFF->new(@args)
 250  Function: create a new adaptor
 251  Returns : a Bio::DB::GFF object
 252  Args    : see below
 253  Status  : Public
 254
 255 The new constructor is identical to the "dbi" adaptor's new() method,
 256 except that the prefix "dbi:oracle" is added to the database DSN identifier
 257 automatically if it is not there already.
 258
 259   Argument       Description
 260   --------       -----------
 261
 262   -dsn           the DBI data source, e.g. 'dbi:mysql:ens0040' or "ens0040"
 263
 264   -user          username for authentication
 265
 266   -pass          the password for authentication
 267
 268 =cut
 269
 270 #'
 271
 272 sub new {
 273   my $class = shift;
 274   my ($dsn,$other) = rearrange([
 275                                 [qw(FEATUREDB DB DSN)],
 276                                ],@_);
 277   $dsn = "dbi:Oracle:$dsn" if !ref($dsn) && $dsn !~ /^(dbi|DBI):/;
 278   my $self = $class->SUPER::new(-dsn=>$dsn,%$other);
 279   $self;
 280 }
 281
 282 =head2 schema
 283
 284  Title   : schema
 285  Usage   : $schema = $db->schema
 286  Function: return the CREATE script for the schema
 287  Returns : a list of CREATE statemetns
 288  Args    : none
 289  Status  : protected
 290
 291 This method returns a list containing the various CREATE statements
 292 needed to initialize the database tables.
 293
 294 =cut
 295
 296 sub schema {
 297   my %schema = (
 298                 fdata =>{
 299 table=> q{
 300 create table fdata (
 301   fid INTEGER  NOT NULL,
 302   fref VARCHAR(100) DEFAULT '' NOT NULL,
 303   fstart INTEGER DEFAULT '0' NOT NULL,
 304   fstop INTEGER DEFAULT '0' NOT NULL,
 305   fbin NUMBER DEFAULT '0.000000' NOT NULL,
 306   ftypeid INTEGER DEFAULT '0' NOT NULL,
 307   fscore NUMBER  ,
 308   fstrand VARCHAR2(3)   CHECK (fstrand IN ('+','-')),
 309   fphase VARCHAR2(3)   CHECK (fphase IN ('0','1','2')),
 310   gid INTEGER DEFAULT '0' NOT NULL,
 311   ftarget_start INTEGER  ,
 312   ftarget_stop INTEGER  ,
 313   CONSTRAINT fdata_pk PRIMARY KEY (fid)
 314 )
 315 }, # fdata table
 316
 317 index=>{
 318                 fdata_fref_idx => q{
 319 CREATE UNIQUE INDEX fdata_fref_idx ON fdata (fref,fbin,fstart,fstop,ftypeid,gid)
 320 },
 321
 322                 fdata_ftypeid_idx => q{
 323 CREATE INDEX fdata_ftypeid_idx ON fdata (ftypeid)
 324 },
 325
 326                 fdata_gid_idx => q{
 327 CREATE  INDEX fdata_gid_idx ON fdata (gid)
 328 }
 329          }, # fdata indexes
 330
 331 sequence=> {
 332                 fdata_fid_sq => q{
 333 CREATE SEQUENCE fdata_fid_sq START WITH 1
 334 }
 335             }, # fdata sequences
 336
 337 trigger=> {
 338                 fdata_fid_ai => q{
 339 CREATE OR REPLACE TRIGGER fdata_fid_ai
 340 BEFORE INSERT ON fdata
 341 FOR EACH ROW WHEN (new.fid IS NULL OR new.fid = 0)
 342 BEGIN
 343    SELECT fdata_fid_sq.nextval INTO :new.fid FROM dual;
 344 END;
 345 }
 346            }# fdata triggers
 347
 348 }, # fdata
 349
 350
 351
 352                 fgroup => {
 353 table => q{
 354 CREATE TABLE fgroup (
 355   gid INTEGER  NOT NULL,
 356   gclass VARCHAR(100)  ,
 357   gname VARCHAR(100)  ,
 358   CONSTRAINT fgroup_pk PRIMARY KEY (gid)
 359 )
 360 }, # fgroup table
 361
 362 index => {
 363                 fgroup_gclass_idx => q{
 364 CREATE UNIQUE INDEX fgroup_gclass_idx ON fgroup (gclass,gname)
 365 }
 366            }, # fgroup indexes
 367
 368 sequence => {
 369
 370                 fgroup_gid_sq => q{
 371 CREATE SEQUENCE fgroup_gid_sq START WITH 1
 372 }
 373              }, # fgroup sequences
 374
 375
 376 trigger => {
 377                 fgroup_gid_ai => q{
 378 CREATE OR REPLACE TRIGGER fgroup_gid_ai
 379 BEFORE INSERT ON fgroup
 380 FOR EACH ROW WHEN (new.gid IS NULL OR new.gid = 0)
 381 BEGIN
 382    SELECT fgroup_gid_sq.nextval INTO :new.gid FROM dual;
 383 END;
 384 }
 385             } # fgroup triggers
 386
 387 }, # fgroup
 388
 389                 ftype => {
 390 table => q{
 391 CREATE TABLE ftype (
 392   ftypeid INTEGER  NOT NULL,
 393   fmethod VARCHAR(100) DEFAULT '' NOT NULL,
 394   fsource VARCHAR(100),
 395   CONSTRAINT ftype_pk PRIMARY KEY (ftypeid)
 396 )
 397 }, # ftype table
 398
 399 index => {
 400                 ftype_fmethod_idx => q{
 401 CREATE  INDEX ftype_fmethod_idx ON ftype (fmethod)
 402 },
 403
 404                 ftype_fsource_idx => q{
 405 CREATE  INDEX ftype_fsource_idx ON ftype (fsource)
 406 },
 407
 408                 ftype_ftype_idx => q{
 409 CREATE UNIQUE INDEX ftype_ftype_idx ON ftype (fmethod,fsource)
 410 }
 411            }, # ftype indexes
 412
 413 sequence => {
 414                 ftype_ftypeid_sq => q{
 415 CREATE SEQUENCE ftype_ftypeid_sq START WITH 1
 416 }
 417              }, #ftype sequences
 418
 419 trigger => {
 420                 ftype_ftypeid_ai => q{
 421 CREATE OR REPLACE TRIGGER ftype_ftypeid_ai
 422 BEFORE INSERT ON ftype
 423 FOR EACH ROW WHEN (new.ftypeid IS NULL OR new.ftypeid = 0)
 424 BEGIN
 425    SELECT ftype_ftypeid_sq.nextval INTO :new.ftypeid FROM dual;
 426 END;
 427 }
 428             } #ftype triggers
 429 }, # ftype
 430
 431
 432          fdna => {
 433 table => q{
 434 CREATE TABLE fdna (
 435   fref VARCHAR(100) DEFAULT '' NOT NULL,
 436   foffset INTEGER DEFAULT '0' NOT NULL,
 437   fdna LONG         /* LONGBLOB */  ,
 438   CONSTRAINT fdna_pk PRIMARY KEY (fref,foffset)
 439 )
 440 } #fdna table
 441                  }, #fdna
 442
 443         fmeta => {
 444 table => q{
 445 CREATE TABLE fmeta (
 446   fname VARCHAR(255) DEFAULT '' NOT NULL,
 447   fvalue VARCHAR(255) DEFAULT '' NOT NULL,
 448   CONSTRAINT fmeta_pk PRIMARY KEY (fname)
 449 )
 450 } # fmeta table
 451                  }, # fmeta
 452
 453
 454        fattribute => {
 455 table => q{
 456 CREATE TABLE fattribute (
 457   fattribute_id INTEGER  NOT NULL,
 458   fattribute_name VARCHAR(255) DEFAULT '' NOT NULL,
 459   CONSTRAINT fattribute_pk PRIMARY KEY (fattribute_id)
 460 )
 461 }, # fattribute table
 462
 463 sequence=> {
 464        fattribute_fattribute_id_sq => q{
 465 CREATE SEQUENCE fattribute_fattribute_id_sq START WITH 1
 466 }
 467             }, # fattribute sequences
 468
 469 trigger => {
 470        fattribute_fattribute_id_ai => q{
 471 CREATE OR REPLACE TRIGGER fattribute_fattribute_id_ai
 472 BEFORE INSERT ON fattribute
 473 FOR EACH ROW WHEN (new.fattribute_id IS NULL OR new.fattribute_id = 0)
 474 BEGIN
 475    SELECT fattribute_fattribute_id_sq.nextval INTO :new.fattribute_id FROM dual;
 476 END;
 477 }
 478             } # fattribute triggers
 479 }, # fattribute
 480
 481        fattribute_to_feature => {
 482 table => q{
 483 CREATE TABLE fattribute_to_feature (
 484   fid INTEGER DEFAULT '0' NOT NULL,
 485   fattribute_id INTEGER DEFAULT '0' NOT NULL,
 486   fattribute_value VARCHAR2(255) /* TEXT */
 487 )
 488 }, # fattribute_to_feature table
 489
 490 index => {
 491        fattribute_to_feature_fid => q{
 492 CREATE  INDEX fattribute_to_feature_fid ON fattribute_to_feature (fid,fattribute_id)
 493 }
 494            } # fattribute_to_feature indexes
 495 }, # fattribute_to_feature
 496
 497        finterval_stats => {
 498 table=> q{
 499 CREATE TABLE "finterval_stats" (
 500    "ftypeid"          integer DEFAULT '0' NOT NULL,
 501    "fref"             VARCHAR(100) DEFAULT '' NOT NULL,
 502    "fbin"             integer DEFAULT '0' NOT NULL,
 503    "fcum_count"       integer DEFAULT '0' NOT NULL,
 504    CONSTRAINT finterval_stats_pk PRIMARY KEY (ftypeid,fref,fbin)
 505 )
 506 } # finterval_stats table
 507 },# finterval_stats
 508
 509 );
 510   return \%schema;
 511 }
 512
 513
 514 =head2 do_initialize
 515
 516  Title   : do_initialize
 517  Usage   : $success = $db->do_initialize($drop_all)
 518  Function: initialize the database
 519  Returns : a boolean indicating the success of the operation
 520  Args    : a boolean indicating whether to delete existing data
 521  Status  : protected
 522
 523 This method will load the schema into the database.  If $drop_all is
 524 true, then any existing data in the tables known to the schema will be
 525 deleted.
 526
 527 Internally, this method calls schema() to get the schema data.
 528
 529 =cut
 530
 531 # Create the schema from scratch.
 532 # You will need create privileges for this.
 533 #sub do_initialize {
 534 #  my $self = shift;
 535 #  my $erase = shift;
 536 #  $self->drop_all if $erase;
 537
 538 #  my $dbh = $self->features_db;
 539 #  my $schema = $self->schema;
 540
 541 #  foreach my $table_name(keys %$schema) {
 542 #    my $create_table_stmt = $$schema{$table_name}{table} ;
 543 #    $dbh->do($create_table_stmt) ||  warn $dbh->errstr;
 544 #  }
 545 #  1;
 546 #}
 547
 548
 549
 550 =head2 drop_all
 551
 552  Title   : drop_all
 553  Usage   : $db->drop_all
 554  Function: empty the database
 555  Returns : void
 556  Args    : none
 557  Status  : protected
 558
 559 This method drops the tables known to this module.  Internally it
 560 calls the abstract tables() method.
 561
 562 =cut
 563
 564 # Drop all the GFF tables -- dangerous!
 565 #sub drop_all {
 566 #  my $self = shift;
 567 #  my $dbh = $self->features_db;
 568 #  local $dbh->{PrintError} = 0;
 569 #  foreach ($self->tables) {
 570 #    $dbh->do("drop table $_");
 571 #  }
 572 #}
 573
 574
 575
 576
 577
 578
 579 =head2 setup_load
 580
 581  Title   : setup_load
 582  Usage   : $db->setup_load
 583  Function: called before load_gff_line()
 584  Returns : void
 585  Args    : none
 586  Status  : protected
 587
 588 This method performs schema-specific initialization prior to loading a
 589 set of GFF records.  It prepares a set of DBI statement handlers to be
 590 used in loading the data.
 591
 592 =cut
 593
 594 sub setup_load {
 595   my $self      = shift;
 596   my $schema = $self->schema;
 597
 598   my $dbh = $self->features_db;
 599
 600   if ($self->lock_on_load) {
 601     my @tables = map { "$_ WRITE"} $self->tables;
 602     my $tables = join ', ',@tables;
 603     $dbh->do("LOCK TABLES $tables");
 604   }
 605
 606   my $lookup_type = $dbh->prepare_delayed('SELECT ftypeid FROM ftype WHERE fmethod=? AND fsource=?');
 607   my $insert_type = $dbh->prepare_delayed('INSERT INTO ftype (fmethod,fsource) VALUES (?,?)');
 608   my $sequence_type = (keys %{$schema->{ftype}{sequence}})[0];
 609   my $insertid_type = $dbh->prepare_delayed("SELECT $sequence_type.CURRVAL FROM dual");
 610
 611   my $lookup_group = $dbh->prepare_delayed('SELECT gid FROM fgroup WHERE gname=? AND gclass=?');
 612   my $insert_group = $dbh->prepare_delayed('INSERT INTO fgroup (gname,gclass) VALUES (?,?)');
 613   my $sequence_group = (keys %{$schema->{fgroup}{sequence}})[0];
 614   my $insertid_group = $dbh->prepare_delayed("SELECT $sequence_group.CURRVAL FROM dual");
 615
 616   my $lookup_attribute = $dbh->prepare_delayed('SELECT fattribute_id FROM fattribute WHERE fattribute_name=?');
 617   my $insert_attribute = $dbh->prepare_delayed('INSERT INTO fattribute (fattribute_name) VALUES (?)');
 618   my $sequence_attribute = (keys %{$schema->{fattribute}{sequence}})[0];
 619   my $insertid_attribute = $dbh->prepare_delayed("SELECT $sequence_attribute.CURRVAL FROM dual");
 620
 621   my $insert_attribute_value = $dbh->prepare_delayed('INSERT INTO fattribute_to_feature (fid,fattribute_id,fattribute_value) VALUES (?,?,?)');
 622
 623   my $insert_data  = $dbh->prepare_delayed(<<END);
 624 INSERT INTO fdata (fref,fstart,fstop,fbin,ftypeid,fscore,
 625                    fstrand,fphase,gid,ftarget_start,ftarget_stop)
 626        VALUES(?,?,?,?,?,?,?,?,?,?,?)
 627 END
 628 ;
 629   my $delete_existing_data = $dbh->prepare_delayed('DELETE FROM fdata WHERE fref=? AND fstart=? AND fstop=? AND fbin=? AND ftypeid=? AND GID=?');
 630   my $sequence_data =  (keys %{$schema->{fdata}{sequence}})[0];
 631   my $insertid_data = $dbh->prepare_delayed("SELECT $sequence_data.CURRVAL FROM dual");
 632
 633
 634
 635   $self->{load_stuff}{sth}{lookup_ftype}     = $lookup_type;
 636   $self->{load_stuff}{sth}{insert_ftype}     = $insert_type;
 637   $self->{load_stuff}{sth}{insertid_ftype}   = $insertid_type;
 638   $self->{load_stuff}{sth}{lookup_fgroup}    = $lookup_group;
 639   $self->{load_stuff}{sth}{insert_fgroup}    = $insert_group;
 640   $self->{load_stuff}{sth}{insertid_fgroup}  = $insertid_group;
 641   $self->{load_stuff}{sth}{insert_fdata}     = $insert_data;
 642   $self->{load_stuff}{sth}{insertid_fdata}   = $insertid_data;
 643   $self->{load_stuff}{sth}{delete_existing_fdata} = $delete_existing_data;
 644   $self->{load_stuff}{sth}{lookup_fattribute} = $lookup_attribute;
 645   $self->{load_stuff}{sth}{insert_fattribute} = $insert_attribute;
 646   $self->{load_stuff}{sth}{insertid_fattribute} = $insertid_attribute;
 647   $self->{load_stuff}{sth}{insert_fattribute_value} = $insert_attribute_value;
 648   $self->{load_stuff}{types}  = {};
 649   $self->{load_stuff}{groups} = {};
 650   $self->{load_stuff}{counter} = 0;
 651 }
 652
 653 =head2 load_gff_line
 654
 655  Title   : load_gff_line
 656  Usage   : $db->load_gff_line($fields)
 657  Function: called to load one parsed line of GFF
 658  Returns : true if successfully inserted
 659  Args    : hashref containing GFF fields
 660  Status  : protected
 661
 662 This method is called once per line of the GFF and passed a series of
 663 parsed data items that are stored into the hashref $fields.  The keys are:
 664
 665  ref          reference sequence
 666  source       annotation source
 667  method       annotation method
 668  start        annotation start
 669  stop         annotation stop
 670  score        annotation score (may be undef)
 671  strand       annotation strand (may be undef)
 672  phase        annotation phase (may be undef)
 673  group_class  class of annotation's group (may be undef)
 674  group_name   ID of annotation's group (may be undef)
 675  target_start start of target of a similarity hit
 676  target_stop  stop of target of a similarity hit
 677  attributes   array reference of attributes, each of which is a [tag=>value] array ref
 678
 679 =cut
 680
 681 sub load_gff_line {
 682   my $self = shift;
 683   my $gff = shift;
 684
 685   if (defined $gff->{phase}){
 686      chomp($gff->{phase});
 687      undef($gff->{phase}) if $gff->{phase} eq '.';
 688    }
 689
 690   if (defined $gff->{strand} && $gff->{strand} eq '.'){undef($gff->{strand})};
 691   if (defined $gff->{score}  && $gff->{score} eq '.'){undef($gff->{score})};
 692
 693   my $s    = $self->{load_stuff};
 694   my $dbh  = $self->features_db;
 695   local $dbh->{PrintError} = 0;
 696
 697   defined(my $typeid  = $self->get_table_id('ftype', $gff->{method} => $gff->{source})) or return;
 698   defined(my $groupid = $self->get_table_id('fgroup',$gff->{gname}  => $gff->{gclass})) or return;
 699
 700   my $bin =  bin($gff->{start},$gff->{stop},$self->min_bin);
 701   my $result = $s->{sth}{insert_fdata}->execute($gff->{ref},
 702                                                $gff->{start},$gff->{stop},$bin,
 703                                                $typeid,
 704                                                $gff->{score},$gff->{strand},$gff->{phase},
 705                                                $groupid,
 706                                                $gff->{tstart},$gff->{tstop});
 707   if (defined ($dbh->errstr)){
 708     print  $dbh->errstr,"\n" ,%$gff,"\n";
 709     if ($dbh->errstr =~ /ORA-02290: check constraint/){
 710       print "PHASE=$gff->{phase}"."===","\n";
 711     }
 712
 713     if ($dbh->errstr =~ /ORA-00001: unique constraint/){
 714       $result = $s->{sth}{delete_existing_fdata}->execute($gff->{ref},
 715                                                            $gff->{start},$gff->{stop},$bin,
 716                                                            $typeid,
 717                                                            $groupid);
 718
 719       print "delete row result=$result\n";
 720       $result = $s->{sth}{insert_fdata}->execute($gff->{ref},
 721                                                $gff->{start},$gff->{stop},$bin,
 722                                                $typeid,
 723                                                $gff->{score},$gff->{strand},$gff->{phase},
 724                                                $groupid,
 725                                                $gff->{tstart},$gff->{tstop});
 726
 727       print "insert row result=$result\n";
 728     }
 729   }
 730   warn $dbh->errstr,"\n" and print "ref=",$gff->{ref}," start=",$gff->{start}," stop=",$gff->{stop}," bin=",$bin," typeid=",$typeid," groupid=",$groupid,"\n"
 731     and return unless $result;
 732
 733   my $fid = $self->insertid($s->{sth},'fdata')
 734     || $self->get_feature_id($gff->{ref},$gff->{start},$gff->{stop},$typeid,$groupid);
 735
 736
 737   # insert attributes
 738
 739   #  print STDERR map {"$fid attribute:". $_->[0]."=".$_->[1]."\n"} @{$gff->{attributes}};
 740
 741   foreach (@{$gff->{attributes}}) {
 742     defined(my $attribute_id = $self->get_table_id('fattribute',$_->[0])) or return;
 743     $s->{sth}{insert_fattribute_value}->execute($fid,$attribute_id,$_->[1]);
 744   }
 745
 746   if ( (++$s->{counter} % 1000) == 0) {
 747     print STDERR "$s->{counter} records loaded...";
 748     print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
 749   }
 750
 751   $fid;
 752 }
 753
 754
 755
 756
 757 =head2 get_table_id
 758
 759  Title   : get_table_id
 760  Usage   : $integer = $db->get_table_id($table,@ids)
 761  Function: get the ID of a group or type
 762  Returns : an integer ID or undef
 763  Args    : none
 764  Status  : private
 765
 766 This internal method is called by load_gff_line to look up the integer
 767 ID of an existing feature type or group.  The arguments are the name
 768 of the table, and two string identifiers.  For feature types, the
 769 identifiers are the method and source.  For groups, the identifiers
 770 are group name and class.
 771
 772 This method requires that a statement handler named I<lookup_$table>,
 773 have been created previously by setup_load().  It is here to overcome
 774 deficiencies in mysql's INSERT syntax.
 775
 776 =cut
 777
 778 #'
 779 # get the object ID from a named table
 780 sub get_table_id {
 781   my $self   = shift;
 782   my $table  = shift;
 783   my @ids    = @_;
 784
 785   # irritating warning for null id
 786   my $id_key;
 787   {
 788     local $^W=0;
 789     $id_key = join ':',@ids;
 790   }
 791
 792   my $s   = $self->{load_stuff};
 793   my $sth = $s->{sth};
 794   my $dbh = $self->features_db;
 795
 796   unless (defined($s->{$table}{$id_key})) {
 797     $sth->{"lookup_$table"}->execute(@ids);
 798     my @result = $sth->{"lookup_$table"}->fetchrow_array;
 799     if (@result > 0) {
 800       $s->{$table}{$id_key} = $result[0];
 801     } else {
 802       $sth->{"insert_$table"}->execute(@ids)
 803         && ($s->{$table}{$id_key} = $self->insertid($sth,$table));
 804         #&& ($s->{$table}{$id_key} = $self->insertid($sth->{"insertid_$table"}));
 805         #&& ($s->{$table}{$id_key} = $sth->{"insert_$table"}->insertid);
 806     }
 807   }
 808
 809   my $id = $s->{$table}{$id_key};
 810   unless (defined $id) {
 811     warn "No $table id for $id_key ",$dbh->errstr," Record skipped.\n";
 812     return;
 813   }
 814   $id;
 815 }
 816
 817 sub insertid {
 818   my $self = shift;
 819   my $sth = shift ;
 820   my $table = shift;
 821
 822   my $insert_id;
 823   if ($sth->{"insertid_$table"}->execute()){
 824      $insert_id = ($sth->{"insertid_$table"}->fetchrow_array)[0];
 825   }
 826   else{
 827     warn "No CURRVAL for SEQUENCE of table $table ",$sth->errstr,"\n";
 828     return;
 829   }
 830   return $insert_id;
 831 }
 832
 833
 834 #sub insertid {
 835 #  my $self = shift;
 836 #  my $insertid_sth = shift ;
 837 #  my $insert_id;
 838 #  if ($insertid_sth->execute){
 839 #     $insert_id = ($insertid_sth->fetchrow_array)[0];
 840 #  }
 841 #  else{
 842 #    warn "No CURRVAL for SEQUENCE  ",$insertid_sth->errstr,"\n";
 843 #    return;
 844 #  }
 845 #  return $insert_id;
 846 #}
 847
 848 sub insert_sequence {
 849   my $self = shift;
 850   my($id,$offset,$seq) = @_;
 851   my $sth = $self->{_insert_sequence}
 852     ||= $self->dbh->prepare_delayed('insert into fdna values (?,?,?)');
 853   $sth->execute($id,$offset,$seq) or $self->throw($sth->errstr);
 854 }
 855
 856 =head2 search_notes
 857
 858  Title   : search_notes
 859  Usage   : @search_results = $db->search_notes("full text search string",$limit)
 860  Function: Search the notes for a text string, using mysql full-text search
 861  Returns : array of results
 862  Args    : full text search string, and an optional row limit
 863  Status  : public
 864
 865 This is a mysql-specific method.  Given a search string, it performs a
 866 full-text search of the notes table and returns an array of results.
 867 Each row of the returned array is a arrayref containing the following fields:
 868
 869   column 1     A Bio::DB::GFF::Featname object, suitable for passing to segment()
 870   column 2     The text of the note
 871   column 3     A relevance score.
 872   column 4     A Bio::DB::GFF::Typename object
 873
 874 =cut
 875
 876 sub search_notes {
 877   my $self = shift;
 878   my ($search_string,$limit) = @_;
 879
 880   $search_string =~ tr/*?//d;
 881
 882   my @words  = $search_string =~ /(\w+)/g;
 883   my $regex  = join '|',@words;
 884   my @searches = map {"fattribute_value LIKE '%${_}%'"} @words;
 885   my $search   = join(' OR ',@searches);
 886
 887   my $query = <<END;
 888 SELECT distinct gclass,gname,fattribute_value,fmethod,fsource
 889   FROM fgroup,fattribute_to_feature,fdata,ftype
 890   WHERE fgroup.gid=fdata.gid
 891      AND fdata.fid=fattribute_to_feature.fid
 892      AND fdata.ftypeid=ftype.ftypeid
 893      AND ($search)
 894 END
 895 ;
 896
 897   my $sth = $self->dbh->do_query($query);
 898   my @results;
 899   while (my ($class,$name,$note,$method,$source) = $sth->fetchrow_array) {
 900      next unless $class && $name;    # sorry, ignore NULL objects
 901      my @matches = $note =~ /($regex)/g;
 902      my $relevance = 10*@matches;
 903      my $featname = Bio::DB::GFF::Featname->new($class=>$name);
 904      my $type     = Bio::DB::GFF::Typename->new($method,$source);
 905      push @results,[$featname,$note,$relevance,$type];
 906      last if $limit && @results >= $limit;
 907   }
 908   @results;
 909 }
 910
 911 =head2 make_meta_set_query
 912
 913  Title   : make_meta_set_query
 914  Usage   : $sql = $db->make_meta_set_query
 915  Function: return SQL fragment for setting a meta parameter
 916  Returns : SQL fragment
 917  Args    : none
 918  Status  : public
 919
 920 By default this does nothing; meta parameters are not stored or
 921 retrieved.
 922
 923 =cut
 924
 925 sub make_meta_set_query {
 926    return 'INSERT INTO fmeta VALUES (?,?)';
 927 }
 928
 929 sub make_classes_query {
 930   my $self = shift;
 931   return 'SELECT DISTINCT gclass FROM fgroup WHERE NOT gclass IS NULL';
 932 }
 933
 934
 935 sub chunk_size {
 936   my $self = shift;
 937   $self->meta('chunk_size') || DEFAULT_CHUNK;
 938 }
 939
 940 sub getseqcoords_query {
 941    my $self = shift;
 942    return GETSEQCOORDS ;
 943 }
 944
 945 sub getaliascoords_query{
 946   my $self = shift;
 947   return GETALIASCOORDS ;
 948 }
 949
 950
 951 sub getforcedseqcoords_query{
 952   my $self = shift;
 953   return GETFORCEDSEQCOORDS ;
 954 }
 955
 956
 957 sub getaliaslike_query{
 958   my $self = shift;
 959   return GETALIASLIKE ;
 960 }
 961
 962
 963 sub make_features_select_part {
 964   my $self = shift;
 965   my $options = shift || {};
 966   my $s;
 967   if (my $b = $options->{bin_width}) {
 968
 969     $s = <<END;
 970 fref,
 971   1+$b*floor(fstart/$b)   as fstart,
 972   $b*(1+floor(fstart/$b)) as fstop,
 973   NVL2(fsource,fmethod||':'||fsource,fmethod),'bin',
 974   count(*) as fscore,
 975   '.','.','bin',
 976   NVL2(fsource , fref||':'||fmethod||':'||fsource , fref||':'||fmethod),
 977   NULL,NULL,NULL,NULL
 978 END
 979 ;
 980   } else {
 981     $s = <<END;
 982 fref,fstart,fstop,fsource,fmethod,fscore,fstrand,fphase,gclass,gname,ftarget_start,ftarget_stop,fdata.fid,fdata.gid
 983 END
 984 ;
 985 }
 986   $s .= ",count(fdata.fid)" if $options->{attributes} && keys %{$options->{attributes}}>1;
 987   $s;
 988 }
 989
 990 sub make_features_from_part_bkup {
 991   my $self = shift;
 992   my $sparse = shift;
 993   my $options = shift || {};
 994   #my $index = $sparse ? ' USE INDEX(ftypeid)': '';
 995   my $index =  '';
 996   return $options->{attributes} ? "fdata${index},ftype,fgroup,fattribute,fattribute_to_feature\n"
 997                                 : "fdata${index},ftype,fgroup\n";
 998 }
 999
1000
1001 ####################################
1002 # moved from mysqlopt.pm
1003 ###################################
1004 # meta values
1005 sub default_meta_values {
1006   my $self = shift;
1007   my @values = $self->SUPER::default_meta_values;
1008   return (
1009           @values,
1010           max_bin => MAX_BIN,
1011           min_bin => MIN_BIN,
1012           straight_join_limit => STRAIGHT_JOIN_LIMIT,
1013          );
1014 }
1015
1016 sub min_bin {
1017   my $self = shift;
1018   return $self->meta('min_bin') || MIN_BIN;
1019 }
1020 sub max_bin {
1021   my $self = shift;
1022   return $self->meta('max_bin') || MAX_BIN;
1023 }
1024 sub straight_join_limit {
1025   my $self = shift;
1026   return $self->meta('straight_join_limit') || STRAIGHT_JOIN_LIMIT;
1027 }
1028
1029 1;