scripts/biosql/load_seqdatabase.pl

   1 #!/bin/perl
   2 #
   3 # You may distribute this module under the same terms as perl itself.
   4 # Refer to the Perl Artistic License (see the license accompanying this
   5 # software package, or see http://www.perl.com/language/misc/Artistic.html)
   6 # for the terms under which you may use, modify, and redistribute this module.
   7 #
   8 # $Id$
   9 #
  10
  11 =head1 NAME
  12
  13 load_seqdatabase.pl
  14
  15 =head1 SYNOPSIS
  16
  17    load_seqdatabase.pl --host somewhere.edu --dbname biosql \
  18                        --namespace swissprot --format swiss \
  19                        swiss_sptrembl swiss.dat primate.dat
  20
  21 =head1 DESCRIPTION
  22
  23 This script loads a BioSQL database with sequences. There are a number
  24 of options that have to do with where the database is and how it's
  25 accessed and the format and namespace of the input files. These are
  26 followed by any number of file names. The files are assumed to be
  27 formatted identically with the format given by the --format flag. See
  28 below for more details.
  29
  30 =head1 ARGUMENTS
  31
  32 The arguments after the named options constitute the filelist. If
  33 there are no such files, input is read from stdin. Default values for
  34 each parameter are shown in square brackets. Note that --bulk is no
  35 longer available.
  36
  37 =over 2
  38
  39 =item --host $URL
  40
  41 The host name or IP address incl. port. The default is undefined,
  42 which will get interpreted differently depending on the driver. E.g.,
  43 the mysql driver will assume localhost if host is undefined; the
  44 PostgreSQL driver will use a local (file-)socket connection to the
  45 local host, whereas it will use a TCP socket (which has to be enabled
  46 separately when starting the postmaster) if you specify 'localhost';
  47 the Oracle driver doesn't need (or may even get confused by) a host
  48 name if the local tnsnames.ora can properly resolve the SID, which
  49 would be specified using --dbname.
  50
  51 =item --port $port
  52
  53 the port to which to connect; usually the default port chosen by the
  54 driver will be appropriate.
  55
  56 =item --dbname $db_name
  57
  58 the name of the schema [biosql]
  59
  60 =item --dbuser $username
  61
  62 database username [root]
  63
  64 =item --dbpass $password
  65
  66 password [undef]
  67
  68 =item --driver $driver
  69
  70 the DBI driver name for the RDBMS e.g., mysql, Pg, or Oracle [mysql]
  71
  72 =item --dsn dsn
  73
  74 Instead of providing the database connection and driver parameters
  75 individually, you may also specify the DBI-formatted DSN that is to be
  76 used verbatim for connecting to the database. Note that if you do give
  77 individual parameters in addition they will not supplant what is in
  78 the DSN string. Hence, the only database-related parameter that may be
  79 useful to specify in addition is --driver, as that is used also for
  80 selecting the driver-specific adaptors that generate SQL
  81 code. Usually, the driver will be parsed out from the DSN though and
  82 therefore will be set as well by setting the DSN.
  83
  84 Consult the POD of your DBI driver for how to properly format the DSN
  85 for it. A typical example is dbi:Pg:dbname=biosql;host=foo.bar.edu
  86 (for PostgreSQL). Note that the DSN will be specific to the driver
  87 being used.
  88
  89 =item --schema schemaname
  90
  91 The schema under which the BioSQL tables reside in the database. For
  92 Oracle and MySQL this is synonymous with the user, and won't have an
  93 effect. PostgreSQL since v7.4 supports schemas as the namespace for
  94 collections of tables within a database.
  95
  96 =item --initrc paramfile
  97
  98 Instead of, or in addition to, specifying every individual database
  99 connection parameter you may put them into a file that when read by
 100 perl evaluates to an array or hash reference. This option specifies
 101 the file to read; the special value DEFAULT (or no value) will use a
 102 file ./.bioperldb or $HOME/.bioperldb, whichever is found first in
 103 that order.
 104
 105 Constructing a file that evaluates to a hash reference is very
 106 simple. The first non-space character needs to be an open curly brace,
 107 and the last non-space character a closing curly brace. In between the
 108 curly braces, write option name enclosed in single quotes, followed by
 109 => (equal to or greater than), followed by the value in single
 110 quotes. Separate each such option/value pair by comma. Here is an
 111 example:
 112
 113 {
 114     '-dbname' => 'mybiosql', '-host' => 'foo.bar.edu', '-user' => 'cleo'
 115 }
 116
 117 Line breaks and white space don't matter (except if in the value
 118 itself). Also note that options only have a single dash as prefix, and
 119 they need to be those accepted by Bio::DB::BioDB->new()
 120 (L<Bio::DB::BioDB>) or Bio::DB::SimpleDBContext->new()
 121 (L<Bio::DB::SimpleDBContext>). Those sometimes differ slightly from the
 122 option names used by this script, e.g., --dbuser corresponds to -user.
 123
 124 Note also that using the above example, you can use it for --initrc
 125 and still connect as user caesar by also supplying --dbuser caesar on
 126 the command line. I.e., command line arguments override any parameters
 127 also found in the initrc file.
 128
 129 Finally, note that if using this option with default file name and the
 130 default file is not found at any of the default locations, the option
 131 will be ignored; it is not considered an error.
 132
 133 =item --namespace $namesp
 134
 135 The namespace under which the sequences in the input files are to be
 136 created in the database. Note that the namespace will be
 137 left untouched if the object to be submitted has it set already [bioperl].
 138
 139 =item --lookup
 140
 141 flag to look-up by unique key first, converting the insert into an
 142 update if the object is found
 143
 144 =item --flatlookup
 145
 146 Similar to --lookup, but only the 'flat' row for the object is looked
 147 up, meaning no children will be fetched and attached to the
 148 object. This is potentially much faster than a full recursive object
 149 retrieval, but as a result the retrieved object lacks all association
 150 properties (e.g., a flat Bio::SeqI object would lack all features and
 151 all annotation, but still have display_id, accession, version
 152 etc.). This option is therefore most useful if you want to delete
 153 found objects (--remove), as then any time spent on retrieving more
 154 than the row together with the primary key is wasted.
 155
 156 =item --noupdate
 157
 158 don't update if object is found (with --lookup)
 159
 160 =item --remove
 161
 162 flag to remove sequences before actually adding them (this
 163 necessitates a prior lookup)
 164
 165 =item --safe
 166
 167 flag to continue despite errors when loading (the entire object
 168 transaction will still be rolled back)
 169
 170 =item --testonly
 171
 172 don't commit anything, rollback at the end
 173
 174 =item --format
 175
 176 This may theoretically be any IO subsystem and the format understood by
 177 that subsystem to parse the input file(s). IO subsystem and format must
 178 be separated by a double colon. See below for which subsystems are
 179 currently supported.
 180
 181 The default IO subsystem is SeqIO. 'Bio::' will automatically be
 182 prepended if not already present. As of now the other supported
 183 subsystem is ClusterIO. All input files must have the same format.
 184
 185 Examples:
 186     # this is the default
 187     --format genbank
 188     # SeqIO format EMBL
 189     --format embl
 190     # Bio::ClusterIO stream with -format => 'unigene'
 191     --format ClusterIO::unigene
 192
 193 =item --fmtargs
 194
 195 Use this argument to specify initialization parameters for the parser
 196 for the input format. The argument value is expected to be a string
 197 with parameter names and values delimited by commas.
 198
 199 Usually you will want to protect the argument list from interpretation
 200 by the shell, so surround it with double or single quotes.
 201
 202 If a parameter value contains a comma, escape it with a backslash
 203 (which means you also must protect the whole argument from the shell
 204 in order to preserve the backslash)
 205
 206 Examples:
 207
 208     # turn parser exceptions into warnings (don't try this at home)
 209     --fmtargs "-verbose,-1"
 210     # verbose parser with an additional path argument
 211     --fmtargs "-verbose,1,-indexpath,/home/luke/warp"
 212     # escape commas in values
 213     --fmtargs "-myspecialchar,\,"
 214
 215 =item --pipeline
 216
 217 This is a sequence of Bio::Factory::SequenceProcessorI (see
 218 L<Bio::Factory::SequenceProcessorI>) implementing objects that will be
 219 instantiated and chained in exactly this order. This allows you to
 220 write re-usable modules for custom post-processing of objects after
 221 the stream parser returns them. See L<Bio::Seq::BaseSeqProcessor> for
 222 a base implementation for such modules.
 223
 224 Modules are separated by the pipe character '|'. In addition, you can
 225 specify initialization parameters for each of the modules by enclosing
 226 a comma-separated list of alternating parameter name and value pairs
 227 in parentheses or angle brackets directly after the module.
 228
 229 This option will be ignored if no value is supplied.
 230
 231 Examples:
 232     # one module
 233     --pipeline "My::SeqProc"
 234     # two modules in the specified order
 235     --pipeline "My::SeqProc|My::SecondSeqProc"
 236     # two modules, the first of which has two initialization parameters
 237     --pipeline "My::SeqProc(-maxlength,1500,-minlength,300)|My::SecondProc"
 238
 239 =item --seqfilter
 240
 241 This is either a string or a file defining a closure to be used as
 242 sequence filter. The value is interpreted as a file if it refers to a
 243 readable file, and a string otherwise. See add_condition() in
 244 L<Bio::Seq::SeqBuilder> for more information about what the code will
 245 be used for. The closure will be passed a hash reference with an
 246 accumulated list of initialization parameters for the prospective
 247 object. It returns TRUE if the object is to be built and FALSE
 248 otherwise.
 249
 250 Note that this closure operates at the stream parser level. Objects it
 251 rejects will be skipped by the parser. Objects it accepts can still be
 252 intercepted at a later stage (options --remove, --lookup, --noupdate,
 253 --mergeobjs).
 254
 255 Note that not necessarily all stream parsers support a
 256 Bio::Factory::ObjectBuilderI (see L<Bio::Factory::ObjectBuilderI>)
 257 object. Email bioperl-l@bioperl.org to find out which ones do. In
 258 fact, at the time of writing this, only Bio::SeqIO::genbank supports
 259 it.
 260
 261 This option will be ignored if no value is supplied.
 262
 263 =item --mergeobjs
 264
 265 This is also a string or a file defining a closure. If provided, the
 266 closure is called if a look-up for the unique key of the new object
 267 was successful. Hence, it will never be called without supplying
 268 --lookup at the same time.
 269
 270 Note that --noupdate will B<not> prevent the closure from being
 271 called. I.e., if you make changes to the database in your merge script
 272 as opposed to only modifying the object, --noupdate will B<not>
 273 prevent those changes. This is a feature, not a bug. Obviously,
 274 modifications to the in-memory object will have no effect with
 275 --noupdate since the database won't be updated with it.
 276
 277 The closure will be passed three arguments: the object found by
 278 lookup, the new object to be submitted, and the Bio::DB::DBAdaptorI
 279 (see L<Bio::DB::DBAdaptorI>) implementing object for the desired
 280 database. If the closure returns a value, it must be the object to be
 281 inserted or updated in the database (if $obj->primary_key returns a
 282 value, the object will be updated). If it returns undef, the script
 283 will skip to the next object in the input stream.
 284
 285 The purpose of the closure can be manifold. It was originally
 286 conceived as a means to customarily merge attributes or associated
 287 objects of the new object to the existing (found) one in order to
 288 avoid duplications but still capture additional information (e.g.,
 289 annotation). However, there is a multitude of other operations it can
 290 be used for, like physically deleting or altering certain associated
 291 information from the database (the found object and all its associated
 292 objects will implement Bio::DB::PersistentObjectI, see
 293 L<Bio::DB::PersistentObjectI>). Since the third argument is the
 294 persistent object and adaptor factory for the database, there is
 295 literally no limit as to the database operations the closure could
 296 possibly do.
 297
 298 This option will be ignored if no value is supplied.
 299
 300 =item --logchunk
 301
 302 If supplied with an integer argument n greater than zero, progress
 303 will be logged to stderr every n entries of the input file(s). Default
 304 is no progress logging.
 305
 306 =item --debug
 307
 308 Turn on verbose and debugging mode. This will produce a *lot* of
 309 logging output, hence you will want to capture the output in a
 310 file. This option is useful if you get some mysterious failure
 311 somewhere in the events of loading or updating a record, and you would
 312 like to see, e.g., precisely which SQL statement fails. Usually you
 313 turn on this option because you've been asked to do so by a person
 314 responding after you posted your problem to the Bioperl mailing list.
 315
 316 =item -u, -z, or --uncompress
 317
 318 Uncompress the input file(s) on-the-fly by piping them through
 319 gunzip. Gunzip must be in your path for this option to work.
 320
 321 =item more args
 322
 323 The remaining arguments will be treated as files to parse and load. If
 324 there are no additional arguments, input is expected to come from
 325 standard input.
 326
 327 =back
 328
 329 =head1 Authors
 330
 331 Ewan Birney E<lt>birney at ebi.ac.ukE<gt>
 332 Mark Wilkinson E<lt>mwilkinson at gene.pbi.nrc.caE<gt>
 333 Hilmar Lapp E<lt>hlapp at gmx.netE<gt>
 334 Chris Mungall E<lt>cjm at fruitfly.orgE<gt>
 335 Elia Stupka E<lt>elia at tll.org.sgE<gt>
 336
 337 =cut
 338
 339
 340 use Getopt::Long;
 341 use Carp (qw:cluck confess:);
 342 use Symbol;
 343 use Bio::Root::Root;
 344 use Bio::DB::BioDB;
 345 use Bio::Annotation::SimpleValue;
 346 use Bio::SeqIO;
 347 use Bio::ClusterIO;
 348
 349 ####################################################################
 350 # Defaults for options changeable through command line
 351 ####################################################################
 352 my ($host,$port);
 353 my $dbname;
 354 my $dbuser;
 355 my $driver;
 356 my $dbpass;
 357 my $schema;
 358 my $format = 'genbank';
 359 my $fmtargs = '';
 360 my $namespace = 'bioperl';
 361 my $logchunk = 0;        # log progress after <x> entries (0 = don't)
 362 my $seqfilter;           # see conditions in Bio::Seq::SeqBuilder
 363 my $mergefunc;           # if and how to merge old (found) and new objects
 364 my $pipeline;            # see Bio::Factory::SequenceProcessorI
 365 my $initrc;              # use an initialization file for parameters?
 366 my $dsn;                 # DSN to use verbatim for connecting, if any
 367 #
 368 # flags
 369 #
 370 my $remove_flag = 0;     # remove object before creating
 371 my $lookup_flag = 0;     # look up object before creating, update if found
 372 my $flat_flag = 0;       # don't attach children (when doing a lookup)
 373 my $no_update_flag = 0;  # do not update if found on look up
 374 my $help = 0;            # WTH
 375 my $debug = 0;           # try it ...
 376 my $testonly_flag = 0;   # don't commit anything, rollback at the end
 377 my $safe_flag = 0;       # tolerate exceptions on create
 378 my $uncompress = 0;      # whether to pipe through gunzip
 379 my $printerror = 0;      # whether to print DBI error messages
 380 ####################################################################
 381 # Global defaults or definitions not changeable through commandline
 382 ####################################################################
 383
 384 #
 385 # map of I/O type to the next_XXXX method name
 386 #
 387 my %nextobj_map = (
 388                    'Bio::SeqIO'     => 'next_seq',
 389                    'Bio::ClusterIO' => 'next_cluster',
 390                    );
 391
 392 ####################################################################
 393 # End of defaults
 394 ####################################################################
 395
 396 #
 397 # get options from commandline
 398 #
 399 my $ok = GetOptions( 'host=s'         => \$host,
 400                      'port=i'         => \$port,
 401                      'driver=s'       => \$driver,
 402                      'dbname=s'       => \$dbname,
 403                      'dbuser=s'       => \$dbuser,
 404                      'dbpass=s'       => \$dbpass,
 405                      'dsn=s'          => \$dsn,
 406                      'schema=s'       => \$schema,
 407                      'format=s'       => \$format,
 408                      'fmtargs=s'      => \$fmtargs,
 409                      'initrc:s'       => \$initrc,
 410                      'seqfilter:s'    => \$seqfilter,
 411                      'namespace=s'    => \$namespace,
 412                      'pipeline:s'     => \$pipeline,
 413                      'mergeobjs:s'    => \$mergefunc,
 414                      'logchunk=i'     => \$logchunk,
 415                      'safe'           => \$safe_flag,
 416                      'remove'         => \$remove_flag,
 417                      'lookup'         => \$lookup_flag,
 418                      'flatlookup'     => \$flat_flag,
 419                      'noupdate'       => \$no_update_flag,
 420                      'debug'          => \$debug,
 421                      'testonly'       => \$testonly_flag,
 422                      'u|z|uncompress' => \$uncompress,
 423                      'printerror'     => \$printerror,
 424                      'h|help'         => \$help
 425                      );
 426
 427 if((! $ok) || $help) {
 428     if(! $ok) {
 429         print STDERR "missing or unsupported option(s) on commandline\n";
 430     }
 431     system("perldoc $0");
 432     exit($ok ? 0 : 2);
 433 }
 434
 435 #
 436 # determine the function for re-throwing exceptions depending on $debug and
 437 # $safe_flag
 438 #
 439 my $throw = $safe_flag ?
 440     ($debug > 0 ? \&Carp::cluck : \&Carp::carp) :
 441     ($debug > 0 ? \&Carp::confess : \&Carp::croak);
 442
 443 # set the lookup flag in addition if only --flatlookup specified
 444 $lookup_flag = $flat_flag if ($flat_flag);
 445
 446 #
 447 # load and/or parse condition if supplied
 448 #
 449 my $condition = parse_code($seqfilter) if $seqfilter;
 450
 451 #
 452 # load and/or parse object merge function if supplied
 453 #
 454 my $merge_objs = parse_code($mergefunc) if $mergefunc;
 455
 456 #
 457 # determine input source(s)
 458 #
 459 my @files = @ARGV ? @ARGV : (\*STDIN);
 460
 461 #
 462 # determine input format and type
 463 #
 464 my $objio;
 465 my @fmtelems = split(/::/, $format);
 466 if(@fmtelems > 1) {
 467     $format = pop(@fmtelems);
 468     $objio = join('::', @fmtelems);
 469 } else {
 470     # default is SeqIO
 471     $objio = "SeqIO";
 472 }
 473 $objio = "Bio::".$objio if $objio !~ /^Bio::/;
 474 my $nextobj = $nextobj_map{$objio} || "next_seq"; # next_seq is the default
 475
 476 # the format might come with argument specifications
 477 my @fmtargs = split(/,/,$fmtargs,-1);
 478 # arguments might have had commas in them - we require them to be
 479 # escaped by backslash and need to stitch them back together now
 480 my $i = 0;
 481 while($i+1 < @fmtargs) {
 482     if($fmtargs[$i] =~ s/\\$//) {
 483         splice(@fmtargs, $i, 2, $fmtargs[$i].",".$fmtargs[$i+1]);
 484     } else {
 485         $i++;
 486     }
 487 }
 488
 489 #
 490 # setup the pipeline if desired
 491 #
 492 my @pipemods = ();
 493 if($pipeline) {
 494     if($objio ne "Bio::SeqIO") {
 495         die "pipelining sequence processors not supported for non-SeqIOs\n";
 496     }
 497     @pipemods = setup_pipeline($pipeline);
 498     warn "you specified -pipeline, but no processor modules resulted\n"
 499         unless @pipemods;
 500 }
 501
 502 #
 503 # check whether we need to apply defaults
 504 #
 505 $initrc = "DEFAULT" unless $initrc || !defined($initrc);
 506
 507 #
 508 # create the DBAdaptorI for our database
 509 #
 510 my $db = Bio::DB::BioDB->new(-database   => "biosql",
 511                              -printerror => $printerror,
 512                              -host       => $host,
 513                              -port       => $port,
 514                              -dbname     => $dbname,
 515                              -driver     => $driver,
 516                              -user       => $dbuser,
 517                              -pass       => $dbpass,
 518                              -dsn        => $dsn,
 519                              -schema     => $schema,
 520                              -initrc     => $initrc,
 521                              );
 522 $db->verbose($debug) if $debug > 0;
 523
 524 # declarations
 525 my ($pseq, $adp);
 526 my $time = time();
 527 my $n_entries = 0;
 528
 529 #
 530 # loop over every input file and load its content
 531 #
 532 foreach $file ( @files ) {
 533
 534     my $fh = $file;
 535     my $seqin;
 536
 537     # create a handle if it's not one already
 538     if(! ref($fh)) {
 539         $fh = gensym;
 540         my $fspec = $uncompress ? "gunzip -c $file |" : "<$file";
 541         if(! open($fh, $fspec)) {
 542             warn "unable to open $file for reading, skipping: $!\n";
 543             next;
 544         }
 545         print STDERR "Loading $file ...\n";
 546     }
 547     # create stream
 548     $seqin = $objio->new(-fh => $fh,
 549                          $format ? (-format => $format) : (),
 550                          @fmtargs);
 551
 552     # establish filter if provided
 553     if($condition) {
 554         if(! $seqin->can('sequence_builder')) {
 555             $seqin->throw("object IO parser ".ref($seqin).
 556                           " does not support control by ObjectBuilderIs");
 557         }
 558         $seqin->sequence_builder->add_object_condition($condition);
 559     }
 560
 561     # chain to pipeline if pipelining is requested
 562     if(@pipemods) {
 563         $pipemods[0]->source_stream($seqin);
 564         $seqin = $pipemods[-1];
 565     }
 566
 567     # reset entry counter and timer
 568     $n_entries = 0;
 569     $time = time();
 570
 571     # loop over the stream
 572     while( my $seq = $seqin->$nextobj ) {
 573         # increment entry counter
 574         $n_entries++;
 575
 576         # report progress if enabled
 577         if (($logchunk > 0) && (($n_entries % $logchunk) == 0)) {
 578             my $elapsed = time() - $time;
 579             printf STDERR
 580                 "\t... loaded $n_entries entries "
 581                 . "(in %.2d:%.2d:%.2d, %5.2f entries/s)\n",
 582                 $elapsed/3600, ($elapsed % 3600)/60, $elapsed % 60,
 583                 $logchunk / $elapsed;
 584             $time = time();
 585         }
 586
 587         # we can't store the structure for structured values yet, so
 588         # flatten them
 589         if($seq->isa("Bio::AnnotatableI")) {
 590             flatten_annotations($seq->annotation);
 591         }
 592         # don't forget to add namespace if the parser doesn't supply one
 593         $seq->namespace($namespace) unless $seq->namespace();
 594         # look up or delete first?
 595         my $lseq;
 596         if($lookup_flag || $remove_flag) {
 597             # look up
 598             $adp = $db->get_object_adaptor($seq);
 599             $lseq = $adp->find_by_unique_key($seq,
 600                                              -obj_factory =>
 601                                              $seqin->object_factory(),
 602                                              -flat_only => $flat_flag);
 603             # found?
 604             if($lseq) {
 605                 # merge old and new if a function for this is provided
 606                 $seq = &$merge_objs($lseq, $seq, $db) if $merge_objs;
 607                 # the return value may indicate to skip to the next
 608                 next unless $seq;
 609             }
 610         }
 611         # try to serialize
 612         eval {
 613             # set the adaptor variable before any operation which may throw
 614             # us out of the eval block
 615             $adp = $lseq ? $lseq->adaptor() : $db->get_object_adaptor($seq);
 616             # delete first if requested
 617             $lseq->remove() if $remove_flag && $lseq;
 618             # on update, skip the rest if we are not supposed to update
 619             if(! ($lseq && $no_update_flag)) {
 620                 # create a persistent object out of the seq if it's
 621                 # not one already (merge_objs may have returned the
 622                 # looked up sequence, i.e., $lseq)
 623                 $pseq = $seq->isa("Bio::DB::PersistentObjectI")
 624                     ? $seq : $db->create_persistent($seq);
 625                 # store the primary key of what we found by lookup (this
 626                 # is going to be an udate then)
 627                 if($lseq && $lseq->primary_key) {
 628                     $pseq->primary_key($lseq->primary_key);
 629                 }
 630                 $pseq->store(); # inserts if primary key not set
 631             }
 632             $adp->commit() unless $testonly_flag;
 633         };
 634         if ($@) {
 635             my $msg = "Could not store ".$seq->object_id().": $@\n";
 636             if($adp) {
 637                 $adp->rollback();
 638             } else {
 639                 $msg .= "\nFailed to load adaptor for ".ref($seq).
 640                     " - not good. You may want to ctrl-c your run ".
 641                     "if you had --safe switched on.";
 642             }
 643             &$throw($msg);
 644         }
 645
 646     }
 647     $seqin->close();
 648 }
 649
 650 # final progress report if enabled
 651 if (($logchunk > 0) && (($n_entries % $logchunk) != 0)) {
 652     my $elapsed = time() - $time;
 653     $elapsed = 1 unless $elapsed; # avoid division by zero
 654     printf STDERR
 655                 "\t... loaded $n_entries entries "
 656                 . "(in %.2d:%.2d:%.2d, %5.2f entries/s)\n",
 657                 $elapsed/3600, ($elapsed % 3600)/60, $elapsed % 60,
 658                 ($n_entries % $logchunk) / $elapsed;
 659 }
 660
 661 $adp->rollback() if $adp && $testonly_flag;
 662
 663 # done!
 664
 665 #################################################################
 666 # Implementation of functions                                   #
 667 #################################################################
 668
 669 sub parse_code{
 670     my $src = shift;
 671     my $code;
 672
 673     # file or subroutine?
 674     if(-r $src) {
 675         if(! (($code = do $src) && (ref($code) eq "CODE"))) {
 676             die "error in parsing code block $src: $@" if $@;
 677             die "unable to read file $src: $!" if $!;
 678             die "failed to run $src, or it failed to return a closure";
 679         }
 680     } else {
 681         $code = eval $src;
 682         die "error in parsing code block \"$src\": $@" if $@;
 683         die "\"$src\" fails to return a closure"
 684             unless ref($code) eq "CODE";
 685     }
 686     return $code;
 687 }
 688
 689 sub setup_pipeline{
 690     my $pipeline = shift;
 691     my @pipemods = ();
 692
 693     # split into modules
 694     my @mods = split(/\|/, $pipeline);
 695     # instantiate a module 'loader'
 696     my $loader = Bio::Root::Root->new();
 697     # load and instantiate each one, then concatenate
 698     foreach my $mod (@mods) {
 699         # separate module name from potential arguments
 700         my $modname = $mod;
 701         my @modargs = ();
 702         if($modname =~ /^(.+)[\(<](.*)[>\)]$/) {
 703             $modname = $1;
 704             @modargs = split(/,/, $2);
 705         }
 706         $loader->_load_module($modname);
 707         my $proc = $modname->new(@modargs);
 708         if(! $proc->isa("Bio::Factory::SequenceProcessorI")) {
 709             die "Pipeline processing module $modname does not implement ".
 710                 "Bio::Factory::SequenceProcessorI. Bummer.\n";
 711         }
 712         $proc->source_stream($pipemods[$#pipemods]) if @pipemods;
 713         push(@pipemods, $proc);
 714     }
 715     return @pipemods;
 716 }
 717
 718 sub flatten_annotations {
 719     my $anncoll = shift;
 720     foreach my $ann ($anncoll->remove_Annotations()) {
 721         if($ann->isa("Bio::Annotation::StructuredValue")) {
 722             foreach my $val ($ann->get_all_values()) {
 723                 $anncoll->add_Annotation(Bio::Annotation::SimpleValue->new(
 724                                            -value => $val,
 725                                            -tagname => $ann->tagname()));
 726             }
 727         } else {
 728             $anncoll->add_Annotation($ann);
 729         }
 730     }
 731 }