2 # BioPerl module for Bio::Tools::Lucy
4 # Copyright Her Majesty the Queen of England
5 # written by Andrew Walsh (paeruginosa@hotmail.com) during employment with
6 # Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB
8 # You may distribute this module under the same terms as perl itself
9 # POD documentation - main docs before the code
13 Bio::Tools::Lucy - Object for analyzing the output from Lucy,
14 a vector and quality trimming program from TIGR
18 # Create the Lucy object from an existing Lucy output file
19 @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1);
20 $lucyObj = Bio::Tools::Lucy->new(@params);
22 # Get names of all sequences
23 $names = $lucyObj->get_sequence_names();
25 # Print seq and qual values for sequences >400 bp in order to run CAP3
26 foreach $name (@$names) {
27 next unless $lucyObj->length_clear($name) > 400;
28 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
29 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
32 # Get an array of Bio::PrimarySeq objects
33 @seqObjs = $lucyObj->get_Seq_Objs();
38 Bio::Tools::Lucy.pm provides methods for analyzing the sequence and
39 quality values generated by Lucy program from TIGR.
41 Lucy will identify vector, poly-A/T tails, and poor quality regions in
42 a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf)
44 The input to Lucy can be the Phred sequence and quality files
45 generated from running Phred on a set of chromatograms.
47 Lucy can be obtained (free of charge to academic users) from
50 There are a few methods that will only be available if you make some
51 minor changes to the source for Lucy and then recompile. The changes
52 are in the 'lucy.c' file and there is a diff between the original and
53 the modified file in the Appendix
55 Please contact the author of this module if you have any problems
56 making these modifications.
58 You do not have to make these modifications to use this module.
60 =head2 Creating a Lucy object
62 @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1,
63 'fwd_desig' => '_F', 'rev_desig' => '_R');
64 $lucyObj = Bio::Tools::Lucy->new(@params);
66 =head2 Using a Lucy object
68 You should get an array with the sequence names in order to use
69 accessor methods. Note: The Lucy binary program will fail unless
70 the sequence names provided as input are unique.
72 $names_ref = $lucyObj->get_sequence_names();
74 This code snippet will produce a Fasta format file with sequence
75 lengths and %GC in the description line.
77 foreach $name (@$names) {
78 print FILE ">$name\t",
79 $lucyObj->length_clear($name), "\t",
80 $lucyObj->per_GC($name), "\n",
81 $lucyObj->sequence($name), "\n";
85 Print seq and qual values for sequences >400 bp in order to assemble
86 them with CAP3 (or other assembler).
88 foreach $name (@$names) {
89 next unless $lucyObj->length_clear($name) > 400;
90 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
91 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
94 Get all the sequences as Bio::PrimarySeq objects (eg., for use with
95 Bio::Tools::Run::StandaloneBlast to perform BLAST).
97 @seqObjs = $lucyObj->get_Seq_Objs();
99 Or use only those sequences that are full length and have a Poly-A
102 foreach $name (@$names) {
103 next unless ($lucyObj->full_length($name) and $lucy->polyA($name));
104 push @seqObjs, $lucyObj->get_Seq_Obj($name);
108 Get the names of those sequences that were rejected by Lucy.
110 $rejects_ref = $lucyObj->get_rejects();
112 Print the names of the rejects and 1 letter code for reason they
115 foreach $key (sort keys %$rejects_ref) {
116 print "$key: ", $rejects_ref->{$key};
119 There is a lot of other information available about the sequences
120 analyzed by Lucy (see APPENDIX). This module can be used with the
121 DBI module to store this sequence information in a database.
127 User feedback is an integral part of the evolution of this and other
128 Bioperl modules. Send your comments and suggestions preferably to one
129 of the Bioperl mailing lists. Your participation is much appreciated.
131 bioperl-l@bioperl.org - General discussion
132 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
136 Please direct usage questions or support issues to the mailing list:
138 I<bioperl-l@bioperl.org>
140 rather than to the module maintainer directly. Many experienced and
141 reponsive experts will be able look at the problem and quickly
142 address it. Please include a thorough description of the problem
143 with code and data examples if at all possible.
145 =head2 Reporting Bugs
147 Report bugs to the Bioperl bug tracking system to help us keep track
148 the bugs and their resolution. Bug reports can be submitted via the web:
150 https://github.com/bioperl/bioperl-live/issues
154 Andrew G. Walsh paeruginosa@hotmail.com
158 Methods available to Lucy objects are described below. Please note
159 that any method beginning with an underscore is considered internal
160 and should not be called directly.
165 package Bio
::Tools
::Lucy
;
167 use vars
qw($AUTOLOAD @ATTR %OK_FIELD);
171 use base qw(Bio::Root::Root Bio::Root::IO);
172 @ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr);
173 foreach my $attr (@ATTR) {
179 my $attr = $AUTOLOAD;
182 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
183 $self->{$attr} = shift if @_;
184 return $self->{$attr};
190 Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R',
192 Function: creates a Lucy object from Lucy analysis files
193 Returns : reference to Bio::Tools::Lucy object
194 Args : seqfile Fasta sequence file generated by Lucy
195 qualfile Quality values file generated by Lucy
196 infofile Info file created when Lucy is run with -debug
198 stderrfile Standard error captured from Lucy when Lucy is run
199 with -info option and STDERR is directed to stderrfile
200 (ie. lucy ... 2> stderrfile).
201 Info in this file will include sequences dropped for low
202 quality. If you've modified Lucy source (see adv_stderr below),
203 it will also include info on which sequences were dropped because
204 they were vector, too short, had no insert, and whether a poly-A
205 tail was found (if Lucy was run with -cdna option).
206 lucy_verbose verbosity level (0-1).
207 fwd_desig The string used to determine whether sequence is a
209 The parser will assume that this match will occus at the
210 end of the sequence name string.
211 rev_desig As above, for reverse reads.
212 adv_stderr Can be set to a true value (1). Will only work if
214 the Lucy source code as outlined in DESCRIPTION and capture
215 the standard error from Lucy.
217 If you don't provide filenames for qualfile, infofile or stderrfile,
218 the module will assume that .qual, .info, and .stderr are the file
219 extensions and search in the same directory as the .seq file for these
222 For example, if you create a Lucy object with $lucyObj =
223 Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will
224 find lucy.qual, lucy.info and lucy.stderr.
226 You can omit any or all of the quality, info or stderr files, but you
227 will not be able to use all of the object methods (see method
228 documentation below).
233 my ($class,@args) = @_;
234 my $self = $class->SUPER::new
(@args);
239 $value = shift @args;
240 $self->{$attr} = $value;
249 Usage : n/a (internal function)
250 Function: called by new() to parse Lucy output files
258 $self->{seqfile
} =~ /^(\S+)\.\S+$/;
261 $self->warn("Opening $self->{seqfile} for parsing...\n") if $self->{lucy_verbose
};
262 open my $SEQ, '<', $self->{seqfile
} or $self->throw("Could not read sequence file '$self->{seqfile}': $!");
266 while ($line = pop @lines) {
268 if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) {
270 if ($self->{fwd_desig
}) {
271 $self->{sequences
}{$name}{direction
} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/;
273 if ($self->{rev_desig
}) {
274 $self->{sequences
}{$name}{direction
} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/;
276 $self->{sequences
}{$name}{min_clone_len
} = $2; # this is used for TIGR Assembler, as are $3 and $4
277 $self->{sequences
}{$name}{max_clone_len
} = $3;
278 $self->{sequences
}{$name}{med_clone_len
} = $4;
279 $self->{sequences
}{$name}{beg_clear
} = $5;
280 $self->{sequences
}{$name}{end_clear
} = $6;
281 $self->{sequences
}{$name}{length_raw
} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong.
282 my $beg = $5-1; # substr function begins with index 0
283 $seq = $self->{sequences
}{$name}{sequence
} = substr ($seq, $beg, $6-$beg);
284 my $count = $self->{sequences
}{$name}{length_clear
} = $seq =~ tr/[AGCTN]//;
285 my $countGC = $seq =~ tr/[GC]//;
286 $self->{sequences
}{$name}{per_GC
} = $countGC/$count * 100;
294 # now parse quality values (check for presence of quality file first)
295 if ($self->{qualfile
}) {
296 open my $QUAL, '<', $self->{qualfile
} or $self->throw("Could not read quality file '$self->{qualfile}': $!");
299 elsif (-e
"$file.qual") {
300 $self->warn("You did not set qualfile, but I'm opening $file.qual\n") if $self->{lucy_verbose
};
301 $self->qualfile("$file.qual");
302 open my $QUAL, '<', "$file.qual" or $self->throw("Could not read quality file '$file.qual': $!");
306 $self->warn("I did not find a quality file. You will not be able to use all of the accessor methods.\n") if $self->{lucy_verbose
};
310 my (@vals, @slice, $num, $tot, $vals);
312 while ($line = pop @lines) {
314 if ($line =~ /^>(\S+)/) {
316 @vals = split /\s/ , $qual;
317 @slice = @vals[$self->{sequences
}{$name}{beg_clear
} - 1 .. $self->{sequences
}{$name}{end_clear
} - 1];
318 $vals = join "\t", @slice;
319 $self->{sequences
}{$name}{quality
} = $vals;
321 foreach $num (@slice) {
325 $self->{sequences
}{$name}{avg_quality
} = $tot/$num;
333 # determine whether reads are full length
334 if ($self->{infofile
}) {
335 open my $INFO, '<', $self->{infofile
} or $self->throw("Could not read info file '$self->{infofile}': $!");
338 elsif (-e
"$file.info") {
339 $self->warn("You did not set infofile, but I'm opening $file.info\n") if $self->{lucy_verbose
};
340 $self->infofile("$file.info");
341 open my $INFO, '<', "$file.info" or $self->throw("Could not read info file '$file.info': $!");
345 $self->warn("I did not find an info file. You will not be able to use all of the accessor methods.\n") if $self->{lucy_verbose
};
350 /^(\S+).+CLV\s+(\d+)\s+(\d+)$/;
352 $self->{sequences
}{$1}{full_length
} = 1 if $self->{sequences
}{$1}; # will show cleavage info for rejected sequences too
357 # parse rejects (and presence of poly-A if Lucy has been modified)
358 if ($self->{stderrfile
}) {
359 open my $STDERR_LUCY, '<', $self->{stderrfile
} or $self->throw("Could not read quality file '$self->{stderrfile}': $!");
360 @lines = <$STDERR_LUCY>;
362 elsif (-e
"$file.stderr") {
363 $self->warn("You did not set stderrfile, but I'm opening $file.stderr\n") if $self->{lucy_verbose
};
364 $self->stderrfile("$file.stderr");
365 open my $STDERR_LUCY, '<', "$file.stderr" or $self->throw("Could not read quality file '$file.stderr': $!");
366 @lines = <$STDERR_LUCY>;
369 $self->warn("I did not find a standard error file. You will not be able to use all of the accessor methods.\n") if $self->{lucy_verbose
};
373 if ($self->{adv_stderr
}) {
375 $self->{reject
}{$1} = "Q" if /dropping\s+(\S+)/;
376 $self->{reject
}{$1} = "V" if /Vector: (\S+)/;
377 $self->{reject
}{$1} = "E" if /Empty: (\S+)/;
378 $self->{reject
}{$1} = "S" if m{Short/ no insert: (\S+)};
379 $self->{sequences
}{$1}{polyA
} = 1 if /(\S+) has PolyA/;
380 if (/Dropped PolyA: (\S+)/) {
381 $self->{reject
}{$1} = "P";
382 delete $self->{sequences
}{$1};
388 $self->{reject
}{$1} = "R" if /dropping\s+(\S+)/;
396 Usage : $lucyObj->get_Seq_Objs()
397 Function: returns an array of references to Bio::PrimarySeq objects
398 where -id = 'sequence name' and -seq = 'sequence'
400 Returns : array of Bio::PrimarySeq objects
407 my($seqobj, @seqobjs);
408 foreach my $key (sort keys %{$self->{sequences
}}) {
409 $seqobj = Bio
::PrimarySeq
->new( -seq
=> "$self->{sequences}{$key}{sequence}",
411 push @seqobjs, $seqobj;
419 Usage : $lucyObj->get_Seq_Obj($seqname)
420 Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name'
421 and -seq = 'sequence'
422 Returns : reference to Bio::PrimarySeq object
423 Args : name of a sequence
428 my ($self, $key) = @_;
429 my $seqobj = Bio
::PrimarySeq
->new( -seq
=> "$self->{sequences}{$key}{sequence}",
434 =head2 get_sequence_names
436 Title : get_sequence_names
437 Usage : $lucyObj->get_sequence_names
438 Function: returns reference to an array of names of the sequences analyzed by Lucy.
439 These names are required for most of the accessor methods.
440 Note: The Lucy binary will fail unless sequence names are unique.
441 Returns : array reference
446 sub get_sequence_names
{
448 my @keys = sort keys %{$self->{sequences
}};
455 Usage : $lucyObj->sequence($seqname)
456 Function: returns the DNA sequence of one of the sequences analyzed by Lucy.
458 Args : name of a sequence
463 my ($self, $key) = @_;
464 return $self->{sequences
}{$key}{sequence
};
470 Usage : $lucyObj->quality($seqname)
471 Function: returns the quality values of one of the sequences analyzed by Lucy.
472 This method depends on the user having provided a quality file.
474 Args : name of a sequence
479 my($self, $key) = @_;
480 return $self->{sequences
}{$key}{quality
};
486 Usage : $lucyObj->avg_quality($seqname)
487 Function: returns the average quality value for one of the sequences analyzed by Lucy.
489 Args : name of a sequence
494 my($self, $key) = @_;
495 return $self->{sequences
}{$key}{avg_quality
};
501 Usage : $lucyObj->direction($seqname)
502 Function: returns the direction for one of the sequences analyzed by Lucy
503 providing that 'fwd_desig' or 'rev_desig' were set when the
504 Lucy object was created.
505 Strings returned are: 'F' for forward, 'R' for reverse.
507 Args : name of a sequence
512 my($self, $key) = @_;
513 return $self->{sequences
}{$key}{direction
} if $self->{sequences
}{$key}{direction
};
520 Usage : $lucyObj->length_raw($seqname)
521 Function: returns the length of a DNA sequence prior to quality/ vector
524 Args : name of a sequence
529 my($self, $key) = @_;
530 return $self->{sequences
}{$key}{length_raw
};
536 Usage : $lucyObj->length_clear($seqname)
537 Function: returns the length of a DNA sequence following quality/ vector
540 Args : name of a sequence
545 my($self, $key) = @_;
546 return $self->{sequences
}{$key}{length_clear
};
552 Usage : $lucyObj->start_clear($seqname)
553 Function: returns the beginning position of good quality, vector free DNA sequence
556 Args : name of a sequence
561 my($self, $key) = @_;
562 return $self->{sequences
}{$key}{beg_clear
};
569 Usage : $lucyObj->end_clear($seqname)
570 Function: returns the ending position of good quality, vector free DNA sequence
573 Args : name of a sequence
578 my($self, $key) = @_;
579 return $self->{sequences
}{$key}{end_clear
};
585 Usage : $lucyObj->per_GC($seqname)
586 Function: returns the percente of the good quality, vector free DNA sequence
589 Args : name of a sequence
594 my($self, $key) = @_;
595 return $self->{sequences
}{$key}{per_GC
};
601 Usage : $lucyObj->full_length($seqname)
602 Function: returns the truth value for whether or not the sequence read was
603 full length (ie. vector present on both ends of read). This method
604 depends on the user having provided the 'info' file (Lucy must be
605 run with the -debug 'info_filename' option to get this file).
607 Args : name of a sequence
612 my($self, $key) = @_;
613 return 1 if $self->{sequences
}{$key}{full_length
};
620 Usage : $lucyObj->polyA($seqname)
621 Function: returns the truth value for whether or not a poly-A tail was detected
622 and clipped by Lucy. This method depends on the user having modified
623 the source for Lucy as outlined in DESCRIPTION and invoking Lucy with
624 the -cdna option and saving the standard error.
625 Note, the final sequence will not show the poly-A/T region.
627 Args : name of a sequence
632 my($self, $key) = @_;
633 return 1 if $self->{sequences
}{$key}{polyA
};
640 Usage : $lucyObj->get_rejects()
641 Function: returns a hash containing names of rejects and a 1 letter code for the
642 reason Lucy rejected the sequence.
643 Q- rejected because of low quality values
644 S- sequence was short
645 V- sequence was vector
646 E- sequence was empty
647 P- poly-A/T trimming caused sequence to be too short
648 In order to get the rejects, you must provide a file with the standard
649 error from Lucy. You will only get the quality category rejects unless
650 you have modified the source and recompiled Lucy as outlined in DESCRIPTION.
651 Returns : hash reference
658 return $self->{reject
};
661 =head2 Diff for Lucy source code
664 > /* AGW added next line */
665 > fprintf(stderr, "Empty: %s\n", seqs[i].name);
667 > /* AGW added next line */
668 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name);
670 < if (left) seqs[i].left+=left;
673 > seqs[i].left+=left;
674 > /* AGW added next line */
675 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name);
678 < if (right) seqs[i].right-=right;
681 > seqs[i].right-=right;
682 > /* AGW added next line */
683 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name);
686 > /* AGW added next line */
687 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name);
689 > /* AGW added next line */
690 > fprintf(stderr, "Vector: %s\n", seqs[i].name);
694 =head2 This patch is to be applied to lucy.c from the lucy-1.19p release
697 > /* AGW added next line */
698 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name);
700 < if ((seqs[i].len=bases)<=0)
702 > if ((seqs[i].len=bases)<=0) {
703 > /* AGW added next line */
704 > fprintf(stderr, "Empty: %s\n", seqs[i].name);
708 < if (left) seqs[i].left+=left;
711 > seqs[i].left+=left;
712 > /* AGW added next line */
713 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name);
716 < if (right) seqs[i].right-=right;
719 > seqs[i].right-=right;
720 > /* AGW added next line */
721 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name);
724 > /* AGW added next line */
725 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name);
727 > /* AGW added next line */
728 > fprintf(stderr, "Vector: %s\n", seqs[i].name);