t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / SeqIO / msout.pm
blobacaf4c14701e1f9a37fddabe461d027b9961957f
1 # POD documentation - main docs before the code
3 =head1 NAME
5 Bio::SeqIO::msout - input stream for output by Hudson's ms
7 =head1 SYNOPSIS
9 Do not use this module directly. Use it via the Bio::SeqIO class.
11 =head1 DESCRIPTION
13 ms ( Hudson, R. R. (2002) Generating samples under a Wright-Fisher neutral
14 model. Bioinformatics 18:337-8 ) can be found at
15 http://home.uchicago.edu/~rhudson1/source/mksamples.html.
17 Currently, this object can be used to read output from ms into seq objects.
18 However, because bioperl has no support for haplotypes created using an infinite
19 sites model (where '1' identifies a derived allele and '0' identifies an
20 ancestral allele), the sequences returned by msout are coded using A, T, C and
21 G. To decode the bases, use the sequence conversion table (a hash) returned by
22 get_base_conversion_table(). In the table, 4 and 5 are used when the ancestry is
23 unclear. This should not ever happen when creating files with ms, but it will be
24 used when creating msOUT files from a collection of seq objects ( To be added
25 later ). Alternatively, use get_next_hap() to get a string with 1's and 0's
26 instead of a seq object.
28 =head2 Mapping to Finite Sites
30 This object can now also be used to map haplotypes created using an infinite sites
31 model to sequences of arbitrary finite length. See set_n_sites() for more detail.
32 Thanks to Filipe G. Vieira <fgvieira@berkeley.edu> for the idea and code.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to the
40 Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Reporting Bugs
47 Report bugs to the Bioperl bug tracking system to help us keep track
48 of the bugs and their resolution. Bug reports can be submitted via the
49 web:
51 https://github.com/bioperl/bioperl-live/issues
53 =head1 AUTHOR - Warren Kretzschmar
55 This module was written by Warren Kretzschmar
57 email: wkretzsch@gmail.com
59 This module grew out of a parser written by Aida Andres.
61 =head1 COPYRIGHT
63 =head2 Public Domain Notice
65 This software/database is ``United States Government Work'' under the
66 terms of the United States Copyright Act. It was written as part of
67 the authors' official duties for the United States Government and thus
68 cannot be copyrighted. This software/database is freely available to
69 the public for use without a copyright notice. Restrictions cannot
70 be placed on its present or future use.
72 Although all reasonable efforts have been taken to ensure the accuracy
73 and reliability of the software and data, the National Human Genome
74 Research Institute (NHGRI) and the U.S. Government does not and cannot
75 warrant the performance or results that may be obtained by using this
76 software or data. NHGRI and the U.S. Government disclaims all
77 warranties as to performance, merchantability or fitness for any
78 particular purpose.
80 =head1 METHODS
82 =cut
84 package Bio::SeqIO::msout;
85 use version;
86 our $API_VERSION = qv('1.1.8');
88 use strict;
89 use base qw(Bio::SeqIO); # This ISA Bio::SeqIO object
90 use Bio::Seq::SeqFactory;
92 =head2 Methods for Internal Use
94 =head3 _initialize
96 Title : _initialize
97 Usage : $stream = Bio::SeqIO::msOUT->new($infile)
98 Function: extracts basic information about the file.
99 Returns : Bio::SeqIO object
100 Args : no_og, gunzip, gzip, n_sites
101 Details :
102 - include 'no_og' flag if the last population of an msout file contains
103 only one haplotype and you don't want the last haplotype to be
104 treated as the outgroup ( suggested when reading data created by ms ).
105 - including 'n_sites' (positive integer) causes all output haplotypes to be
106 mapped to a sequence of length 'n_sites'. See set_n_sites() for more details.
108 =cut
110 sub _initialize {
111 my ( $self, @args ) = @_;
112 $self->SUPER::_initialize(@args);
114 unless ( defined $self->sequence_factory ) {
115 $self->sequence_factory( Bio::Seq::SeqFactory->new() );
117 my ($no_og) = $self->_rearrange( [qw(NO_OG)], @args );
118 my ($n_sites) = $self->_rearrange( [qw(N_SITES)], @args );
120 my %initial_values = (
121 RUNS => undef,
122 N_SITES => undef,
123 SEGSITES => undef,
124 SEEDS => [],
125 MS_INFO_LINE => undef,
126 TOT_RUN_HAPS => undef,
127 POPS => [],
128 NEXT_RUN_NUM => undef, # What run is the next hap from? undef = EOF
129 LAST_READ_HAP_NUM => undef, # What did we just read from
130 LAST_HAPS_RUN_NUM => undef,
131 LAST_READ_POSITIONS => [],
132 LAST_READ_SEGSITES => undef,
133 BUFFER_HAP => undef,
134 NO_OUTGROUP => $no_og,
135 BASE_CONVERSION_TABLE_HASH_REF => {
136 'A' => 0,
137 'T' => 1,
138 'C' => 4,
139 'G' => 5,
142 foreach my $key ( keys %initial_values ) {
143 $self->{$key} = $initial_values{$key};
145 $self->set_n_sites($n_sites);
147 # If the filehandle is defined open it and read a few lines
148 if ( ref( $self->{_filehandle} ) eq 'GLOB' ) {
149 $self->_read_start();
150 return $self;
153 # Otherwise throw a warning
154 else {
155 $self->throw(
156 "No filehandle defined. Please define a file handle through -file when calling msout with Bio::SeqIO"
161 =head3 _read_start
163 Title : _read_start
164 Usage : $stream->_read_start()
165 Function: reads from the filehandle $stream->{_filehandle} all information up to the first haplotype (sequence). Closes the filehandle if all lines have been read.
166 Returns : void
167 Args : none
169 =cut
171 sub _read_start {
172 my $self = shift;
174 my $fh_IN = $self->{_filehandle};
176 # get the first five lines and parse for important info
177 my ( $ms_info_line, $seeds ) = $self->_get_next_clean_hap( $fh_IN, 2, 1 );
179 my @ms_info_line = split( /\s+/, $ms_info_line );
181 my ( $tot_pops, @pop_haplos );
183 # Parsing the ms header line
184 shift @ms_info_line;
185 my $tot_run_haps = shift @ms_info_line;
186 my $runs = shift @ms_info_line;
187 my $segsites;
189 foreach my $word ( 0 .. $#ms_info_line ) {
190 if ( $ms_info_line[$word] eq '-I' ) {
191 $tot_pops = $ms_info_line[ $word + 1 ];
192 for my $pop_num ( 1 .. $tot_pops ) {
193 push @pop_haplos, $ms_info_line[ $word + 1 + $pop_num ];
196 # if @pop_haplos contains a non-digit, then there is an error in the msinfo line.
197 if ( !defined $pop_haplos[-1] || $pop_haplos[-1] =~ /\D/ ) {
198 $self->throw(
199 "Incorrect number of populations in the ms info line (after the -I specifier)"
203 elsif ( $ms_info_line[$word] eq '-s' ) {
204 $segsites = $ms_info_line[ $word + 1 ];
206 else { next; }
209 unless (@pop_haplos) { @pop_haplos = ($tot_run_haps) }
211 my @seeds = split( /\s+/, $seeds );
213 # Save ms info data
214 $self->{RUNS} = $runs;
215 $self->{SEGSITES} = $segsites;
216 $self->{SEEDS} = \@seeds;
217 $self->{MS_INFO_LINE} = $ms_info_line;
218 $self->{TOT_RUN_HAPS} = $tot_run_haps;
219 $self->{POPS} = [@pop_haplos];
221 return;
224 =head2 Methods to Access Data
226 =head3 get_segsites
228 Title : get_segsites
229 Usage : $segsites = $stream->get_segsites()
230 Function: returns the number of segsites in the msOUT file (according to the msOUT header line's -s option), or the current run's segsites if -s was not specified in the command line (in this case the number of segsites varies from run to run).
231 Returns : scalar
232 Args : NONE
234 =cut
236 sub get_segsites {
237 my $self = shift;
238 if ( defined $self->{SEGSITES} ) {
239 return $self->{SEGSITES};
241 else {
242 return $self->get_current_run_segsites;
246 =head3 get_current_run_segsites
248 Title : get_current_run_segsites
249 Usage : $segsites = $stream->get_current_run_segsites()
250 Function: returns the number of segsites in the run of the last read
251 haplotype (sequence).
252 Returns : scalar
253 Args : NONE
255 =cut
257 sub get_current_run_segsites {
258 my $self = shift;
259 return $self->{LAST_READ_SEGSITES};
262 =head3 get_n_sites
264 Title : get_n_sites
265 Usage : $n_sites = $stream->get_n_sites()
266 Function: Gets the number of total sites (variable or not) to be output.
267 Returns : scalar if n_sites option is defined at call time of new()
268 Args : NONE
269 Note :
270 WARNING: Final sequence length might not be equal to n_sites if n_sites is
271 too close to number of segregating sites in the msout file.
273 =cut
275 sub get_n_sites {
276 my ($self) = @_;
278 return $self->{N_SITES};
281 =head3 set_n_sites
283 Title : set_n_sites
284 Usage : $n_sites = $stream->set_n_sites($value)
285 Function: Sets the number of total sites (variable or not) to be output.
286 Returns : 1 on success; throws an error if $value is not a positive integer or undef
287 Args : positive integer
288 Note :
289 WARNING: Final sequence length might not be equal to n_sites if it is
290 too close to number of segregating sites.
291 - n_sites needs to be at least as large as the number of segsites of
292 the next haplotype returned
293 - n_sites may also be set to undef, in which case haplotypes are returned
294 under the infinite sites model assumptions.
296 =cut
298 sub set_n_sites {
299 my ( $self, $value ) = @_;
301 # make sure $value is a positive integer if it is defined
302 if ( defined $value ) {
303 $self->throw(
304 "first argument needs to be a positive integer. argument supplied: $value"
305 ) unless ( $value =~ m/^\d+$/ && $value > 0 );
307 $self->{N_SITES} = $value;
309 return 1;
312 =head3 get_runs
314 Title : get_runs
315 Usage : $runs = $stream->get_runs()
316 Function: returns the number of runs in the msOUT file (according to the
317 msinfo line)
318 Returns : scalar
319 Args : NONE
321 =cut
323 sub get_runs {
324 my $self = shift;
325 return $self->{RUNS};
328 =head3 get_Seeds
330 Title : get_Seeds
331 Usage : @seeds = $stream->get_Seeds()
332 Function: returns an array of the seeds used in the creation of the msOUT file.
333 Returns : array
334 Args : NONE
335 Details : In older versions, ms used three seeds. Newer versions of ms seem to
336 use only one (longer) seed. This function will return all the seeds
337 found.
339 =cut
341 sub get_Seeds {
342 my $self = shift;
343 return @{ $self->{SEEDS} };
346 =head3 get_Positions
348 Title : get_Positions
349 Usage : @positions = $stream->get_Positions()
350 Function: returns an array of the names of each segsite of the run of the last
351 read hap.
352 Returns : array
353 Args : NONE
354 Details : The Positions may or may not vary from run to run depending on the
355 options used with ms.
357 =cut
359 sub get_Positions {
360 my $self = shift;
361 return @{ $self->{LAST_READ_POSITIONS} };
364 =head3 get_tot_run_haps
366 Title : get_tot_run_haps
367 Usage : $number_of_haps_per_run = $stream->get_tot_run_haps()
368 Function: returns the number of haplotypes (sequences) in each run of the msOUT
369 file ( according to the msinfo line ).
370 Returns : scalar >= 0
371 Args : NONE
372 Details : This number should not vary from run to run.
374 =cut
376 sub get_tot_run_haps {
377 my $self = shift;
378 return $self->{TOT_RUN_HAPS};
381 =head3 get_ms_info_line
383 Title : get_ms_info_line
384 Usage : $ms_info_line = $stream->get_ms_info_line()
385 Function: returns the header line of the msOUT file.
386 Returns : scalar
387 Args : NONE
389 =cut
391 sub get_ms_info_line {
392 my $self = shift;
393 return $self->{MS_INFO_LINE};
396 =head3 tot_haps
398 Title : tot_haps
399 Usage : $number_of_haplotypes_in_file = $stream->tot_haps()
400 Function: returns the number of haplotypes (sequences) in the msOUT file.
401 Information gathered from msOUT header line.
402 Returns : scalar
403 Args : NONE
405 =cut
407 sub get_tot_haps {
408 my $self = shift;
409 return ( $self->{TOT_RUN_HAPS} * $self->{RUNS} );
412 =head3 get_Pops
414 Title : get_Pops
415 Usage : @pops = $stream->pops()
416 Function: returns an array of population sizes (order taken from the -I flag in
417 the msOUT header line). This array will include the last hap even if
418 it looks like an outgroup.
419 Returns : array of scalars > 0
420 Args : NONE
422 =cut
424 sub get_Pops {
425 my $self = shift;
426 return @{ $self->{POPS} };
429 =head3 get_next_run_num
431 Title : get_next_run_num
432 Usage : $next_run_number = $stream->next_run_num()
433 Function: returns the number of the ms run that the next haplotype (sequence)
434 will be taken from (starting at 1). Returns undef if the complete
435 file has been read.
436 Returns : scalar > 0 or undef
437 Args : NONE
439 =cut
441 sub get_next_run_num {
442 my $self = shift;
443 return $self->{NEXT_RUN_NUM};
446 =head3 get_last_haps_run_num
448 Title : get_last_haps_run_num
449 Usage : $last_haps_run_number = $stream->get_last_haps_run_num()
450 Function: returns the number of the ms run that the last haplotype (sequence)
451 was taken from (starting at 1). Returns undef if no hap has been
452 read yet.
453 Returns : scalar > 0 or undef
454 Args : NONE
456 =cut
458 sub get_last_haps_run_num {
459 my $self = shift;
460 return $self->{LAST_HAPS_RUN_NUM};
463 =head3 get_last_read_hap_num
465 Title : get_last_read_hap_num
466 Usage : $last_read_hap_num = $stream->get_last_read_hap_num()
467 Function: returns the number (starting with 1) of the last haplotype read from
468 the ms file
469 Returns : scalar >= 0
470 Args : NONE
471 Details : 0 means that no haplotype has been read yet. Is reset to 0 every run.
473 =cut
475 sub get_last_read_hap_num {
476 my $self = shift;
477 return $self->{LAST_READ_HAP_NUM};
480 =head3 outgroup
482 Title : outgroup
483 Usage : $outgroup = $stream->outgroup()
484 Function: returns '1' if the msOUT stream has an outgroup. Returns '0'
485 otherwise.
486 Returns : '1' or '0'
487 Args : NONE
488 Details : This method will return '1' only if the last population in the msOUT
489 file contains only one haplotype. If the last population is not an
490 outgroup then create the msOUT object using 'no_og' as input flag.
491 Also, return 0, if the run has only one population.
493 =cut
495 sub outgroup {
496 my $self = shift;
497 my @pops = $self->get_Pops;
498 if ( $pops[$#pops] == 1 && !defined $self->{NO_OUTGROUP} && @pops > 1 ) {
499 return 1;
501 else { return 0; }
504 =head3 get_next_haps_pop_num
506 Title : get_next_haps_pop_num
507 Usage : ($next_haps_pop_num, $num_haps_left_in_pop) = $stream->get_next_haps_pop_num()
508 Function: First return value is the population number (starting with 1) the
509 next hap will come from. The second return value is the number of haps
510 left to read in the population from which the next hap will come.
511 Returns : (scalar > 0, scalar > 0)
512 Args : NONE
514 =cut
516 sub get_next_haps_pop_num {
517 my $self = shift;
518 my $last_read_hap = $self->get_last_read_hap_num;
519 my @pops = $self->get_Pops;
521 foreach my $pop_num ( 0 .. $#pops ) {
522 if ( $last_read_hap < $pops[$pop_num] ) {
523 return ( $pop_num + 1, $pops[$pop_num] - $last_read_hap );
525 else { $last_read_hap -= $pops[$pop_num] }
528 # In this case we're at the beginning of the next run
529 return ( 1, $pops[0] );
532 =head3 get_next_seq
534 Title : get_next_seq
535 Usage : $seq = $stream->get_next_seq()
536 Function: reads and returns the next sequence (haplotype) in the stream
537 Returns : Bio::Seq object or void if end of file
538 Args : NONE
539 Note : This function is included only to conform to convention. The
540 returned Bio::Seq object holds a halpotype in coded form. Use the hash
541 returned by get_base_conversion_table() to convert 'A', 'T', 'C', 'G'
542 back into 1,2,4 and 5. Use get_next_hap() to retrieve the halptoype as
543 a string of 1,2,4 and 5s instead.
545 =cut
547 sub get_next_seq {
548 my $self = shift;
549 my $seqstring = $self->get_next_hap;
551 return unless ($seqstring);
553 # Used to create unique ID;
554 my $run = $self->get_last_haps_run_num;
556 # Converting numbers to letters so that the haplotypes can be stored as a
557 # seq object
558 my $rh_base_conversion_table = $self->get_base_conversion_table;
559 foreach my $base ( keys %{$rh_base_conversion_table} ) {
560 $seqstring =~ s/($rh_base_conversion_table->{$base})/$base/g;
563 # Fill in non-variable positions
564 my $segsites = $self->get_current_run_segsites;
565 my $n_sites = $self->get_n_sites;
566 if ( defined($n_sites) ) {
568 # make sure that n_sites is at least as large
569 # as segsites for each run. Throw an exception otherwise.
570 $self->throw( "n_sites:\t$n_sites"
571 . "\nsegsites:\t$segsites"
572 . "\nrun:\t$run"
573 . "\nn_sites needs to be at least the number of segsites of every run"
574 ) unless $segsites <= $n_sites;
576 my $seq_len = 0;
577 my @seq;
578 my @pos = $self->get_Positions;
579 for ( my $i = 0 ; $i <= $#pos ; $i++ ) {
580 $pos[$i] *= $n_sites;
581 my $rpt = $pos[$i] - 1 - $seq_len;
582 $rpt = $rpt >= 1 ? $rpt : 0;
583 push( @seq, "A" x ( $rpt ) );
584 $seq_len += length( $seq[-1] );
585 push( @seq, substr( $seqstring, $i, 1 ) );
586 $seq_len += length( $seq[-1] );
588 push( @seq, "A" x ( $n_sites - $seq_len ) );
589 $seqstring = join( "", @seq );
592 my $last_read_hap = $self->get_last_read_hap_num;
593 my $id = 'Hap_' . $last_read_hap . '_Run_' . $run;
594 my $description =
595 "Segsites $segsites;"
596 . " Positions "
597 . ( defined $n_sites ? $n_sites : $segsites ) . ";"
598 . " Haplotype $last_read_hap;"
599 . " Run $run;";
600 my $seq = $self->sequence_factory->create(
601 -seq => $seqstring,
602 -id => $id,
603 -desc => $description,
604 -alphabet => q(dna),
605 -direct => 1,
608 return $seq
612 =head3 next_seq
614 Title : next_seq
615 Usage : $seq = $stream->next_seq()
616 Function: Alias to get_next_seq()
617 Returns : Bio::Seq object or void if end of file
618 Args : NONE
619 Note : This function is only included for convention. It calls get_next_seq().
620 See get_next_seq() for details.
622 =cut
624 sub next_seq {
625 my $self = shift;
626 return $self->get_next_seq();
629 =head3 get_next_hap
631 Title : get_next_hap
632 Usage : $hap = $stream->next_hap()
633 Function: reads and returns the next sequence (haplotype) in the stream.
634 Returns undef if all sequences in stream have been read.
635 Returns : Haplotype string (e.g. '110110000101101045454000101'
636 Args : NONE
637 Note : Use get_next_seq() if you want the halpotype returned as a
638 Bio::Seq object.
640 =cut
642 sub get_next_hap {
643 my $self = shift;
645 # Let's figure out how many haps to read from the input file so that
646 # we get back to the beginning of the next run.
648 my $end_run = 0;
649 if ( $self->{TOT_RUN_HAPS} == $self->{LAST_READ_HAP_NUM} + 1 ) {
650 $end_run = 1;
653 # Setting last_haps_run_num
654 $self->{LAST_HAPS_RUN_NUM} = $self->get_next_run_num;
656 my $last_read_hap = $self->get_last_read_hap_num;
657 my ($seqstring) =
658 $self->_get_next_clean_hap( $self->{_filehandle}, 1, $end_run );
659 if ( !defined $seqstring && $last_read_hap < $self->get_tot_haps ) {
660 $self->throw(
661 "msout file has only $last_read_hap hap(s), which is less than indicated in msinfo line ( "
662 . $self->get_tot_haps
663 . " )" );
666 return $seqstring;
669 =head3 get_next_pop
671 Title : get_next_pop
672 Usage : @seqs = $stream->next_pop()
673 Function: reads and returns all the remaining sequences (haplotypes) in the
674 population of the next sequence. Returns an empty list if no more
675 haps remain to be read in the stream
676 Returns : array of Bio::Seq objects
677 Args : NONE
679 =cut
681 sub get_next_pop {
682 my $self = shift;
684 # Let's figure out how many haps to read from the input file so that
685 # we get back to the beginning of the next run.
687 my @pops = $self->get_Pops;
688 my @seqs; # holds Bio::Seq objects to return
690 # Determine number of the pop that the next hap will be taken from
691 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
693 # If $haps_to_pull == 0, then we need to pull the whole population
694 if ( $haps_to_pull == 0 ) {
695 $haps_to_pull = $pops[ $next_haps_pop_num - 1 ];
698 for ( 1 .. $haps_to_pull ) {
699 my $seq = $self->get_next_seq;
700 next unless defined $seq;
702 # Add Population number information to description
703 $seq->display_id(" Population number $next_haps_pop_num;");
704 push @seqs, $seq;
707 return @seqs;
710 =head3 next_run
712 Title : next_run
713 Usage : @seqs = $stream->next_run()
714 Function: reads and returns all the remaining sequences (haplotypes) in the ms
715 run of the next sequence. Returns an empty list if all haps have been
716 read from the stream.
717 Returns : array of Bio::Seq objects
718 Args : NONE
720 =cut
722 sub get_next_run {
723 my $self = shift;
725 # Let's figure out how many haps to read from the input file so that
726 # we get back to the beginning of the next run.
728 my ( $next_haps_pop_num, $haps_to_pull ) = $self->get_next_haps_pop_num;
729 my @seqs;
731 my @pops = $self->get_Pops;
733 foreach ( $next_haps_pop_num .. $#pops ) {
734 $haps_to_pull += $pops[$_];
737 # Read those haps from the input file
738 # Next hap read will be the first hap of the first pop of the next run.
740 for ( 1 .. $haps_to_pull ) {
741 my $seq = $self->get_next_seq;
742 next unless defined $seq;
744 push @seqs, $seq;
747 return @seqs;
750 =head2 Methods to Retrieve Constants
752 =head3 base_conversion_table
754 Title : get_base_conversion_table
755 Usage : $table_hash_ref = $stream->get_base_conversion_table()
756 Function: returns a reference to a hash. The keys of the hash are the letters '
757 A','T','G','C'. The values associated with each key are the value that
758 each letter in the sequence of a seq object returned by a
759 Bio::SeqIO::msout stream should be translated to.
760 Returns : reference to a hash
761 Args : NONE
762 Synopsis:
764 # retrieve the Bio::Seq object's sequence
765 my $haplotype = $seq->seq;
767 # need to convert all letters to their corresponding numbers.
768 foreach my $base (keys %{$rh_base_conversion_table}){
769 $haplotype =~ s/($base)/$rh_base_conversion_table->{$base}/g;
772 # $haplotype is now an ms style haplotype. (e.g. '100101101455')
774 =cut
776 sub get_base_conversion_table {
777 my $self = shift;
778 return $self->{BASE_CONVERSION_TABLE_HASH_REF};
781 ##############################################################################
782 ## subs for internal use only
783 ##############################################################################
785 sub _get_next_clean_hap {
787 #By Warren Kretzschmar
789 # return the next non-empty line from file handle (chomped line)
790 # skipps to the next run if '//' is encountered
791 my ( $self, $fh, $times, $end_run ) = @_;
792 my @data;
794 unless ( ref($fh) eq q(GLOB) ) { return; }
796 unless ( defined $times && $times > 0 ) {
797 $times = 1;
800 if ( defined $self->{BUFFER_HAP} ) {
801 push @data, $self->{BUFFER_HAP};
802 $self->{BUFFER_HAP} = undef;
803 $self->{LAST_READ_HAP_NUM}++;
804 $times--;
807 while ( 1 <= $times-- ) {
809 # Find next clean line
810 my $data = <$fh>;
811 last if !defined($data);
812 chomp $data;
813 while ( $data !~ /./ ) {
814 $data = <$fh>;
815 chomp $data;
818 # If the next run is encountered here, then we have a programming
819 # or format error
820 if ( $data eq '//' ) { $self->throw("'//' found when not expected\n") }
822 $self->{LAST_READ_HAP_NUM}++;
823 push @data, $data;
826 if ($end_run) {
827 $self->_load_run_info($fh);
830 return (@data);
833 sub _load_run_info {
835 my ( $self, $fh ) = @_;
837 my $data = <$fh>;
839 # getting rid of excess newlines
840 while ( defined($data) && $data !~ /./ ) {
841 $data = <$fh>;
844 # In this case we are at EOF
845 if ( !defined($data) ) { $self->{NEXT_RUN_NUM} = undef; return; }
847 while ( $data !~ /./ ) {
848 $data = <$fh>;
849 chomp $data;
852 chomp $data;
854 # If the next run is encountered, then skip to the next hap and save it in
855 # the buffer.
856 if ( $data eq '//' ) {
857 $self->{NEXT_RUN_NUM}++;
858 $self->{LAST_READ_HAP_NUM} = 0;
859 for ( 1 .. 3 ) {
860 $data = <$fh>;
861 while ( $data !~ /./ ) {
862 $data = <$fh>;
863 chomp $data;
865 chomp $data;
867 if ( $_ eq '1' ) {
868 my @sites = split( /\s+/, $data );
869 $self->{LAST_READ_SEGSITES} = $sites[1];
871 elsif ( $_ eq '2' ) {
872 my @positions = split( /\s+/, $data );
873 shift @positions;
874 $self->{LAST_READ_POSITIONS} = \@positions;
876 else {
877 if ( !defined($data) ) {
878 $self->throw("run $self->{NEXT_RUN_NUM} has no haps./n");
880 $self->{BUFFER_HAP} = $data;
884 else {
885 $self->throw(
886 "'//' not encountered when expected. There are more haplos in one of the msOUT runs than advertised in the msinfo line."