squash waffling test
[bioperl-live.git] / Bio / Location / Fuzzy.pm
blobfa81669390960da42230d0efb33b7b58e7c9ceb4
2 # BioPerl module for Bio::Location::Fuzzy
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Jason Stajich <jason@bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::Location::Fuzzy - Implementation of a Location on a Sequence
15 which has unclear start and/or end locations
17 =head1 SYNOPSIS
19 use Bio::Location::Fuzzy;
20 my $fuzzylocation = Bio::Location::Fuzzy->new(
21 -start => '<30',
22 -end => 90,
23 -location_type => '..');
25 print "location string is ", $fuzzylocation->to_FTstring(), "\n";
26 print "location is of the type ", $fuzzylocation->location_type, "\n";
28 =head1 DESCRIPTION
30 This module contains the necessary methods for representing a
31 Fuzzy Location, one that does not have clear start and/or end points.
32 This will initially serve to handle features from Genbank/EMBL feature
33 tables that are written as 1^100 meaning between bases 1 and 100 or
34 E<lt>100..300 meaning it starts somewhere before 100. Advanced
35 implementations of this interface may be able to handle the necessary
36 logic of overlaps/intersection/contains/union. It was constructed to
37 handle fuzzy locations that can be represented in Genbank/EMBL and
38 Swissprot.
40 =head1 FEEDBACK
42 User feedback is an integral part of the evolution of this and other
43 Bioperl modules. Send your comments and suggestions preferably to one
44 of the Bioperl mailing lists. Your participation is much appreciated.
46 bioperl-l@bioperl.org - General discussion
47 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
49 =head2 Support
51 Please direct usage questions or support issues to the mailing list:
53 I<bioperl-l@bioperl.org>
55 rather than to the module maintainer directly. Many experienced and
56 reponsive experts will be able look at the problem and quickly
57 address it. Please include a thorough description of the problem
58 with code and data examples if at all possible.
60 =head2 Reporting Bugs
62 Report bugs to the Bioperl bug tracking system to help us keep track
63 the bugs and their resolution. Bug reports can be submitted via the
64 web:
66 https://github.com/bioperl/bioperl-live/issues
68 =head1 AUTHOR - Jason Stajich
70 Email jason-at-bioperl-dot-org
72 =head1 APPENDIX
74 The rest of the documentation details each of the object
75 methods. Internal methods are usually preceded with a _
77 =cut
79 # Let the code begin...
81 package Bio::Location::Fuzzy;
82 use strict;
84 use base qw(Bio::Location::Atomic Bio::Location::FuzzyLocationI);
86 our @LOCATIONCODESBSANE = (undef, 'EXACT', 'WITHIN', 'BETWEEN', 'UNCERTAIN',
87 'BEFORE', 'AFTER');
89 our %FUZZYCODES = ( 'EXACT' => '..', # Position is 'exact
90 # Exact position is unknown, but is within the range specified, ((1.2)..100)
91 'WITHIN' => '.',
92 # 1^2
93 'BETWEEN' => '^',
94 'IN-BETWEEN' => '^',
95 'UNCERTAIN' => '?',
96 # <100
97 'BEFORE' => '<',
98 # >10
99 'AFTER' => '>');
101 # The following regular expressions map to fuzzy location types. Every
102 # expression must match the complete encoded point string, and must
103 # contain two groups identifying min and max. Empty matches are automatic.
104 # converted to undef, except for 'EXACT', for which max is set to equal
105 # min.
107 our %FUZZYPOINTENCODE = (
108 '\>(\d+)(.{0})' => 'AFTER',
109 '\<(.{0})(\d+)' => 'BEFORE',
110 '(\d+)' => 'EXACT',
111 '\?(\d*)' => 'UNCERTAIN',
112 '(\d+)(.{0})\>' => 'AFTER',
113 '(.{0})(\d+)\<' => 'BEFORE',
114 '(\d+)\.(\d+)' => 'WITHIN',
115 '(\d+)\^(\d+)' => 'BETWEEN',
118 our %FUZZYRANGEENCODE = ( '\.' => 'WITHIN',
119 '\.\.' => 'EXACT',
120 '\^' => 'IN-BETWEEN' );
122 =head2 new
124 Title : new
125 Usage : my $fuzzyloc = Bio::Location::Fuzzy->new( @args);
126 Function:
127 Returns :
128 Args : -start => value for start (initialize by superclass)
129 -end => value for end (initialize by superclass)
130 -strand => value for strand (initialize by superclass)
131 -location_type => either ('EXACT','WITHIN','IN-BETWEEN',
132 'UNCERTAIN') OR ( 1,2,3,4)
133 -start_ext=> extension for start - defaults to 0,
134 -start_fuz= fuzzy code for start can be
135 ('EXACT','WITHIN','BETWEEN','BEFORE','AFTER',
136 'UNCERTAIN' ) OR
137 a value 1 - 5 corresponding to index+1 above
138 -end_ext=> extension for end - defaults to 0,
139 -end_fuz= fuzzy code for end can be
140 ('EXACT','WITHIN','BETWEEN','BEFORE','AFTER',
141 'UNCERTAIN') OR
142 a value 1 - 5 corresponding to index+1 above
144 =cut
146 sub new {
147 my ($class, @args) = @_;
148 my $self = $class->SUPER::new(@args);
149 my ($location_type, $start_ext, $start_fuz, $end_ext, $end_fuz) =
150 $self->_rearrange([ qw(LOCATION_TYPE START_EXT START_FUZ
151 END_EXT END_FUZ )
152 ], @args);
154 $location_type && $self->location_type($location_type);
155 $start_ext && $self->max_start($self->min_start + $start_ext);
156 $end_ext && $self->max_end($self->min_end + $end_ext);
157 $start_fuz && $self->start_pos_type($start_fuz);
158 $end_fuz && $self->end_pos_type($end_fuz);
160 return $self;
163 =head2 location_type
165 Title : location_type
166 Usage : my $location_type = $location->location_type();
167 Function: Get location type encoded as text
168 Returns : string ('EXACT', 'WITHIN', 'IN-BETWEEN', 'UNCERTAIN')
169 Args : none
171 =cut
173 sub location_type {
174 my ($self,$value) = @_;
175 if( defined $value || ! defined $self->{'_location_type'} ) {
176 $value = 'EXACT' unless defined $value;
177 if(! defined $FUZZYCODES{$value} ) {
178 $value = uc($value);
179 if( $value =~ /\.\./ ) {
180 $value = 'EXACT';
181 } elsif( $value =~ /^\.$/ ) {
182 $value = 'WITHIN';
183 } elsif( $value =~ /\^/ ) {
184 $value = 'IN-BETWEEN';
185 $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [".
186 $self->start. "] and [". $self->end. "]")
187 if defined $self->start && defined $self->end &&
188 ($self->end - 1 == $self->start);
189 } elsif( $value =~ /\?/ ) {
190 $value = 'UNCERTAIN';
191 } elsif( $value ne 'EXACT' && $value ne 'WITHIN' &&
192 $value ne 'IN-BETWEEN' ) {
193 $self->throw("Did not specify a valid location type");
196 $self->{'_location_type'} = $value;
198 return $self->{'_location_type'};
201 =head1 LocationI methods
203 =head2 length
205 Title : length
206 Usage : $length = $fuzzy_loc->length();
207 Function: Get the length of this location.
209 Note that the length of a fuzzy location will always depend
210 on the currently active interpretation of start and end. The
211 result will therefore vary for different CoordinatePolicy objects.
213 Returns : an integer
214 Args : none
216 =cut
218 #sub length {
219 # my($self) = @_;
220 # return $self->SUPER::length() if( !$self->start || !$self->end);
221 # $self->warn('Length is not valid for a FuzzyLocation');
222 # return 0;
225 =head2 start
227 Title : start
228 Usage : $start = $fuzzy->start();
229 Function: get/set start of this range, handling fuzzy_starts
230 Returns : a positive integer representing the start of the location
231 Args : start location on set (can be fuzzy point string)
233 =cut
235 sub start {
236 my($self,$value) = @_;
237 if( defined $value ) {
238 my ($encode,$min,$max) = $self->_fuzzypointdecode($value);
239 $self->start_pos_type($encode);
240 $self->min_start($min);
241 $self->max_start($max);
244 $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations ["
245 . $self->SUPER::start. "] and [". $self->SUPER::end. "]")
246 if $self->location_type eq 'IN-BETWEEN' && defined $self->SUPER::end &&
247 ($self->SUPER::end - 1 == $self->SUPER::start);
249 return $self->SUPER::start();
252 =head2 end
254 Title : end
255 Usage : $end = $fuzzy->end();
256 Function: get/set end of this range, handling fuzzy_ends
257 Returns : a positive integer representing the end of the range
258 Args : end location on set (can be fuzzy string)
260 =cut
262 sub end {
263 my($self,$value) = @_;
264 if( defined $value ) {
265 my ($encode,$min,$max) = $self->_fuzzypointdecode($value);
266 $self->end_pos_type($encode);
267 $self->min_end($min);
268 $self->max_end($max);
271 $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [".
272 $self->SUPER::start. "] and [". $self->SUPER::end. "]")
273 if $self->location_type eq 'IN-BETWEEN' && defined $self->SUPER::start &&
274 ($self->SUPER::end - 1 == $self->SUPER::start);
276 return $self->SUPER::end();
279 =head2 min_start
281 Title : min_start
282 Usage : $min_start = $fuzzy->min_start();
283 Function: get/set the minimum starting point
284 Returns : the minimum starting point from the contained sublocations
285 Args : integer or undef on set
287 =cut
289 sub min_start {
290 my ($self,@args) = @_;
292 if(@args) {
293 $self->{'_min_start'} = $args[0]; # the value may be undef!
295 return $self->{'_min_start'};
298 =head2 max_start
300 Title : max_start
301 Usage : my $maxstart = $location->max_start();
302 Function: Get/set maximum starting location of feature startpoint
303 Returns : integer or undef if no maximum starting point.
304 Args : integer or undef on set
306 =cut
308 sub max_start {
309 my ($self,@args) = @_;
311 if(@args) {
312 $self->{'_max_start'} = $args[0]; # the value may be undef!
314 return $self->{'_max_start'};
317 =head2 start_pos_type
319 Title : start_pos_type
320 Usage : my $start_pos_type = $location->start_pos_type();
321 Function: Get/set start position type.
322 Returns : type of position coded as text
323 ('BEFORE','AFTER','EXACT','WITHIN','BETWEEN','UNCERTAIN')
324 Args : a string on set
326 =cut
328 sub start_pos_type {
329 my ($self,$value) = @_;
330 if(defined $value && $value =~ /^\d+$/ ) {
331 if( $value == 0 ) { $value = 'EXACT'; }
332 else {
333 my $v = $LOCATIONCODESBSANE[$value];
334 if( ! defined $v ) {
335 $self->warn("Provided value $value which I don't understand,".
336 " reverting to 'EXACT'");
337 $v = 'EXACT';
339 $value = $v;
342 if(defined($value)) {
343 $self->{'_start_pos_type'} = $value;
345 return $self->{'_start_pos_type'};
348 =head2 min_end
350 Title : min_end
351 Usage : my $minend = $location->min_end();
352 Function: Get/set minimum ending location of feature endpoint
353 Returns : integer or undef if no minimum ending point.
354 Args : integer or undef on set
356 =cut
358 sub min_end {
359 my ($self,@args) = @_;
361 if(@args) {
362 $self->{'_min_end'} = $args[0]; # the value may be undef!
364 return $self->{'_min_end'};
367 =head2 max_end
369 Title : max_end
370 Usage : my $maxend = $location->max_end();
371 Function: Get/set maximum ending location of feature endpoint
372 Returns : integer or undef if no maximum ending point.
373 Args : integer or undef on set
375 =cut
377 sub max_end {
378 my ($self,@args) = @_;
380 if(@args) {
381 $self->{'_max_end'} = $args[0]; # the value may be undef!
383 return $self->{'_max_end'};
386 =head2 end_pos_type
388 Title : end_pos_type
389 Usage : my $end_pos_type = $location->end_pos_type();
390 Function: Get/set end position type.
391 Returns : type of position coded as text
392 ('BEFORE','AFTER','EXACT','WITHIN','BETWEEN','UNCERTAIN')
393 Args : a string on set
395 =cut
397 sub end_pos_type {
398 my ($self,$value) = @_;
399 if( defined $value && $value =~ /^\d+$/ ) {
400 if( $value == 0 ) { $value = 'EXACT'; }
401 else {
402 my $v = $LOCATIONCODESBSANE[$value];
403 if( ! defined $v ) {
404 $self->warn("Provided value $value which I don't understand,".
405 " reverting to 'EXACT'");
406 $v = 'EXACT';
408 $value = $v;
412 if(defined($value)) {
413 $self->{'_end_pos_type'} = $value;
415 return $self->{'_end_pos_type'};
418 =head2 seq_id
420 Title : seq_id
421 Usage : my $seqid = $location->seq_id();
422 Function: Get/Set seq_id that location refers to
423 Returns : seq_id
424 Args : [optional] seq_id value to set
426 =cut
428 =head2 coordinate_policy
430 Title : coordinate_policy
432 Usage : $policy = $location->coordinate_policy();
433 $location->coordinate_policy($mypolicy); # set may not be possible
434 Function: Get the coordinate computing policy employed by this object.
436 See Bio::Location::CoordinatePolicyI for documentation about
437 the policy object and its use.
439 The interface *does not* require implementing classes to accept
440 setting of a different policy. The implementation provided here
441 does, however, allow to do so.
443 Implementors of this interface are expected to initialize every
444 new instance with a CoordinatePolicyI object. The implementation
445 provided here will return a default policy object if none has
446 been set yet. To change this default policy object call this
447 method as a class method with an appropriate argument. Note that
448 in this case only subsequently created Location objects will be
449 affected.
451 Returns : A Bio::Location::CoordinatePolicyI implementing object.
452 Args : On set, a Bio::Location::CoordinatePolicyI implementing object.
454 See L<Bio::Location::CoordinatePolicyI>
456 =cut
458 =head2 to_FTstring
460 Title : to_FTstring
461 Usage : my $locstr = $location->to_FTstring()
462 Function: Get/Set seq_id that location refers to
463 Returns : seq_id
464 Args : [optional] seq_id value to set
466 =cut
468 sub to_FTstring {
469 my ($self) = @_;
470 my (%vals) = ( 'start' => $self->start,
471 'min_start' => $self->min_start,
472 'max_start' => $self->max_start,
473 'start_code' => $self->start_pos_type,
474 'end' => $self->end,
475 'min_end' => $self->min_end,
476 'max_end' => $self->max_end,
477 'end_code' => $self->end_pos_type );
479 my (%strs) = ( 'start' => '',
480 'end' => '');
481 my ($delimiter) = $FUZZYCODES{$self->location_type};
482 $delimiter = $FUZZYCODES{'EXACT'} if ($self->location_type eq 'UNCERTAIN');
484 my $policy = ref($self->coordinate_policy);
486 # I'm lazy, lets do this in a loop since behaviour will be the same for
487 # start and end
488 # The CoordinatePolicy now dictates start/end data here (bug 992) - cjf
489 foreach my $point ( qw(start end) ) {
490 if( ($vals{$point."_code"} ne 'EXACT') &&
491 ($vals{$point."_code"} ne 'UNCERTAIN') ) {
493 # must have max and min defined to use 'WITHIN', 'BETWEEN'
494 if ((!defined $vals{"min_$point"} ||
495 !defined $vals{"max_$point"}) &&
496 ( $vals{$point."_code"} eq 'WITHIN' ||
497 $vals{$point."_code"} eq 'BETWEEN'))
499 $vals{"min_$point"} = '' unless defined $vals{"min_$point"};
500 $vals{"max_$point"} = '' unless defined $vals{"max_$point"};
502 $self->warn("Fuzzy codes for start are in a strange state, (".
503 join(",", ($vals{"min_$point"},
504 $vals{"max_$point"},
505 $vals{$point."_code"})). ")");
506 return '';
509 if (defined $vals{$point."_code"} &&
510 ($vals{$point."_code"} eq 'BEFORE' ||
511 $vals{$point."_code"} eq 'AFTER'))
513 $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}};
514 $strs{$point} .= $vals{"$point"};
517 if( defined $vals{$point."_code"} &&
518 ($vals{$point."_code"} eq 'WITHIN' ||
519 $vals{$point."_code"} eq 'BETWEEN'))
521 # Expect odd results with anything but WidestCoordPolicy for now
522 $strs{$point} .= ($point eq 'start') ?
523 $vals{"$point"}.
524 $FUZZYCODES{$vals{$point."_code"}}.
525 $vals{'max_'.$point}
527 $vals{'min_'.$point}.
528 $FUZZYCODES{$vals{$point."_code"}}.
529 $vals{"$point"};
530 $strs{$point} = "(".$strs{$point}.")";
533 } elsif ($vals{$point."_code"} eq 'UNCERTAIN') {
534 $strs{$point} = $FUZZYCODES{$vals{$point."_code"}};
535 $strs{$point} .= $vals{$point} if defined $vals{$point};
536 } else {
537 $strs{$point} = $vals{$point};
541 my $str = $strs{'start'} . $delimiter . $strs{'end'};
542 if($self->is_remote() && $self->seq_id()) {
543 $str = $self->seq_id() . ":" . $str;
545 if( defined $self->strand &&
546 $self->strand == -1 &&
547 $self->location_type() ne "UNCERTAIN") {
548 $str = "complement(" . $str . ")";
549 } elsif($self->location_type() eq "WITHIN") {
550 $str = "(".$str.")";
552 return $str;
555 =head2 valid_Location
557 Title : valid_Location
558 Usage : if ($location->valid_location) {...};
559 Function: boolean method to determine whether location is considered valid
560 (has minimum requirements for Simple implementation)
561 Returns : Boolean value: true if location is valid, false otherwise
562 Args : none
564 =cut
566 =head2 _fuzzypointdecode
568 Title : _fuzzypointdecode
569 Usage : ($type,$min,$max) = $self->_fuzzypointdecode('<5');
570 Function: Decode a fuzzy string.
571 Returns : A 3-element array consisting of the type of location, the
572 minimum integer, and the maximum integer describing the range
573 of coordinates this start or endpoint refers to. Minimum or
574 maximum coordinate may be undefined.
575 : Returns empty array on fail.
576 Args : fuzzypoint string
578 =cut
580 sub _fuzzypointdecode {
581 my ($self, $string) = @_;
582 return () if( !defined $string);
583 # strip off leading and trailing space
584 $string =~ s/^\s*(\S+)\s*/$1/;
585 foreach my $pattern ( keys %FUZZYPOINTENCODE ) {
586 if( $string =~ /^$pattern$/ ) {
587 my ($min,$max) = ($1,$2) unless (($1 eq '') && (!defined $2));
588 if( ($FUZZYPOINTENCODE{$pattern} eq 'EXACT') ||
589 ($FUZZYPOINTENCODE{$pattern} eq 'UNCERTAIN')
591 $max = $min;
592 } else {
593 $max = undef if((defined $max) && (length($max) == 0));
594 $min = undef if((defined $min) && (length($min) == 0));
596 return ($FUZZYPOINTENCODE{$pattern},$min,$max);
599 if( $self->verbose >= 1 ) {
600 $self->warn("could not find a valid fuzzy encoding for $string");
602 return ();