sync w/ main trunk
[bioperl-live.git] / Bio / Factory / FTLocationFactory.pm
blob783431456eb3de8a3ee9793ca4a44f5f3dcfe033
1 # $Id$
3 # BioPerl module for Bio::Factory::FTLocationFactory
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Hilmar Lapp <hlapp at gmx.net>
9 # Copyright Hilmar Lapp
11 # You may distribute this module under the same terms as perl itself
13 # (c) Hilmar Lapp, hlapp at gnf.org, 2002.
14 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
16 # You may distribute this module under the same terms as perl itself.
17 # Refer to the Perl Artistic License (see the license accompanying this
18 # software package, or see http://www.perl.com/language/misc/Artistic.html)
19 # for the terms under which you may use, modify, and redistribute this module.
21 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
22 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
23 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
26 # POD documentation - main docs before the code
28 =head1 NAME
30 Bio::Factory::FTLocationFactory - A FeatureTable Location Parser
32 =head1 SYNOPSIS
34 # parse a string into a location object
35 $loc = Bio::Factory::FTLocationFactory->from_string("join(100..200,
36 400..500");
38 =head1 DESCRIPTION
40 Implementation of string-encoded location parsing for the Genbank feature
41 table encoding of locations.
43 =head1 FEEDBACK
45 =head2 Mailing Lists
47 User feedback is an integral part of the evolution of this and other
48 Bioperl modules. Send your comments and suggestions preferably to
49 the Bioperl mailing list. Your participation is much appreciated.
51 bioperl-l@bioperl.org - General discussion
52 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
54 =head2 Support
56 Please direct usage questions or support issues to the mailing list:
58 L<bioperl-l@bioperl.org>
60 rather than to the module maintainer directly. Many experienced and
61 reponsive experts will be able look at the problem and quickly
62 address it. Please include a thorough description of the problem
63 with code and data examples if at all possible.
65 =head2 Reporting Bugs
67 Report bugs to the Bioperl bug tracking system to help us keep track
68 of the bugs and their resolution. Bug reports can be submitted via the
69 web:
71 http://bugzilla.open-bio.org/
73 =head1 AUTHOR - Hilmar Lapp
75 Email hlapp at gmx.net
77 =head1 CONTRIBUTORS
79 Jason Stajich, jason-at-bioperl-dot-org
80 Chris Fields, cjfields-at-uiuc-dot-edu
82 =head1 APPENDIX
84 The rest of the documentation details each of the object methods.
85 Internal methods are usually preceded with a _
87 =cut
90 # Let the code begin...
92 package Bio::Factory::FTLocationFactory;
93 use vars qw($LOCREG);
94 use strict;
96 # Object preamble - inherits from Bio::Root::Root
98 use Bio::Location::Simple;
99 use Bio::Location::Split;
100 use Bio::Location::Fuzzy;
103 use base qw(Bio::Root::Root Bio::Factory::LocationFactoryI);
105 BEGIN {
106 # the below is an optimized regex obj. from J. Freidl's Mastering Reg Exp.
107 $LOCREG = qr{
109 [^()]+
112 (??{$LOCREG})
115 }x;
118 =head2 new
120 Title : new
121 Usage : my $obj = Bio::Factory::FTLocationFactory->new();
122 Function: Builds a new Bio::Factory::FTLocationFactory object
123 Returns : an instance of Bio::Factory::FTLocationFactory
124 Args :
126 =cut
128 =head2 from_string
130 Title : from_string
131 Usage : $loc = $locfactory->from_string("100..200");
132 Function: Parses the given string and returns a Bio::LocationI implementing
133 object representing the location encoded by the string.
135 This implementation parses the Genbank feature table
136 encoding of locations.
137 Example :
138 Returns : A Bio::LocationI implementing object.
139 Args : A string.
141 =cut
143 sub from_string {
144 my ($self,$locstr,$op) = @_;
145 my $loc;
147 #$self->debug("$locstr\n");
149 # $op for operator (error handling)
151 # run on first pass only
152 # Note : These location types are now deprecated in GenBank (Oct. 2006)
153 if (!defined($op)) {
154 # convert all (X.Y) to [X.Y]
155 $locstr =~ s{\((\d+\.\d+)\)}{\[$1\]}g;
156 # convert ABC123:(X..Y) to ABC123:[X..Y]
157 # we should never see the above
158 $locstr =~ s{:\((\d+\.{2}\d+)\)}{:\[$1\]}g;
161 if ($locstr =~ m{(.*?)\(($LOCREG)\)(.*)}o) { # any matching parentheses?
163 my ($beg, $mid, $end) = ($1, $2, $3);
164 my (@sublocs) = (split(q(,),$beg), $mid, split(q(,),$end));
166 my @loc_objs;
167 my $loc_obj;
169 SUBLOCS:
170 while (@sublocs) {
171 my $subloc = shift @sublocs;
172 next if !$subloc;
173 my $oparg = ($subloc eq 'join' || $subloc eq 'bond' ||
174 $subloc eq 'order' || $subloc eq 'complement') ? $subloc : undef;
175 # has operator, requires further work (recurse)
176 if ($oparg) {
177 my $sub = shift @sublocs;
178 # simple split operators (no recursive calls needed)
179 if (($oparg eq 'join' || $oparg eq 'order' || $oparg eq 'bond' )
180 && $sub !~ m{(?:join|order|bond)}) {
181 my @splitlocs = split(q(,), $sub);
182 $loc_obj = Bio::Location::Split->new(-verbose => 1,
183 -splittype => $oparg);
184 while (my $splitloc = shift @splitlocs) {
185 next unless $splitloc;
186 my $sobj;
187 if ($splitloc =~ m{\(($LOCREG)\)}) {
188 my $comploc = $1;
189 $sobj = $self->_parse_location($comploc);
190 $sobj->strand(-1);
191 } else {
192 $sobj = $self->_parse_location($splitloc);
194 $loc_obj->add_sub_Location($sobj);
196 } else {
197 $loc_obj = $self->from_string($sub, $oparg);
198 # reinsure the operator is set correctly for this level
199 # unless it is complement
200 $loc_obj->splittype($oparg) unless $oparg eq 'complement';
203 # no operator, simple or fuzzy
204 else {
205 $loc_obj = $self->from_string($subloc,1);
207 $loc_obj->strand(-1) if ($op && $op eq 'complement');
208 push @loc_objs, $loc_obj;
210 my $ct = @loc_objs;
211 if ($op && !($op eq 'join' || $op eq 'order' || $op eq 'bond')
212 && $ct > 1 ) {
213 $self->throw("Bad operator $op: had multiple locations ".
214 scalar(@loc_objs).", should be SplitLocationI");
216 if ($ct > 1) {
217 $loc = Bio::Location::Split->new();
218 $loc->add_sub_Location(shift @loc_objs) while (@loc_objs);
219 return $loc;
220 } else {
221 $loc = shift @loc_objs;
222 return $loc;
224 } else { # simple location(s)
225 $loc = $self->_parse_location($locstr);
226 $loc->strand(-1) if ($op && $op eq 'complement');
228 return $loc;
231 =head2 _parse_location
233 Title : _parse_location
234 Usage : $loc = $locfactory->_parse_location( $loc_string)
236 Function: Parses the given location string and returns a location object
237 with start() and end() and strand() set appropriately.
238 Note that this method is private.
239 Returns : A Bio::LocationI implementing object or undef on failure
240 Args : location string
242 =cut
244 sub _parse_location {
245 my ($self, $locstr) = @_;
246 my ($loc, $seqid);
247 #$self->debug( "Location parse, processing $locstr\n");
248 # 'remote' location?
249 if($locstr =~ m{^(\S+):(.*)$}o) {
250 # yes; memorize remote ID and strip from location string
251 $seqid = $1;
252 $locstr = $2;
255 # split into start and end
256 my ($start, $end) = split(/\.\./, $locstr);
257 # remove enclosing parentheses if any; note that because of parentheses
258 # possibly surrounding the entire location the parentheses around start
259 # and/or may be asymmetrical
260 # Note: these are from X.Y fuzzy locations, which are deprecated!
261 $start =~ s/(?:^\[+|\]+$)//g if $start;
262 $end =~ s/(?:^\[+|\]+$)//g if $end;
264 # Is this a simple (exact) or a fuzzy location? Simples have exact start
265 # and end, or is between two adjacent bases. Everything else is fuzzy.
266 my $loctype = ".."; # exact with start and end as default
268 $loctype = '?' if ( ($locstr =~ /\?/) && ($locstr !~ /\?\d+/) );
270 my $locclass = "Bio::Location::Simple";
271 if(! defined($end)) {
272 if($locstr =~ /(\d+)([\.\^])(\d+)/) {
273 $start = $1;
274 $end = $3;
275 $loctype = $2;
276 $locclass = "Bio::Location::Fuzzy"
277 unless (abs($end-$start) <= 1) && ($loctype eq "^");
278 } else {
279 $end = $start;
282 # start_num and end_num are for the numeric only versions of
283 # start and end so they can be compared
284 # in a few lines
285 my ($start_num, $end_num) = ($start,$end);
286 if ( ($start =~ /[\>\<\?\.\^]/) || ($end =~ /[\>\<\?\.\^]/) ) {
287 $locclass = 'Bio::Location::Fuzzy';
288 if($start =~ /(\d+)/) {
289 ($start_num) = $1;
290 } else {
291 $start_num = 0
293 if ($end =~ /(\d+)/) {
294 ($end_num) = $1;
295 } else { $end_num = 0 }
297 my $strand = 1;
299 if( $start_num > $end_num && $loctype ne '?') {
300 ($start,$end,$strand) = ($end,$start,-1);
302 # instantiate location and initialize
303 $loc = $locclass->new(-verbose => $self->verbose,
304 -start => $start,
305 -end => $end,
306 -strand => $strand,
307 -location_type => $loctype);
308 # set remote ID if remote location
309 if($seqid) {
310 $loc->is_remote(1);
311 $loc->seq_id($seqid);
314 # done (hopefully)
315 return $loc;