use 'latest' library versions for maximum lifespan as an example
[gae-samples.git] / 24hrsinsf / geobox.py
blob392ab314b6a633658bda860b7f29504e08667556
1 #!/usr/bin/env python
3 # Copyright 2008 Brett Slatkin
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 """Defines the 'geobox' model and how to query on it.
19 It's hard to do bounding-box queries with non-relational databases because
20 they do not have the histograms necessary for using multiple inequality
21 filters in the same query.
23 Geobox queries get around this by pre-computing different resolutions of
24 bounding boxes and saving those in the database. Then to query for a bounding
25 box, you just query for the resolution of geo box that you're interested in.
27 A geobox is defined by a string ordered "lat|lon|lat|lon" that looks like:
28 "37.78452999999|-122.39532395324|37.78452999998|-122.39532395323"
30 Where the two sets of coordinates form a bounding box. The first coordinate set
31 is the west/north-most corner of the bounding box. The second coordinate set is
32 the east/south-most corner of the bounding box.
34 Each geo value is represented with many decimal places of accuracy, up to the
35 resolution of the data source. The number of decimal places present for latitude
36 and longitude is called the "resolution" of the geobox. For example, 15
37 decimal-point values would be called "resolution 15". Each coordinate in the
38 geobox must have all digits defined for its corresponding resolution. That means
39 even trailing zeros should be present.
41 To query for members of a bounding box, we start with some input coordinates
42 like lat=37.78452 long=-122.39532 (both resolution 5). We then round these
43 coordinates up and down to the nearest "slice" to generate a geobox. A "slice"
44 is how finely to divide each level of resolution in the geobox. The minimum
45 slice size is 1, the maximum does not have a limit, since larger slices will
46 just spill over into lower resolutions (hopefully the examples will explain).
48 Some examples:
50 resolution=5, slice=2, and lat=37.78452 long=-122.39532:
51 "37.78452|-122.39532|37.78450|-122.39530"
53 resolution=5, slice=10, and lat=37.78452 long=-122.39532:
54 "37.78460|-122.39540|37.78450|-122.39530"
56 resolution=5, slice=25, and lat=37.78452 long=-122.39532:
57 "37.78475|-122.39550|37.78450|-122.39525"
60 Another way to explain this is to say we compute a geobox for a point by
61 figuring out the closest known geobox that surrounds the point at the current
62 level of resolution
64 ------------
65 | | x = actual point location (37.78, -122.39)
66 | | box = surrounding box (top=37.79, left=-122.39,
67 | x | bottom=37.78, right=-122.38)
68 | |
69 ------------
72 With App Engine, we can make this query fast by having a list property of
73 geobox strings associated with an entity. We can pre-compute the geobox for
74 all queryable entities at multiple resolutions and slices and add those to
75 the list property. Then the query is just an equals filter on the geobox string.
77 The great thing about using an equals filter is that we can use a geobox
78 query along with other equality and inequality filters to produce a rich
79 query that lets us find things in a bounding box that fit other criteria
80 (e.g., restaurants of a certain type in a location with a price range under
81 $20 per person).
84 Other techniques can be used to make geobox queries more accurate. For example,
85 if you compute the difference between a point's latitude and it's right and left
86 geobox coordinates, you can figure out how close to the edge of the box you are,
87 and possibly query the current box and the adjacent box in order to get data in
88 both areas:
90 ------------ ------------
91 | | |
92 | x| |
93 | | |
94 | | |
95 ------------ ------------
96 Geobox 1 Geobox 2
98 This could also be extended to querying for multiple tiles if the point is
99 close to a corner of both latitude and longitude, which would result in four
100 geoboxes being queried in all:
102 ------------ ------------
103 | | |
104 | | |
105 | | |
106 | x| |
107 ------------ ------------
108 | | |
109 | | |
110 | | |
111 | | |
112 ------------ ------------
115 Another technique is to query concentric geoboxes at the same time and do the
116 final ordering of results in memory. The success of this approach very much
117 depends on the amount of data returned by each query (since too much data in
118 the concentric boxes will overwhelm the in-memory sort).
120 -----------
121 | ------- |
122 | | --- | |
123 | | | x | | |
124 | | --- | |
125 | ------- |
126 -----------
129 __author__ = "bslatkin@gmail.com (Brett Slatkin)"
131 __all__ = ["compute"]
133 import decimal
136 def _round_slice_down(coord, slice):
137 try:
138 remainder = coord % slice
139 if coord > 0:
140 return coord - remainder + slice
141 else:
142 return coord - remainder
143 except decimal.InvalidOperation:
144 # This happens when the slice is too small for the current coordinate.
145 # That means we've already got zeros in the slice's position, so we're
146 # already rounded down as far as we can go.
147 return coord
150 def compute_tuple(lat, lon, resolution, slice):
151 """Computes the tuple Geobox for a coordinate with a resolution and slice."""
152 decimal.getcontext().prec = resolution + 3
153 lat = decimal.Decimal(str(lat))
154 lon = decimal.Decimal(str(lon))
155 slice = decimal.Decimal(str(1.0 * slice * 10 ** -resolution))
157 adjusted_lat = _round_slice_down(lat, slice)
158 adjusted_lon = _round_slice_down(lon, slice)
159 return (adjusted_lat, adjusted_lon - slice,
160 adjusted_lat - slice, adjusted_lon)
163 def format_tuple(values, resolution):
164 """Returns the string representation of a geobox tuple."""
165 format = "%%0.%df" % resolution
166 return "|".join(format % v for v in values)
169 def compute(lat, lon, resolution, slice):
170 """Computes the Geobox for a coordinate with a resolution and slice."""
171 return format_tuple(compute_tuple(lat, lon, resolution, slice), resolution)
174 def compute_set(lat, lon, resolution, slice):
175 """Computes the set of adjacent Geoboxes for a coordinate."""
176 decimal.getcontext().prec = resolution + 3
177 primary_box = compute_tuple(lat, lon, resolution, slice)
178 slice = decimal.Decimal(str(1.0 * slice * 10 ** -resolution))
180 geobox_values = []
181 for i in xrange(-1, 2):
182 lat_delta = slice * i
183 for j in xrange(-1, 2):
184 lon_delta = slice * j
185 adjusted_box = (primary_box[0] + lat_delta, primary_box[1] + lon_delta,
186 primary_box[2] + lat_delta, primary_box[3] + lon_delta)
187 geobox_values.append(format_tuple(adjusted_box, resolution))
189 return geobox_values
192 def test():
193 tests = [
194 (("37.78452", "-122.39532", 6, 10), "37.784530|-122.395330|37.784520|-122.395320"),
195 (("37.78452", "-122.39532", 6, 25), "37.784525|-122.395325|37.784500|-122.395300"),
196 (("37.78452", "-122.39532", 7, 25), "37.7845225|-122.3953225|37.7845200|-122.3953200"),
197 (("37.78452", "-122.39532", 7, 1), "37.7845201|-122.3953201|37.7845200|-122.3953200"),
198 (("37.78452", "-122.39532", 5, 15), "37.78455|-122.39535|37.78440|-122.39520"),
199 (("37.78452", "-122.39532", 4, 17), "37.7859|-122.3966|37.7842|-122.3949"),
200 (("37.78452", "-122.39531", 4, 17), "37.7859|-122.3966|37.7842|-122.3949"),
201 (("37.78452", "-122.39667", 4, 17), "37.7859|-122.3983|37.7842|-122.3966"),
203 for args, expected in tests:
204 print "Testing compute%s, expecting %s" % (args, expected)
205 value = compute(*args)
206 assert value == expected, "Found: " + value
208 expected = sorted([
209 '37.784500|-122.395350|37.784475|-122.395325',
210 '37.784500|-122.395325|37.784475|-122.395300',
211 '37.784500|-122.395300|37.784475|-122.395275',
212 '37.784525|-122.395350|37.784500|-122.395325',
213 '37.784525|-122.395325|37.784500|-122.395300',
214 '37.784525|-122.395300|37.784500|-122.395275',
215 '37.784550|-122.395350|37.784525|-122.395325',
216 '37.784550|-122.395325|37.784525|-122.395300',
217 '37.784550|-122.395300|37.784525|-122.395275'
219 print "Testing compute_set, expecting %s" % expected
220 value = sorted(compute_set("37.78452", "-122.39532", 6, 25))
221 assert value == expected, "Failed, found: " + value
222 print "Tests passed"
225 if __name__ == "__main__":
226 test()