bugfix: population must be an int, although passed as string
[osm-ro-tools.git] / OsmLocImport.py
blobc6c0ab80e8eceba08c9c45230c260f45f487b1b2
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 import re
5 import unicodedata
6 from xml.dom import minidom
8 import os
9 if os.path.exists('OsmApi.py'):
10 import sys
11 sys.path.insert(0,'.')
12 from OsmApi import OsmApi
14 from math import cos, radians
16 if os.path.exists('siruta.py'):
17 if not '.' in sys.path:
18 sys.path.insert(0,'.')
19 from sirutacsv import SirutaDictReader
def stripAccents(s):
    """Return 's' with every combining accent removed, leaving only the base characters.

    's' may be a text (unicode) string or a UTF-8 encoded byte string; byte
    strings are decoded first.  The text is decomposed (NFD) so that each
    accented letter becomes its base letter followed by combining marks, and
    the marks (Unicode category 'Mn') are then dropped.
    """
    # Decode only real byte strings.  An explicit type check is used instead
    # of probing with unicode() and swallowing TypeError, so the function
    # behaves the same on Python 2 (where bytes is str) and on Python 3
    # (where the unicode builtin does not exist).
    if isinstance(s, (bytes, bytearray)):
        s = s.decode('utf-8')

    decomposed = unicodedata.normalize('NFD', s)
    return ''.join([c for c in decomposed if unicodedata.category(c) != 'Mn'])
def simplifyName(unicode_name):
    """Return 'unicode_name' lowercased and passed through stripAccents(), for loose name comparison"""
    return stripAccents(unicode_name.lower())
def getNodesList(osmxmlcontents):
    """Convert every <node> element of the single 'osm' block in 'osmxmlcontents' into an OsmApi node

    'osmxmlcontents' is the list of 'osm' DOM elements of a document; more
    than one element raises ValueError.  Each 'node' element found under the
    first one is converted with OsmApi._DomParseNode and the results are
    returned as a list, in document order.
    """
    if len(osmxmlcontents) > 1:
        raise ValueError('Too many osm blocks in one XML')

    # only the node elements are of interest here
    node_elements = osmxmlcontents[0].getElementsByTagName ('node')

    # TODO: use an internal data member for the api
    parser = OsmApi()
    return [ parser._DomParseNode(element) for element in node_elements ]
def getMatchingPlaces(osmapielements,placename):
    """Return the elements of 'osmapielements' that carry a non-empty 'place' tag and whose
    simplified 'name' tag equals the simplified 'placename'"""
    wanted = simplifyName (placename)
    matches = []

    for element in osmapielements:
        try:
            tags = element[u"tag"]
            is_match = len( tags[u"place"] ) > 0 and \
                       wanted == simplifyName(tags[u"name"])
        except KeyError:
            # no 'tag', 'place' or 'name' entry: this element cannot be a named place
            continue

        if is_match:
            matches.append(element)

    return matches
def locatePlaceInXML(xml,placename):
    """Finds, in the OSM XML document 'xml', the nodes which have a 'place' tag and whose name
    looks like 'placename'. 'xml' is either the path of an existing file or the text of the
    document itself, as returned by the OSM API for a bounding box query"""

    # anything that is not the path of an existing file is parsed as the document text
    if not os.path.exists(xml):
        document = minidom.parseString(xml.encode("utf-8"))
    else:
        document = minidom.parse(xml)

    # each OSM XML has a single root osm element
    osm_roots = document.getElementsByTagName('osm')

    # nodes without any tag cannot describe a place, so drop them early
    tagged_nodes = [ node for node in getNodesList(osm_roots) if len(node[u"tag"]) > 0 ]

    return getMatchingPlaces(tagged_nodes, placename)
def getArea ( bbox_str ):
    """Return what the API answers to a map request for the bounding box described by 'bbox_str'"""

    request_path = "/api/0.6/map?bbox=" + bbox_str

    # TODO: use an internal data member for the api
    connection = OsmApi()
    return connection._get ( request_path )
def getMapAroundPoint(lon, lat, bbox_km = 10):
    """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point.

    lon, lat -- coordinates of the centre of the area, in degrees (numbers or numeric strings)
    bbox_km  -- approximate length, in kilometres, of each side of the requested box

    Returns whatever getArea() returns for the computed bounding box."""

    lat = float (lat)
    lon = float (lon)

    # one degree latitude is approximately 111km
    # and we want to know what's half of bbox_km in lat degrees
    delta_lat = bbox_km / 222.0

    # one degree longitude is only cos(lat) * 111 km, so covering the same
    # distance takes MORE degrees of longitude than of latitude: the half width
    # must be divided by cos(lat) (multiplying would make the box too narrow)
    cos_lat = cos( radians (lat) )
    if cos_lat > 1e-6:
        delta_lon = min( delta_lat / cos_lat, 180.0 )
    else:
        # at (or next to) a pole every longitude is within reach
        delta_lon = 180.0

    lat_b = lat - delta_lat
    lat_t = lat + delta_lat

    lon_l = lon - delta_lon
    lon_r = lon + delta_lon

    # the API expects the box as left,bottom,right,top
    path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )

    return getArea ( path )
def simpleName(placename):
    """Removes from the name of a place any prefix that indicates its classification.

    The prefixes u"Municipiul " and u"Oraș " are removed, in this order, when the
    name starts with them; any other name is returned unchanged."""

    # lstrip() must not be used for this: its argument is a SET of characters,
    # not a prefix, so it would also eat the first letters of names such as
    # "Mangalia" or "Oradea". Whole prefixes are compared instead.
    simpleplacename = placename
    for prefix in ( u"Municipiul ", u"Oraș " ):
        if simpleplacename.startswith(prefix):
            # also drop any extra spaces that followed the prefix
            simpleplacename = simpleplacename[len(prefix):].lstrip(u" ")

    return simpleplacename
def sirutaRankToPlace(sirutarank, population):
    """Maps siruta ranks to proper 'place' values. The siruta types are explained below

    Code    Name of the type of administrative territorial unit

    40      County, municipality of Bucharest
    1       Municipality that is a county seat, seat of the municipality of Bucharest
    2       Town belonging to a county, other than a county seat town
    3       Commune
    4       Municipality, other than a county seat
    5       Town that is a county seat
    6       Sector of the municipality of Bucharest
    9       Component locality, seat of a municipality
    10      Component locality of a municipality, other than the seat of the municipality
    11      Village belonging to a municipality
    17      Component locality, seat of a town
    18      Component locality of a town, other than the seat of the town
    19      Village belonging to a town
    22      Village that is a commune seat
    23      Village belonging to a commune, other than the commune seat

    sirutarank -- the siruta type code, as an int or a numeric string
    population -- the population, as an int or a numeric string; it is only
                  parsed for type 23

    Returns one of u"city", u"town", u"village", u"hamlet" or u"sector".
    Raises ValueError when the code is not numeric or not one of the above.
    """

    rank = int(sirutarank)

    # only the non-seat villages belonging to communes can be hamlets
    if rank == 23 and int(population) < 50:
        return u"hamlet"

    place_by_rank = {
        # municipalities, county seats, seats of municipalities
        1: u"city", 4: u"city", 9: u"city", 40: u"city",
        # towns, county seat towns, seats of towns
        2: u"town", 5: u"town", 17: u"town",
        # component localities of towns or municipalities, other than the seats
        10: u"village", 18: u"village",
        # communes, villages of municipalities, villages of towns,
        # commune seats, non-seat villages
        3: u"village", 11: u"village", 19: u"village", 22: u"village", 23: u"village",
        # the sectors of the municipality of Bucharest
        6: u"sector",
    }

    place = place_by_rank.get(rank)
    if place is None:
        raise ValueError("Unexpected rank value in siruta data")

    return place
def nodeDictForPlace(sirutadict, oldnode = None):
    """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
    the existing data which is present in 'oldnode'

    sirutadict -- one entry of siruta data; the keys read here are u"population2002",
                  u"siruta:type", u"siruta:name_sup", u"siruta:county", u"lat" and u"lon"
                  (new nodes only) and u"old_postal_code" (only when 'oldnode' has a postal code)
    oldnode    -- the existing node dictionary for the same place, with its tags under u"tag",
                  or None (or an empty dict) to describe a brand new node

    Returns a new node dictionary. Neither 'sirutadict' nor 'oldnode' (nor its tag
    dictionary) is modified, so the result can be compared with 'oldnode' to detect changes.
    A u"name" tag must be available, either in 'sirutadict' or in the tags of 'oldnode'."""

    # work on a copy, the cleaned up siruta:name_sup is stored in it further down
    place = sirutadict.copy()

    ## XXX: for some reason this doesn't seem to work, we'll use uninteresting
    #tags[u"name"] = sirutadict[u"name"]
    if oldnode:
        # copy the node and its tags so that the caller's node stays as it was
        node = dict(oldnode)
        tags = dict(oldnode[u"tag"])

        if u"population" not in tags:
            tags[u"population"] = place[u"population2002"]

        # drop the postal code tags which merely repeat the old postal code of the siruta data
        for key in ( u"postal_code", u"addr:postcode" ):
            if key in tags and int(tags[key]) == int(place[u"old_postal_code"]):
                tags.pop(key)

    else:
        node = {}
        tags = {}
        node[u"lat"] = float(place[u"lat"])
        node[u"lon"] = float(place[u"lon"])
        tags[u"population"] = place[u"population2002"]

    # consistently add the 2002 census data
    tags[u"population:census:2002"] = place[u"population2002"]

    # this should probably be ran even for existing nodes
    tags[u"place"] = sirutaRankToPlace(place[u"siruta:type"], tags[u"population"])

    # clean up siruta:name_sup; the same cleaned value is used for the tag and for is_in
    simplesup = simpleName(place[u"siruta:name_sup"])
    place[u"siruta:name_sup"] = simplesup

    # siruta keys which must not become tags of the node
    # NOTE(review): u'siruta:rank' is listed here although the type is read from
    # u"siruta:type" above - confirm which key the siruta data really carries
    uninteresting = (
        u"lon",
        u"lat",
        u'siruta:rank',
        u"population2002",
        u"region",
        u"siruta:region_id",
        u"siruta:enviro_type",
        u"siruta:sortcode" )

    mergetags = dict( (key, value) for key, value in place.items() if key not in uninteresting )
    tags.update(mergetags)

    county = place[u"siruta:county"]
    if u"is_in" in tags:
        # existing value: accept both ',' and ';' as separators and keep its entries,
        # putting the missing superior unit and county in front of them
        is_in = [ x.strip() for x in tags[u"is_in"].replace(",", ";").split(";") ]
        nextpos = 0
        if (simplesup not in is_in) and (tags[u"name"] != simplesup):
            is_in.insert(0,simplesup)
            nextpos = 1
        if county not in is_in:
            is_in.insert(nextpos,county)
    else:
        is_in = [ county, u"România" ]
        # the superior unit is left out when it is the place itself
        if tags[u"name"] != simplesup:
            is_in.insert(0,simplesup)

    tags[u"is_in"] = u";".join(is_in)

    node[u"tag"] = tags

    return node
def readAndProcessSirutaCsv(file, comment = None, source = u"geo-spatial.org"):
    """reads the input CSV file and processes each entry

    file    -- path of the siruta CSV file, read through SirutaDictReader
    comment -- comment of the changeset; defaults to 'import places from ' + file
    source  -- value of the 'source' tag of the changeset

    For each entry the map around its coordinates is fetched and searched for place nodes
    with the same simplified name: with none found a node is created, with exactly one found
    that node is updated if the merged data differs from it, with several found the entry is
    skipped and reported on stderr. Everything is uploaded in one changeset, which is closed
    even when the processing of an entry fails."""

    # imported here because sys is only imported under a condition at module level
    import copy
    import sys

    # the file is closed on every exit path
    with open (file, 'r') as csvfile:
        reader = SirutaDictReader( csvfile )

        homedir = os.environ['HOME']
        api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')

        if not comment:
            comment = 'import places from ' + file
        cs_tags = { 'comment' : comment , 'source' : source }

        api.ChangesetCreate(cs_tags)
        try:
            for csvplace in reader:

                print("Processing data for %s ..." % ( csvplace[u"name"] ))
                area_xml = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
                existing_nodes = locatePlaceInXML ( area_xml, csvplace[u"name"] )

                if not existing_nodes:
                    # node doesn't exist for this place, or is far; we can create the node
                    nodedict = nodeDictForPlace ( csvplace )

                    api.NodeCreate(nodedict)
                    print("Created new node for %s" % ( csvplace[u"name"] ))

                elif len(existing_nodes) == 1:
                    # there is an existing node, so we merge with that; the reference is a
                    # deep copy, a shallow one would share the tag dictionary with the node
                    # and the comparison below could then never see a changed tag
                    referencenode = copy.deepcopy(existing_nodes[0])
                    nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )

                    if nodedict == referencenode:
                        print(u"Skipping: No changes needed for node %s" % ( csvplace[u"name"] ))
                    else:
                        api.NodeUpdate(nodedict)
                        print("Updated existing node for %s" % ( csvplace[u"name"] ))

                else:
                    # I am confused, more than one node with the same simplified name
                    message = u"Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
                        csvplace[u"name"],
                        len(existing_nodes),
                        csvplace[u"lat"],
                        csvplace[u"lon"] )
                    if not isinstance(message, str):
                        # Python 2 unicode text: encode it explicitly, write() would use ASCII
                        message = message.encode(getattr(sys.stderr, "encoding", None) or "utf-8", "replace")
                    sys.stderr.write(message + "\n")
        finally:
            api.ChangesetClose()