remove the created_by tags from the nodes themselves, that info is in the changeset...
[osm-ro-tools.git] / OsmLocImport.py
blob66f0bb197feacba3eca3462c39eef66903fb024f
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
import getopt
import os
import re
# sys is needed unconditionally (sys.path below, sys.argv / sys.exit in main),
# so it must not be imported only when a local OsmApi.py happens to exist
import sys
import unicodedata
from math import cos, radians
from xml.dom import minidom

# prefer the helper modules from the current directory when they are there
if os.path.exists('OsmApi.py'):
    sys.path.insert(0,'.')
from OsmApi import OsmApi

if os.path.exists('siruta.py'):
    if not '.' in sys.path:
        sys.path.insert(0,'.')
from sirutacsv import SirutaDictReader
def stripAccents(s):
    """Strip any accents that may exist in a text and leave only the base characters.

    The text is decomposed (Unicode NFD) and every combining mark
    (category 'Mn') is dropped. Hyphens are turned into spaces first.

    s -- a unicode string, or a UTF-8 encoded byte string (decoded first)
    Returns a unicode string.
    """
    # decode explicitly instead of calling the Python 2 only unicode() builtin
    # and catching its TypeError; this form works on Python 2.6+ and 3
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    s = s.replace(u"-", u" ")
    decomposed = unicodedata.normalize('NFD', s)
    return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
def simplifyName(unicode_name):
    """Return the lowercase, accent-free form of 'unicode_name' (lowercased first, then passed to stripAccents)"""
    return stripAccents(unicode_name.lower())
def getNodesList(osmxmlcontents):
    """Extract only the nodes from an OSM XML and store them in OsmApi structures.

    osmxmlcontents -- the list of 'osm' DOM elements of the document; it must
                      hold exactly one element
    Returns a list with the result of OsmApi._DomParseNode for each 'node'
    element found under that 'osm' element.
    Raises ValueError when there is no 'osm' block or more than one.
    """
    if len(osmxmlcontents) == 0:
        # fail with a clear message instead of an IndexError on [0] below
        raise ValueError('No osm block in the XML')
    if len(osmxmlcontents) > 1:
        raise ValueError('Too many osm blocks in one XML')

    # select only the nodes from the OSM XML
    xmlnodes = osmxmlcontents[0].getElementsByTagName('node')

    # TODO: use an internal data member for the api
    api = OsmApi()
    return [api._DomParseNode(xmlnode) for xmlnode in xmlnodes]
def getMatchingPlaces(osmapielements,placename, sirutacode = None):
    """Select from 'osmapielements' the places whose simplified name equals the one of 'placename'.

    An element qualifies when it has a non-empty 'place' tag and a 'name' tag
    whose simplified form matches. When more than one element qualifies and
    'sirutacode' is given, the elements carrying that siruta code are returned
    instead, provided there is at least one of them.
    """
    wanted = simplifyName(placename)

    matches = []
    for element in osmapielements:
        try:
            elementtags = element[u"tag"]
            if len(elementtags[u"place"]) > 0 and wanted == simplifyName(elementtags[u"name"]):
                matches.append(element)
        except KeyError:
            # no 'place' or no 'name' tag on this element, so it is of no interest
            continue

    # several candidates: if some of them already carry the siruta code,
    # those are the ones we are after
    if len(matches) > 1 and sirutacode:
        bycode = getSameSiruta(matches, sirutacode)
        if len(bycode) != 0:
            return bycode

    return matches
def locatePlaceInXML(xml,placename,sirutacode=None):
    """Find in the OSM document 'xml' the place nodes whose name looks like 'placename'.

    'xml' is either the path of an existing file or the XML text itself, as
    returned by the OSM API for a bounding box query. Only nodes having at
    least one tag are considered; the matching itself (including the optional
    'sirutacode' filter) is done by getMatchingPlaces.
    """
    if not os.path.exists(xml):
        # not a file on disk, so treat the argument as the XML text
        document = minidom.parseString(xml.encode("utf-8"))
    else:
        document = minidom.parse(xml)

    # each OSM XML has a single root osm element
    osmroots = document.getElementsByTagName('osm')

    tagged = []
    for candidate in getNodesList(osmroots):
        if len(candidate[u"tag"]) > 0:
            tagged.append(candidate)

    return getMatchingPlaces(tagged, placename, sirutacode)
def getArea ( bbox_str ):
    """Return what the API answers to a map request for the bounding box given by the 'bbox_str' string"""

    query = "/api/0.6/map?bbox=" + bbox_str

    # TODO: use an internal data member for the api
    return OsmApi()._get(query)
def getMapAroundPoint(lon, lat, bbox_km = 10):
    """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point.

    The requested area is a box of about 'bbox_km' kilometres on each side,
    centred on the point.

    lon, lat -- coordinates in degrees (numbers or numeric strings)
    bbox_km  -- side of the bounding box, in kilometres
    Returns the result of getArea() for the "left,bottom,right,top" box.
    """
    lat = float(lat)
    lon = float(lon)

    # one degree latitude is approximately 111km
    # and we want to know what's half of bbox_km in lat degrees
    delta_lat = bbox_km / 222.0

    # one degree of longitude is only cos(lat) * 111km long, so the same
    # distance spans 1/cos(lat) times MORE degrees of longitude than of
    # latitude: divide by cos(lat), do not multiply
    cos_lat = cos(radians(lat))
    if cos_lat > 1e-6:
        delta_lon = min(delta_lat / cos_lat, 180.0)
    else:
        # next to the poles any distance covers every meridian
        delta_lon = 180.0

    bbox = "%.6f,%.6f,%.6f,%.6f" % (
        lon - delta_lon,
        lat - delta_lat,
        lon + delta_lon,
        lat + delta_lat )

    return getArea ( bbox )
def simpleName(placename):
    """Drop from the name of a place the words that only indicate its classification"""
    stripped = placename
    for classifier in (u"Municipiul ", u"Oraș "):
        # only the first occurrence of each classifier is removed
        stripped = stripped.replace(classifier, u"", 1)

    return stripped
def sirutaTypeToPlace(sirutarank, population):
    """Maps siruta ranks to proper 'place' values. The siruta types are explained below

    Code  Name of the type of administrative territorial unit

    40    County, the municipality of Bucharest
    1     Municipality that is a county seat, seat of the municipality of Bucharest
    2     Town belonging to a county, other than a county seat town
    3     Commune
    4     Municipality, other than a county seat
    5     Town that is a county seat
    6     Sector of the municipality of Bucharest
    9     Component locality, seat of a municipality
    10    Component locality of a municipality, other than the seat of the municipality
    11    Village belonging to a municipality
    17    Component locality, seat of a town
    18    Component locality of a town, other than the seat of the town
    19    Village belonging to a town
    22    Village that is the seat of a commune
    23    Village belonging to a commune, other than the seat of the commune

    sirutarank -- the siruta type code, as a number or a numeric string
    population -- the population of the place; only converted to int (and
                  thus only required to be numeric) for rank 23
    Returns the value for the 'place' tag as a unicode string.
    Raises ValueError for a rank which is not in the table above.
    """
    rank = int(sirutarank)

    # municipalities, county seats, seats of municipalities
    if rank in (1, 4, 9, 40):
        return u"city"
    # towns, county seat towns, seats of towns
    if rank in (2, 5, 17):
        return u"town"
    # component localities of towns or municipalities, other than the seats
    if rank in (10, 18):
        return u"village"
    # communes, villages belonging to municipalities or towns, commune seats,
    # non-seat villages
    if rank in (3, 11, 19, 22, 23):
        # only the non-seat villages belonging to communes can be hamlets
        if rank == 23 and int(population) < 50:
            return u"hamlet"
        return u"village"
    # the sectors of the municipality of Bucharest
    if rank == 6:
        return u"sector"

    # call form of raise: valid on both Python 2 and Python 3
    raise ValueError("Unexpected rank value in siruta data")
def nodeDictForPlace(sirutadict, oldnode = None):
    """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
    the existing data which is present in 'oldnode'

    sirutadict -- the siruta record of the place; it is left unmodified
    oldnode    -- optional existing node; when given (and non-empty) it is
                  updated in place and returned, keeping its own population
                  tag if it has one
    Returns the node dictionary, with the merged tags under the u"tag" key.
    """
    # work on a private copy, so the caller's record is not silently changed
    siruta = sirutadict.copy()

    # it seems some of the input contains no data for the population and the rank,
    # which probably means 0 for population, and we don't care for rank
    if siruta[u"population2002"] == u"":
        siruta[u"population2002"] = u"0"

    if oldnode:
        node = oldnode
        tags = oldnode[u"tag"]

        if u"population" not in tags:
            tags[u"population"] = siruta[u"population2002"]

        # drop the postal code tags which just repeat the old postal code
        # found in the siruta data
        for codetag in (u"postal_code", u"addr:postcode"):
            if codetag in tags:
                if int(tags[codetag]) == int(siruta[u"old_postal_code"]):
                    tags.pop(codetag)
    else:
        node = {}
        tags = {}
        node[u"lat"] = float(siruta[u"lat"])
        node[u"lon"] = float(siruta[u"lon"])
        tags[u"population"] = siruta[u"population2002"]

    # consistently add the census data
    # NOTE(review): the key says 1992 but the value is the 'population2002'
    # field - presumably one of the two is wrong; confirm before changing
    tags[u"population:census:1992"] = siruta[u"population2002"]

    # this should probably be ran even for existing nodes
    tags[u"place"] = sirutaTypeToPlace(siruta[u"siruta:type"], tags[u"population"])

    # clean up siruta:name_sup
    simplesup = simpleName(siruta[u"siruta:name_sup"])
    siruta[u"siruta:name_sup"] = simplesup

    # fields of the siruta record which must not end up as tags
    uninteresting = (
        u"lon",
        u"lat",
        u'siruta:rank',
        u"population2002",
        u"region",
        u"siruta:region_id",
        u"siruta:enviro_type",
        u"siruta:sortcode",
    )

    mergetags = siruta.copy()
    for tag in uninteresting:
        mergetags.pop(tag, None)

    tags.update(mergetags)

    # is_in goes from the most specific unit to the country
    is_in = [siruta[u"siruta:county"], u"România"]
    tags[u"is_in:country"] = u"România"
    tags[u"is_in:county"] = siruta[u"siruta:county"]
    # '!=' instead of the '<>' operator, which Python 3 no longer accepts
    if tags[u"name"] != simplesup:
        is_in.insert(0, simplesup)

    tags[u"is_in"] = u";".join(is_in)

    # prune the created_by tag since is deprecated to have it on the
    # node and the changeset contains that info anyway
    if u"created_by" in tags:
        tags.pop(u"created_by")

    node[u"tag"] = tags

    return node
def getSameSiruta(elementlist, sirutacode):
    """Return shallow copies of the elements of 'elementlist' whose siruta:code tag equals 'sirutacode'"""

    matches = []
    for element in elementlist:
        try:
            code = element[u"tag"][u"siruta:code"]
        except KeyError:
            # no tags at all, or no siruta code among them
            continue
        if code == sirutacode:
            matches.append(element.copy())

    return matches
def readAndProcessSirutaCsv(file, comment = None, source = None):
    """Reads the input CSV file and processes each entry.

    For every place of the CSV the map around its coordinates is fetched and
    searched for nodes with the same name: when none is found a new node is
    created, when exactly one is found it is updated (only if something
    changes), and when several are found the place is skipped with a message
    on stderr. All changes go into a single changeset.

    file    -- path of the siruta CSV file
    comment -- changeset comment (text or UTF-8 bytes); a default mentioning
               'file' is used when empty
    source  -- changeset source (text or UTF-8 bytes); a default is used
               when empty
    """
    if not comment:
        comment = 'import places from ' + file
    elif isinstance(comment, bytes):
        # only decode byte strings; already decoded text is used as it is
        comment = comment.decode('utf-8')
    if not source:
        source = u"http://geo-spatial.org siruta data import"
    elif isinstance(source, bytes):
        source = source.decode('utf-8')
    cs_tags = { 'comment' : comment , 'source' : source }

    # expanduser also works when HOME is not set in the environment
    homedir = os.path.expanduser('~')
    api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')

    # 'with' and 'finally' make sure the file and the changeset get closed
    # even when processing one of the entries fails
    with open (file, 'r') as csvfile:
        reader = SirutaDictReader( csvfile )

        api.ChangesetCreate(cs_tags)
        try:
            for csvplace in reader:

                uname = csvplace[u"name"].encode("utf-8")
                print("Processing data for %s ..." % ( uname ))
                sys.stdout.flush()
                area_xml = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
                existing_nodes = locatePlaceInXML ( area_xml, csvplace[u"name"], csvplace[u"siruta:code"] )

                if len(existing_nodes) == 0:
                    # node doesn't exist for this place, or is far; we can create the node
                    nodedict = nodeDictForPlace ( csvplace )

                    api.NodeCreate(nodedict)
                    print("Created new node for %s" % ( uname ))

                elif len(existing_nodes) == 1:
                    # there is an existing node, so we merge with that;
                    # keep a snapshot of it to detect whether anything changed
                    referencenode = existing_nodes[0].copy()
                    # dictionaries don't get copied by default
                    referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
                    nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )

                    if nodedict == referencenode:
                        print("Skipping: No changes needed for node %s" % ( uname ))
                    else:
                        api.NodeUpdate(nodedict)
                        print("Updated existing node for %s" % ( uname ))

                else:
                    # I am confused, more than one node with the same simplified name
                    sys.stderr.write("Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)\n" % (
                        uname,
                        len(existing_nodes),
                        csvplace[u"lat"].encode("utf-8"),
                        csvplace[u"lon"].encode("utf-8") ))

                sys.stdout.flush()
                sys.stderr.flush()
        finally:
            api.ChangesetClose()
def usage():
    """Print the command line synopsis of the script on stdout"""
    # single-argument call form: same output on Python 2, valid on Python 3
    print("%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0])
def main(argv = None):
    """Parse the command line and run the import.

    argv -- the full argument vector, program name first; defaults to sys.argv
    Returns the exit status: 0 after a successful run or after showing the
    help, 2 when the command line is not usable.
    """
    if argv is None:
        argv = sys.argv

    try:
        # parse the arguments we were given, not unconditionally sys.argv
        opts, args = getopt.getopt(argv[1:], "hi:c:s:",
                                   ["help", "input=", "comment=", "source="] )
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))
        usage()
        return 2

    inputcsv = None
    comment = None
    source = None

    for o,a in opts:
        # getopt reports long options with their leading dashes
        if o in ("-h", "--help"):
            usage()
            return 0
        elif o in ("-i", "--input"):
            inputcsv = a
        elif o in ("-s", "--source"):
            source = a
        elif o in ("-c", "--comment"):
            comment = a

    if not inputcsv:
        print("Input csv file (-i option) is mandatory. Run the script with -h for online help.")
        return 2

    readAndProcessSirutaCsv(inputcsv, source=source, comment=comment)
    return 0
# run the import only when executed as a script, and hand the status
# returned by main() over to the shell
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)