test the not implemented yet _applyChanges2Map
[osm-ro-tools.git] / OsmLocImport.py
blob4bfed6ad4219c3d558c4f9e6359a3e0edb151c15
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
import getopt
import os
import re
import sys
import unicodedata
from math import cos, radians
from xml.dom import minidom

# when a local copy of a helper module sits in the current directory,
# make sure it is the one that gets imported
if os.path.exists('OsmApi.py'):
    sys.path.insert(0, '.')
from OsmApi import OsmApi

if os.path.exists('siruta.py'):
    if '.' not in sys.path:
        sys.path.insert(0, '.')
from sirutacsv import SirutaDictReader
def stripAccents(s):
    """Return 's' as text with hyphens turned into spaces and all accents removed.

    's' may be a UTF-8 encoded byte string or an already decoded text object;
    byte strings are decoded first. The text is decomposed (NFD) and every
    combining mark (Unicode category 'Mn') is dropped, which leaves only the
    base characters.
    """
    # decode explicitly instead of calling unicode() and swallowing the
    # TypeError raised for text input: same result, and no dependency on a
    # builtin that only some interpreters provide
    if isinstance(s, (bytes, bytearray)):
        s = s.decode('utf-8')

    s = s.replace(u"-", u" ")
    decomposed = unicodedata.normalize('NFD', s)
    return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
def simplifyName(unicode_name):
    """Return the lowercase form of 'unicode_name' with its accents stripped by stripAccents()"""
    # lowercase first, then let stripAccents() remove the diacritics
    return stripAccents(unicode_name.lower())
def getNodesList(osmxmlcontents):
    """Parse every 'node' element of the single 'osm' DOM element into an OsmApi node structure.

    'osmxmlcontents' is the list of 'osm' elements of a parsed OSM XML document.
    A ValueError is raised when that list holds more than one element.
    """
    if len(osmxmlcontents) > 1:
        raise ValueError('Too many osm blocks in one XML')

    # only the 'node' elements below the osm root are of interest
    node_elements = osmxmlcontents[0].getElementsByTagName('node')

    # TODO: use an internal data member for the api
    parser = OsmApi()
    return [parser._DomParseNode(element) for element in node_elements]
def getMatchingPlaces(osmapielements, placename, sirutacode=None):
    """Return the elements that carry a non-empty 'place' tag and whose simplified
    name equals the simplified 'placename'.

    When more than one element matches and 'sirutacode' is given, the elements
    carrying that siruta code are returned instead, provided at least one exists.
    """
    # TODO: try somehow to match î with â when is correct to do so
    #       should also match i to â (when is correct)
    # TODO: maybe calculating the lexical distance is the way to fix this
    #       and match also typo-ed names with correct ones
    wanted = simplifyName(placename)

    candidates = []
    for element in osmapielements:
        try:
            tags = element[u"tag"]
            if len(tags[u"place"]) == 0:
                continue
            if simplifyName(tags[u"name"]) == wanted:
                candidates.append(element)
        except KeyError:
            # an element lacking the 'place' or the 'name' tag is uninteresting
            continue

    # if a siruta code is already attached to some of the candidates, prefer
    # the ones carrying the requested code, unless that leaves nothing
    if sirutacode and len(candidates) > 1:
        narrowed = getSameSiruta(candidates, sirutacode)
        if narrowed:
            return narrowed

    return candidates
def locatePlaceInXML(xml, placename, sirutacode=None):
    """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
    into the xml document 'xml'. The input xml document is the one returned by the OSM API
    through a query for data within a bounding box.

    'xml' is either the path of an existing file or the XML document itself, as
    text or as an already encoded byte string. 'sirutacode' is handed to
    getMatchingPlaces() to narrow down multiple matches. Returns the list of
    matching nodes.
    """
    if os.path.exists(xml):
        xmldoc = minidom.parse(xml)
    else:
        # only text needs encoding: calling encode() on a byte string makes
        # Python 2 decode it as ASCII first, which fails on non-ASCII names
        if not isinstance(xml, bytes):
            xml = xml.encode("utf-8")
        xmldoc = minidom.parseString(xml)

    # each OSM XML has a single root osm element
    osmxmlcontents = xmldoc.getElementsByTagName('osm')

    nodeslist = getNodesList(osmxmlcontents)

    # nodes without any tag can not be places
    nodeswithtags = [x for x in nodeslist if len(x[u"tag"]) > 0]

    return getMatchingPlaces(nodeswithtags, placename, sirutacode)
def getArea(bbox_str):
    """Return what the API answers to a map request for the bounding box given as the string 'bbox_str'"""
    request_path = "/api/0.6/map?bbox=" + bbox_str

    # TODO: use an internal data member for the api
    return OsmApi()._get(request_path)
def getMapAroundPoint(lon, lat, bbox_km=10):
    """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
    within a box whose sides are approximately 'bbox_km' km long.

    'lon' and 'lat' are degrees, given as numbers or numeric strings.
    Returns whatever getArea() returns for the computed bounding box.
    """
    lat = float(lat)
    lon = float(lon)

    # one degree latitude is approximately 111km
    # and we want to know what's half of bbox_km in lat degrees
    delta_lat = bbox_km / 222.0

    # one degree longitude is only cos(lat) * 111km long, so the same distance
    # covers MORE degrees of longitude: divide by cos(lat), do not multiply
    lon_scale = cos(radians(lat))
    if lon_scale > 0:
        # close to the poles the quotient explodes; half a turn covers everything
        delta_lon = min(delta_lat / lon_scale, 180.0)
    else:
        delta_lon = 180.0

    lat_b = lat - delta_lat
    lat_t = lat + delta_lat

    lon_l = lon - delta_lon
    lon_r = lon + delta_lon

    # the bbox is passed as: left, bottom, right, top
    bbox = "%.6f,%.6f,%.6f,%.6f" % (lon_l, lat_b, lon_r, lat_t)

    return getArea(bbox)
def simpleName(placename):
    """Drop from the name of a place the words that only indicate its classification

    The first occurrence of each classification word is removed.
    """
    stripped = placename
    for classification in (u"Municipiul ", u"Oraș "):
        stripped = stripped.replace(classification, u"", 1)
    return stripped
def sirutaTypeToPlace(sirutarank, population):
    """Maps siruta ranks to proper 'place' values. The siruta types are explained below

    Code  Name of the type of administrative territorial unit

    40    County, the municipality of Bucharest
    1     Municipality which is a county seat, seat of the municipality of Bucharest
    2     Town belonging to a county, other than a county seat town
    3     Commune
    4     Municipality, other than a county seat
    5     Town which is a county seat
    6     Sector of the municipality of Bucharest
    9     Component locality, seat of a municipality
    10    Component locality of a municipality, other than the seat of the municipality
    11    Village belonging to a municipality
    17    Component locality, seat of a town
    18    Component locality of a town, other than the seat of the town
    19    Village belonging to a town
    22    Village which is the seat of a commune
    23    Village belonging to a commune, other than the seat of the commune

    'sirutarank' and 'population' may be numbers or numeric strings;
    'population' is only looked at for rank 23.
    Raises ValueError for a rank which is not in the table above.
    """
    rank = int(sirutarank)

    # municipalities, county seats, seats of municipalities
    if rank in (1, 4, 9, 40):
        return u"city"
    # towns, towns which are county seats, seats of towns
    if rank in (2, 5, 17):
        return u"town"
    # component localities of towns or municipalities, other than the seats
    if rank in (10, 18):
        return u"village"
    # communes, villages belonging to municipalities or towns, commune seats, non-seat villages
    if rank in (3, 11, 19, 22, 23):
        # only the non-seat villages belonging to communes can be hamlets
        if rank == 23 and int(population) < 50:
            return u"hamlet"
        return u"village"
    # the sectors of the municipality of Bucharest
    if rank == 6:
        return u"sector"

    raise ValueError("Unexpected rank value in siruta data")
def _samePostalCode(first, second):
    """Tell whether two postal codes hold the same integer; codes which are not integers never match"""
    try:
        return int(first) == int(second)
    except (TypeError, ValueError):
        return False


def nodeDictForPlace(sirutadict, oldnode=None):
    """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
    the existing data which is present in 'oldnode'

    Without 'oldnode' a new node is built at the 'lat'/'lon' of 'sirutadict'.
    With 'oldnode', that very dictionary and its tags are updated in place and
    returned. 'sirutadict' is modified as well: an empty 'population2002'
    becomes u"0" and 'siruta:name_sup' is replaced by its simpleName() form.
    """
    # it seems some of the input contains no data for the population and the rank,
    # which probably means 0 for population, and we don't care for rank
    if sirutadict[u"population2002"] == u"":
        sirutadict[u"population2002"] = u"0"

    if oldnode:
        node = oldnode
        tags = oldnode[u"tag"]

        # a population already present on the node wins over the siruta one
        if u"population" not in tags:
            tags[u"population"] = sirutadict[u"population2002"]

        # drop the postal code tags which only repeat the old postal code;
        # a tag value which is not a plain integer is left untouched
        for key in (u"postal_code", u"addr:postcode"):
            if key in tags and _samePostalCode(tags[key], sirutadict[u"old_postal_code"]):
                tags.pop(key)
    else:
        node = {
            u"lat": float(sirutadict[u"lat"]),
            u"lon": float(sirutadict[u"lon"]),
        }
        tags = {u"population": sirutadict[u"population2002"]}

    # consistently add the 1992 census data
    # NOTE(review): the value is read from the 'population2002' column -- confirm the year in the key
    tags[u"population:census:1992"] = sirutadict[u"population2002"]

    # NOTE(review): done for new and for existing nodes alike -- confirm this is the intended nesting
    tags[u"place"] = sirutaTypeToPlace(sirutadict[u"siruta:type"], tags[u"population"])

    # clean up siruta:name_sup
    sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])

    # siruta columns which must not end up as tags on the node
    uninteresting = (
        u"lon",
        u"lat",
        u'siruta:rank',
        u"population2002",
        u"region",
        u"siruta:region_id",
        u"siruta:enviro_type",
        u"siruta:sortcode",
    )
    tags.update(dict((key, value) for key, value in sirutadict.items()
                     if key not in uninteresting))

    # is_in goes from the closest containing entity to the country
    simplesup = simpleName(sirutadict[u"siruta:name_sup"])
    is_in = [sirutadict[u"siruta:county"], u"România"]
    tags[u"is_in:country"] = u"România"
    tags[u"is_in:county"] = sirutadict[u"siruta:county"]
    if tags[u"name"] != simplesup:
        is_in.insert(0, simplesup)
    tags[u"is_in"] = u";".join(is_in)

    # prune the created_by tag since is deprecated to have it on the
    # node and the changeset contains that info anyway
    tags.pop(u"created_by", None)

    node[u"tag"] = tags

    return node
def getSameSiruta(elementlist, sirutacode):
    """Return copies of the elements of 'elementlist' whose 'siruta:code' tag equals 'sirutacode'"""
    matching = []
    for element in elementlist:
        try:
            code = element[u"tag"][u"siruta:code"]
        except KeyError:
            # no tags at all, or no siruta code among them
            continue
        if code == sirutacode:
            matching.append(element.copy())
    return matching
def _importSirutaPlace(api, csvplace, bbox_km):
    """Create or update the node of one siruta CSV entry, or skip the entry when the match is ambiguous"""
    uname = csvplace[u"name"].encode("utf-8")
    print("Processing data for %s ..." % (uname))
    sys.stdout.flush()

    area = getMapAroundPoint(lat=csvplace[u"lat"], lon=csvplace[u"lon"], bbox_km=bbox_km)
    existing_nodes = locatePlaceInXML(area, csvplace[u"name"], csvplace[u"siruta:code"])

    if len(existing_nodes) == 0:
        # node doesn't exist for this place, or is far; we can create the node
        api.NodeCreate(nodeDictForPlace(csvplace))
        print("Created new node for %s" % (uname))

    elif len(existing_nodes) == 1:
        # there is an existing node, so we merge with that; keep a snapshot
        # first, since nodeDictForPlace() changes the existing node in place
        referencenode = existing_nodes[0].copy()
        # dictionaries don't get copied by default
        referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
        nodedict = nodeDictForPlace(csvplace, existing_nodes[0])

        if nodedict == referencenode:
            print("Skipping: No changes needed for node %s" % (uname))
        else:
            api.NodeUpdate(nodedict)
            print("Updated existing node for %s" % (uname))

    else:
        # I am confused, more than one node with the same simplified name
        sys.stderr.write(
            "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s&lon=%s)\n" % (
                uname,
                len(existing_nodes),
                csvplace[u"lat"].encode("utf-8"),
                csvplace[u"lon"].encode("utf-8")))


def readAndProcessSirutaCsv(file, comment=None, source=None, bbox_km=10):
    """reads the input CSV file and processes each entry

    'file' is the path of the siruta CSV file. 'comment' and 'source' become
    the tags of the changeset; when given as byte strings they are decoded as
    UTF-8, when missing defaults are used. 'bbox_km' is passed on to
    getMapAroundPoint() for every entry.

    The CSV file and the changeset are closed even when processing an entry fails.
    """
    with open(file, 'r') as csvfile:
        reader = SirutaDictReader(csvfile)

        # expanduser() also copes with an environment that has no HOME set
        passwordfile = os.path.join(os.path.expanduser('~'), '.config', 'osm-import-loc', 'osm-auth')
        api = OsmApi(passwordfile=passwordfile, appid='RoOsmLocImporter')

        if not comment:
            comment = 'import places from ' + file
        elif isinstance(comment, bytes):
            comment = comment.decode('utf-8')
        if not source:
            source = u"http://geo-spatial.org siruta data import"
        elif isinstance(source, bytes):
            source = source.decode('utf-8')
        cs_tags = {'comment': comment, 'source': source}

        sys.stderr.write("New pocessing started...\n")
        api.ChangesetCreate(cs_tags)
        try:
            for csvplace in reader:
                _importSirutaPlace(api, csvplace, bbox_km)
                sys.stdout.flush()
                sys.stderr.flush()
        finally:
            # do not leave the changeset open behind us when something fails
            api.ChangesetClose()
def usage():
    """Print on the standard output the synopsis of all the options accepted by the script"""
    print("%s [-h|--help] [-c|--comment <comment>] [-s|--source <source>] [-b|--bbox <km>] -i|--input <inputcsv>" % sys.argv[0])
def main(argv=None):
    """Parse the command line in 'argv' (sys.argv when None) and run the import.

    Returns 0 after printing the help, 2 when the command line is not usable,
    and None once readAndProcessSirutaCsv() has finished.
    """
    if argv is None:
        argv = sys.argv

    try:
        # parse the argv we were given, not unconditionally sys.argv
        opts, args = getopt.getopt(argv[1:], "hi:c:s:b:",
                                   ["help", "input=", "comment=", "source=", "bbox="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))
        usage()
        return 2

    inputcsv = None
    comment = None
    source = None
    bbox_km = 10

    for o, a in opts:
        # getopt reports long options with their leading dashes
        if o in ("-h", "--help"):
            usage()
            return 0
        elif o in ("-i", "--input"):
            inputcsv = a
        elif o in ("-s", "--source"):
            source = a
        elif o in ("-c", "--comment"):
            comment = a
        elif o in ("-b", "--bbox"):
            try:
                bbox_km = float(a)
            except ValueError:
                print("Invalid bounding box size '%s' (-b option): a number of km is expected." % a)
                return 2

    if not inputcsv:
        print("Input csv file (-i option) is mandatory. Run the script with -h for online help.")
        return 2

    readAndProcessSirutaCsv(inputcsv, source=source, comment=comment, bbox_km=bbox_km)
# when run as a script, hand the value returned by main() to sys.exit()
if __name__ == "__main__":
    sys.exit(main())