bugfix: allow utf-8 comments and source strings
[osm-ro-tools.git] / OsmLocImport.py
blob545ee11e406a85ad732e3804ab072bb0ee2c125a
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
import getopt
import os
import re
import sys
import unicodedata
from math import cos, radians
from xml.dom import minidom

# 'sys' is imported unconditionally: it is needed by the rest of the module
# (sys.path, sys.argv, sys.stdout, sys.stderr) whether or not the helper
# modules sit in the current working directory.

# When a copy of OsmApi.py exists in the current directory, make it take
# precedence over any other OsmApi module on the import path.
if os.path.exists('OsmApi.py'):
    sys.path.insert(0, '.')
from OsmApi import OsmApi

# NOTE(review): the check looks for 'siruta.py' while the module imported
# below is 'sirutacsv' - confirm which file name is the intended one.
if os.path.exists('siruta.py'):
    if '.' not in sys.path:
        sys.path.insert(0, '.')
from sirutacsv import SirutaDictReader
def stripAccents(s):
    """Returns 's' as unicode text with hyphens turned into spaces and accents removed.

    's' may be unicode text or a UTF-8 encoded byte string; byte strings are
    decoded first. Characters that do not decompose into a base character plus
    combining marks are returned unchanged.
    """

    # On Python 2.6+ 'bytes' is the plain 'str' type, so this decodes byte
    # strings only and leaves unicode text alone, without relying on
    # unicode() raising TypeError to tell the two apart.
    if isinstance(s, (bytes, bytearray)):
        s = s.decode('utf-8')

    s = s.replace(u"-", u" ")

    # NFD splits every accented character into its base character followed by
    # combining marks (category 'Mn'); dropping the marks leaves the base.
    decomposed = unicodedata.normalize('NFD', s)

    # joining on a unicode separator keeps the result unicode even when empty
    return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
def simplifyName(unicode_name):
    """Gives the comparison form of a name: lowercased first, then passed through stripAccents()"""

    return stripAccents(unicode_name.lower())
def getNodesList(osmxmlcontents):
    """Extracts only the nodes from an OSM XML and returns them as a list of OsmApi structures.

    'osmxmlcontents' is the sequence of 'osm' DOM elements of the document; it
    must hold exactly one element.

    Raises ValueError when the sequence is empty or has more than one element.
    """

    if len(osmxmlcontents) > 1:
        raise ValueError('Too many osm blocks in one XML')
    if len(osmxmlcontents) == 0:
        # fail with a clear message instead of an IndexError on [0] below
        raise ValueError('No osm block found in the XML')

    # select only the nodes from the OSM XML
    xmlnodes = osmxmlcontents[0].getElementsByTagName('node')

    # TODO: use an internal data member for the api
    api = OsmApi()

    return [api._DomParseNode(xmlnode) for xmlnode in xmlnodes]
def getMatchingPlaces(osmapielements,placename):
    """Returns the elements of 'osmapielements' that carry a non-empty 'place' tag and whose
    'name' tag has the same simplified form as 'placename'"""

    wanted = simplifyName(placename)
    matches = []

    for element in osmapielements:
        try:
            tags = element[u"tag"]
            if len(tags[u"place"]) == 0:
                # an empty 'place' value does not count as a place
                continue
            if wanted == simplifyName(tags[u"name"]):
                matches.append(element)
        except KeyError:
            # no 'tag', 'place' or 'name' entry: the element is not a named place
            continue

    return matches
def locatePlaceInXML(xml,placename):
    """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
    in the xml document 'xml'. The input xml document is the one returned by the OSM API
    through a query for data within a bounding box.

    'xml' is either the path of an existing file or the XML document itself, as
    unicode text or as an already encoded byte string.

    Returns the list of matching nodes, as produced by getMatchingPlaces().
    """

    if os.path.exists(xml):
        xmldoc = minidom.parse(xml)
    else:
        # Only text needs encoding. Calling encode() on a Python 2 byte string
        # would first decode it as ASCII and fail on any non-ASCII name.
        if not isinstance(xml, bytes):
            xml = xml.encode("utf-8")
        xmldoc = minidom.parseString(xml)

    # each OSM XML has a single root osm element
    osmxmlcontents = xmldoc.getElementsByTagName('osm')

    nodeslist = getNodesList(osmxmlcontents)

    # nodes without any tag cannot be places
    nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]

    return getMatchingPlaces(nodeswithtags, placename)
def getArea ( bbox_str ):
    """Asks the OSM API for the map contained in the bounding box described by 'bbox_str'
    and returns whatever the API call gives back"""

    request = "/api/0.6/map?bbox=" + bbox_str

    # TODO: use an internal data member for the api
    return OsmApi()._get ( request )
def getMapAroundPoint(lon, lat, bbox_km = 10):
    """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
    within a box whose sides are about 'bbox_km' kilometres long.

    'lon' and 'lat' are given in degrees and may be numbers or numeric strings.
    Returns the result of getArea() for the computed bounding box.
    """

    lat = float (lat)
    lon = float (lon)

    # one degree latitude is approximately 111km
    # and we want to know what's half of bbox_km in lat degrees
    delta_lat = bbox_km / 222.0

    # One degree of longitude spans only cos(lat) * 111 km, so covering the
    # same distance takes MORE degrees of longitude: divide by cos(lat).
    cos_lat = cos( radians (lat) )
    if cos_lat > 1e-9:
        delta_lon = min(delta_lat / cos_lat, 180.0)
    else:
        # at the poles every longitude is within reach
        delta_lon = 180.0

    lat_b = lat - delta_lat
    lat_t = lat + delta_lat

    lon_l = lon - delta_lon
    lon_r = lon + delta_lon

    # the bounding box is passed as left,bottom,right,top
    bbox = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )

    return getArea ( bbox )
def simpleName(placename):
    """Drops from the name of a place the markers that indicate its classification;
    only the first occurrence of each marker is removed"""

    stripped = placename
    for marker in (u"Municipiul ", u"Oraș "):
        stripped = stripped.replace(marker, u"", 1)

    return stripped
def sirutaTypeToPlace(sirutarank, population):
    """Maps siruta ranks to proper 'place' values. The siruta types are explained below

    Code  Name of the type of administrative-territorial unit

    40  County, Bucharest municipality
    1   Municipality that is a county seat, seat of Bucharest municipality
    2   Town belonging to a county, other than a county-seat town
    3   Commune
    4   Municipality, other than a county seat
    5   Town that is a county seat
    6   Sector of Bucharest municipality
    9   Component locality, seat of a municipality
    10  Component locality of a municipality, other than the municipality seat
    11  Village belonging to a municipality
    17  Component locality, seat of a town
    18  Component locality of a town, other than the town seat
    19  Village belonging to a town
    22  Village that is a commune seat
    23  Village belonging to a commune, other than the commune seat

    'population' is only looked at for rank 23.
    Raises ValueError for any rank that is not in the table above.
    """

    rank = int(sirutarank)

    groups = (
        # municipalities, county seats, municipality seats
        (u"city", (1, 4, 9, 40)),
        # towns, county-seat towns, town seats
        (u"town", (2, 5, 17)),
        # component localities of towns or municipalities, other than the seats,
        # plus communes, villages of municipalities and towns, commune seats
        # and non-seat villages
        (u"village", (10, 18, 3, 11, 19, 22, 23)),
        # sectors of Bucharest municipality
        (u"sector", (6,)),
    )

    place_by_rank = {}
    for place, ranks in groups:
        for known_rank in ranks:
            place_by_rank[known_rank] = place

    if rank not in place_by_rank:
        raise ValueError("Unexpected rank value in siruta data")

    # only non-seat villages that belong to communes can be hamlets
    if rank == 23 and int(population) < 50:
        return u"hamlet"

    return place_by_rank[rank]
def _samePostalCode(existing, retired):
    """True when both values parse as integers and are equal, False otherwise"""
    try:
        return int(existing) == int(retired)
    except (TypeError, ValueError):
        # free-form, empty or missing codes can never be told to be the same
        return False


def nodeDictForPlace(sirutadict, oldnode = None):
    """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
    the existing data which is present in 'oldnode'.

    Side effects: 'sirutadict' is modified in place (an empty 'population2002'
    becomes u"0" and 'siruta:name_sup' is replaced by its simpleName() form).
    When 'oldnode' is given, it and its tag dictionary are updated in place
    and 'oldnode' itself is returned; otherwise a new dictionary is returned.
    """

    node = {}
    tags = {}

    # it seems some of the input contains no data for the population and the rank,
    # which probably means 0 for population, and we don't care for rank
    if sirutadict[u"population2002"] == u"":
        sirutadict[u"population2002"] = u"0"

    if oldnode:
        node = oldnode
        tags = oldnode[u"tag"]

        if u"population" not in tags:
            tags[u"population"] = sirutadict[u"population2002"]

        # Drop postal codes that merely repeat the old code from the input.
        # Existing tags are free-form text, so the comparison must not abort
        # the import when either side is not a plain number.
        old_postal_code = sirutadict.get(u"old_postal_code")
        for key in (u"postal_code", u"addr:postcode"):
            if key in tags and _samePostalCode(tags[key], old_postal_code):
                tags.pop(key)

    else:
        node[u"lat"] = float(sirutadict[u"lat"])
        node[u"lon"] = float(sirutadict[u"lon"])
        tags[u"population"] = sirutadict[u"population2002"]

    # NOTE(review): the tag key says census 1992 while the value comes from the
    # 'population2002' field - confirm which of the two is intended.
    tags[u"population:census:1992"] = sirutadict[u"population2002"]

    # this should probably be ran even for existing nodes
    tags[u"place"] = sirutaTypeToPlace(sirutadict[u"siruta:type"], tags[u"population"])

    # clean up siruta:name_sup
    sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])

    # input fields that must not end up as tags on the node
    uninteresting = (
        u"lon",
        u"lat",
        u'siruta:rank',
        u"population2002",
        u"region",
        u"siruta:region_id",
        u"siruta:enviro_type",
        u"siruta:sortcode",
    )

    mergetags = dict((key, value) for key, value in sirutadict.items()
                     if key not in uninteresting)
    tags.update(mergetags)

    # is_in lists the enclosing entities from the smallest to the largest
    simplesup = simpleName(sirutadict[u"siruta:name_sup"])
    is_in = [u"România"]
    tags[u"is_in:country"] = u"România"
    is_in.insert(0,sirutadict[u"siruta:county"])
    tags[u"is_in:county"] = sirutadict[u"siruta:county"]
    # the superior unit is only listed when it is not the place itself
    if tags[u"name"] != simplesup:
        is_in.insert(0,simplesup)

    tags[u"is_in"] = u";".join(is_in)

    node[u"tag"] = tags

    return node
def readAndProcessSirutaCsv(file, comment = None, source = None):
    """Reads the input CSV file and processes each entry.

    For every place in the file, the map around its coordinates is fetched and
    searched for nodes with the same simplified name: with no match a new node
    is created, with exactly one match that node is updated if anything would
    change, and with several matches the place is skipped with a message on
    the standard error.

    'file' is the path of the CSV file. 'comment' and 'source' become the tags
    of the changeset; both may be unicode text or UTF-8 byte strings, and both
    get a default value when empty or None.
    """

    # Byte strings (e.g. command line arguments on Python 2) are decoded as
    # UTF-8; text that is already unicode is used as it is.
    if not comment:
        comment = 'import places from ' + file
    elif isinstance(comment, bytes):
        comment = comment.decode('utf-8')
    if not source:
        source = u"http://geo-spatial.org siruta data import"
    elif isinstance(source, bytes):
        source = source.decode('utf-8')
    cs_tags = { 'comment' : comment , 'source' : source }

    # expanduser() uses HOME when it is set and still works when it is not
    homedir = os.path.expanduser('~')

    csvfile = open (file, 'r')
    try:
        reader = SirutaDictReader( csvfile )

        api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
        api.ChangesetCreate(cs_tags)

        try:
            for csvplace in reader:

                uname = csvplace[u"name"].encode("utf-8")
                print("Processing data for %s ..." % ( uname ))
                sys.stdout.flush()
                area_xml = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
                existing_nodes = locatePlaceInXML ( area_xml, csvplace[u"name"] )

                if len(existing_nodes) == 0:
                    # node doesn't exist for this place, or is far; we can create the node
                    nodedict = nodeDictForPlace ( csvplace )

                    api.NodeCreate(nodedict)
                    print("Created new node for %s" % ( uname ))

                elif len(existing_nodes) == 1:
                    # there is an existing node, so we merge with that;
                    # nodeDictForPlace updates the node it is given in place,
                    # so a snapshot is taken first to detect actual changes
                    referencenode = existing_nodes[0].copy()
                    # dictionaries don't get copied by default
                    referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
                    nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )

                    if nodedict == referencenode:
                        print("Skipping: No changes needed for node %s" % ( uname ))
                    else:
                        api.NodeUpdate(nodedict)
                        print("Updated existing node for %s" % ( uname ))

                else:
                    # I am confused, more than one node with the same simplified name
                    message = "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
                        uname,
                        len(existing_nodes),
                        csvplace[u"lat"],
                        csvplace[u"lon"] )
                    sys.stderr.write(message + "\n")

                sys.stdout.flush()
                sys.stderr.flush()
        finally:
            # close the changeset even when processing stops half way
            api.ChangesetClose()
    finally:
        csvfile.close()
def usage():
    """Prints the command line synopsis of the script on the standard output"""
    synopsis = "%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0]
    print(synopsis)
def main(argv = None):
    """Command line entry point: parses the options and runs the import.

    'argv' is the full argument list including the program name at index 0;
    sys.argv is used when it is None.

    Returns 0 on success or after printing the help, and 2 when the options
    are invalid or the mandatory input file is missing.
    """

    if argv is None:
        argv = sys.argv

    try:
        # parse the arguments that were handed in, not always sys.argv
        opts, args = getopt.getopt(argv[1:], "hi:c:s:",
                                   ["help", "input=", "comment=", "source="] )
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))
        usage()
        return 2

    inputcsv = None
    comment = None
    source = None

    for o,a in opts:
        # getopt reports long options with their leading dashes
        if o in ("-h", "--help"):
            usage()
            return 0
        elif o in ("-i", "--input"):
            inputcsv = a
        elif o in ("-s", "--source"):
            source = a
        elif o in ("-c", "--comment"):
            comment = a

    if not inputcsv:
        print("Input csv file (-i option) is mandatory. Run the script with -h for online help.")
        return 2

    readAndProcessSirutaCsv(inputcsv, source=source, comment=comment)
    return 0
# when run as a script, the value returned by main() becomes the exit status
if __name__ == "__main__":
    raise SystemExit(main())