Isolate the code that identifies the nodes sharing the same SIRUTA code into a function
[osm-ro-tools.git] / OsmLocImport.py
blobc8935e4322aff91ab3f01018b9fa2bb1cb6a421c
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
4 import re
5 import unicodedata
6 from xml.dom import minidom
import os
# sys is needed unconditionally further down (sys.path, sys.stdout, sys.exit),
# so it must not be imported only when a local OsmApi.py happens to exist
import sys

# prefer a copy of OsmApi.py found in the current directory over an installed one
if os.path.exists('OsmApi.py'):
    sys.path.insert(0,'.')
from OsmApi import OsmApi

from math import cos, radians

# NOTE(review): the check looks for 'siruta.py' while the module imported below
# is 'sirutacsv' - confirm which file name is the intended one
if os.path.exists('siruta.py'):
    if not '.' in sys.path:
        sys.path.insert(0,'.')
from sirutacsv import SirutaDictReader
21 import getopt
def stripAccents(s):
    """Strip any accents that may exist in a text and leave only the base characters.

    's' is either a unicode object or a UTF-8 encoded byte string; byte strings
    are decoded first. Hyphens are replaced by spaces, then the text is
    decomposed (NFD) and every combining mark (category 'Mn') is dropped.

    Returns a unicode object."""

    # explicit type test instead of 'unicode(s, "utf-8")' guarded by TypeError:
    # same result on Python 2, and it also works where 'unicode' does not exist
    if isinstance(s, (bytes, bytearray)):
        s = s.decode('utf-8')

    s = s.replace(u"-", u" ")
    decomposed = unicodedata.normalize('NFD', s)
    return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
def simplifyName(unicode_name):
    """Returns the lowercase, accent-free form of 'unicode_name', suitable for comparing names"""
    # lowercase first, then let stripAccents drop the accents (and the hyphens)
    return stripAccents(unicode_name.lower())
def getNodesList(osmxmlcontents):
    """Returns the list of nodes found in an OSM XML, each one parsed by OsmApi.

    'osmxmlcontents' is the list of 'osm' DOM elements of the document; a
    ValueError is raised when it holds more than one element."""

    if len(osmxmlcontents) > 1:
        raise ValueError('Too many osm blocks in one XML')

    # only the 'node' elements of the osm block are of interest
    nodeelements = osmxmlcontents[0].getElementsByTagName ('node')

    # TODO: use an internal data member for the api
    parser = OsmApi()
    return [ parser._DomParseNode(element) for element in nodeelements ]
def getMatchingPlaces(osmapielements,placename):
    """Returns the elements of 'osmapielements' that carry a non-empty 'place' tag and whose
    'name' tag has the same simplified form as 'placename'"""

    wanted = simplifyName (placename)
    matches = []

    for element in osmapielements:
        try:
            tags = element[u"tag"]
            isplace = len( tags[u"place"] ) > 0
            if isplace and wanted == simplifyName(tags[u"name"]):
                matches.append(element)
        except KeyError:
            # elements lacking the 'place' or the 'name' tag are uninteresting
            continue

    return matches
def locatePlaceInXML(xml,placename):
    """Returns the nodes of the OSM XML 'xml' which have a 'place' tag and whose name looks
    like 'placename'.

    'xml' is either the path of an existing file or the XML text itself, e.g. the answer of
    the OSM API to a query for the data within a bounding box."""

    if os.path.exists(xml):
        document = minidom.parse(xml)
    else:
        document = minidom.parseString(xml.encode("utf-8"))

    # each OSM XML has a single root osm element
    parsednodes = getNodesList(document.getElementsByTagName('osm'))

    # nodes without any tag cannot be places
    tagged = []
    for candidate in parsednodes:
        if len(candidate[u"tag"]) > 0:
            tagged.append(candidate)

    return getMatchingPlaces(tagged, placename)
def getArea ( bbox_str ):
    """Returns what the API answers for the map inside the bounding box 'bbox_str'.

    'bbox_str' is appended verbatim as the value of the 'bbox' query parameter."""

    request = "/api/0.6/map?bbox=" + bbox_str

    # TODO: use an internal data member for the api
    return OsmApi()._get ( request )
def getMapAroundPoint(lon, lat, bbox_km = 10):
    """Given the latitude 'lat' and longitude 'lon' (numbers or numeric strings), get from the
    API the map around that point within a box whose sides are about 'bbox_km' kilometres.

    Returns whatever getArea returns for the computed bounding box."""

    lat = float (lat)
    lon = float (lon)

    # one degree latitude is approximately 111km
    # and we want to know what's half of bbox_km in lat degrees
    delta_lat = bbox_km / 222.0

    # one degree longitude is only cos(lat) * 111km long, so the same distance
    # spans MORE degrees of longitude than of latitude: divide by cos(lat)
    # (multiplying, as was done before, made the box too narrow)
    cos_lat = cos( radians (lat) )
    if cos_lat > 0:
        # cap the half-width so that it stays finite next to the poles
        delta_lon = min( delta_lat / cos_lat, 180.0 )
    else:
        delta_lon = 180.0

    lat_b = lat - delta_lat
    lat_t = lat + delta_lat

    lon_l = lon - delta_lon
    lon_r = lon + delta_lon

    # the bbox is expressed as left,bottom,right,top
    path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )

    area_xml_string = getArea ( path )

    return area_xml_string
def simpleName(placename):
    """Removes from the name of a place any leading prefix that indicates its classification.

    The prefixes "Municipiul " and "Oraș " are removed, in this order, and only when the name
    starts with them; the same words appearing later in the name are left alone.
    Returns the name without the prefixes."""

    prefixes = (
        u"Municipiul ",
        u"Ora\u0219 ",    # "Oraș " spelled with s-comma-below
        u"Ora\u015f ",    # the same word in the legacy s-cedilla spelling
    )

    simpleplacename = placename
    for prefix in prefixes:
        if simpleplacename.startswith(prefix):
            simpleplacename = simpleplacename[len(prefix):]

    return simpleplacename
def sirutaTypeToPlace(sirutarank, population):
    """Maps siruta ranks to proper 'place' values. The siruta types are explained below

    Code    Name of the type of administrative-territorial unit

    40      County, Bucharest municipality
    1       Municipality that is a county seat, seat of the Bucharest municipality
    2       Town belonging to a county, other than a county seat town
    3       Commune
    4       Municipality, other than a county seat
    5       Town that is a county seat
    6       Sector of the Bucharest municipality
    9       Component locality, seat of a municipality
    10      Component locality of a municipality, other than the seat of the municipality
    11      Village belonging to a municipality
    17      Component locality, seat of a town
    18      Component locality of a town, other than the seat of the town
    19      Village belonging to a town
    22      Village that is a commune seat
    23      Village belonging to a commune, other than the commune seat

    'sirutarank' and 'population' are numbers or numeric strings; 'population' is only
    looked at for rank 23. Raises ValueError for a rank missing from the table above.
    """

    rank = int(sirutarank)

    # municipalities, county seats, municipality seats
    if rank in ( 1, 4, 9, 40 ):
        return u"city"
    # towns, towns that are county seats, town seats
    if rank in ( 2, 5, 17 ):
        return u"town"
    # component localities of towns or municipalities, other than the seats
    if rank in ( 10, 18 ):
        return u"village"
    # communes, villages of municipalities, villages of towns, commune seats, non-seat villages
    if rank in ( 3, 11, 19, 22, 23 ):
        # only the non-seat villages belonging to communes can be hamlets
        if rank == 23 and int(population) < 50:
            return u"hamlet"
        return u"village"
    # the sectors of the Bucharest municipality
    if rank == 6:
        return u"sector"

    # call form of raise: valid on both Python 2 and Python 3
    raise ValueError("Unexpected rank value in siruta data")
def _samePostalCode(first, second):
    """Tells whether two postal codes are numerically equal; a code which is not a plain
    integer never matches"""
    try:
        return int(first) == int(second)
    except (TypeError, ValueError):
        return False


def nodeDictForPlace(sirutadict, oldnode = None):
    """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into
    account the existing data which is present in 'oldnode'.

    'sirutadict' is one entry of the siruta data and is left unmodified.
    When 'oldnode' is given (and not empty) it is updated in place and returned: an existing
    'population' tag is kept, and a 'postal_code' / 'addr:postcode' tag equal to the old
    postal code of the entry is dropped. Otherwise a new node is built from the 'lat' and
    'lon' of the entry.

    Returns the node dictionary, with the tags stored under the 'tag' key."""

    # work on a copy so that the caller's entry is not altered
    place = sirutadict.copy()

    # it seems some of the input contains no data for the population and the rank,
    # which probably means 0 for population, and we don't care for rank
    if place[u"population2002"] == u"":
        place[u"population2002"] = u"0"

    if oldnode:
        node = oldnode
        tags = oldnode[u"tag"]

        if u"population" not in tags:
            tags[u"population"] = place[u"population2002"]

        # existing tags come from hand-edited OSM data and may hold anything,
        # hence the tolerant comparison instead of a bare int()
        for key in ( u"postal_code", u"addr:postcode" ):
            if key in tags and _samePostalCode(tags[key], place[u"old_postal_code"]):
                tags.pop(key)
    else:
        node = {}
        node[u"lat"] = float(place[u"lat"])
        node[u"lon"] = float(place[u"lon"])
        tags = {}
        tags[u"population"] = place[u"population2002"]

    # NOTE(review): the key names the 1992 census while the value is the
    # 'population2002' figure - confirm which of the two is intended
    tags[u"population:census:1992"] = place[u"population2002"]

    # this should probably be ran even for existing nodes
    tags[u"place"] = sirutaTypeToPlace(place[u"siruta:type"], tags[u"population"])

    # clean up siruta:name_sup
    simplesup = simpleName(place[u"siruta:name_sup"])
    place[u"siruta:name_sup"] = simplesup

    # fields of the entry which must not end up as tags
    uninteresting = (
        u"lon",
        u"lat",
        u'siruta:rank',
        u"population2002",
        u"region",
        u"siruta:region_id",
        u"siruta:enviro_type",
        u"siruta:sortcode",
    )

    # every other field of the entry becomes (or overwrites) a tag
    mergetags = dict( (key, value) for key, value in place.items() if key not in uninteresting )
    tags.update(mergetags)

    # is_in goes from the most specific to the most general container
    is_in = [ place[u"siruta:county"], u"România" ]
    tags[u"is_in:country"] = u"România"
    tags[u"is_in:county"] = place[u"siruta:county"]
    if tags[u"name"] != simplesup:
        is_in.insert(0,simplesup)

    tags[u"is_in"] = u";".join(is_in)

    node[u"tag"] = tags

    return node
def getSameSiruta(elementlist, sirutacode):
    """Returns shallow copies of the elements of 'elementlist' whose 'siruta:code' tag equals
    'sirutacode'; elements without that tag are ignored"""

    matches = []
    for element in elementlist:
        try:
            code = element[u"tag"][u"siruta:code"]
        except KeyError:
            # no tags at all, or no siruta code among them
            continue
        if code == sirutacode:
            matches.append(element.copy())

    return matches
def readAndProcessSirutaCsv(file, comment = None, source = None):
    """Reads the input CSV file and processes each entry inside a single changeset.

    'file' is the path of the siruta CSV file. 'comment' and 'source' are the values of the
    changeset tags with the same names (UTF-8 byte strings or unicode); defaults are used
    when they are missing.

    For each entry the map around its coordinates is fetched and searched for places with
    the same simplified name: with no match a new node is created, with exactly one match
    (directly, or after filtering by siruta code) that node is updated if anything changed,
    otherwise the entry is skipped with a message on stderr.

    The changeset is closed and the CSV file is closed even when an entry fails."""

    csvfile = open (file, 'r')
    try:
        reader = SirutaDictReader( csvfile )

        # expanduser also works when the HOME environment variable is not set
        passwordfile = os.path.join(os.path.expanduser('~'), '.config', 'osm-import-loc', 'osm-auth')
        api = OsmApi(passwordfile = passwordfile, appid = 'RoOsmLocImporter')

        if not comment:
            comment = 'import places from ' + file
        elif isinstance(comment, bytes):
            comment = comment.decode('utf-8')
        if not source:
            source = u"http://geo-spatial.org siruta data import"
        elif isinstance(source, bytes):
            source = source.decode('utf-8')
        cs_tags = { 'comment' : comment , 'source' : source }

        api.ChangesetCreate(cs_tags)
        try:
            for csvplace in reader:

                # the messages are written with the name encoded as UTF-8
                uname = csvplace[u"name"].encode("utf-8")
                print("Processing data for %s ..." % ( uname ))
                sys.stdout.flush()
                area_xml = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
                existing_nodes = locatePlaceInXML ( area_xml, csvplace[u"name"] )

                if len(existing_nodes) == 0:
                    # node doesn't exist for this place, or is far; we can create the node
                    nodedict = nodeDictForPlace ( csvplace )

                    api.NodeCreate(nodedict)
                    print("Created new node for %s" % ( uname ))

                elif len(existing_nodes) > 1:
                    # I am confused, more than one node with the same simplified name
                    # try to see if there's already a siruta code attached

                    newlist = getSameSiruta( existing_nodes, csvplace[u"siruta:code"] )
                    if len(newlist) == 1:
                        existing_nodes = newlist
                    else:
                        sys.stderr.write(
                            "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
                                uname,
                                len(existing_nodes),
                                csvplace[u"lat"].encode("utf-8"),
                                csvplace[u"lon"].encode("utf-8") ) + "\n" )

                if len(existing_nodes) == 1:
                    # there is an existing node, so we merge with that;
                    # keep an untouched snapshot to detect whether the merge changed anything
                    referencenode = existing_nodes[0].copy()
                    # dictionaries don't get copied by default
                    referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
                    nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )

                    if nodedict == referencenode:
                        print("Skipping: No changes needed for node %s" % ( uname ))
                    else:
                        api.NodeUpdate(nodedict)
                        print("Updated existing node for %s" % ( uname ))

                sys.stdout.flush()
                sys.stderr.flush()
        finally:
            # do not leave the changeset open when an entry fails
            api.ChangesetClose()
    finally:
        csvfile.close()
def usage():
    """Prints the command line synopsis of the script on the standard output"""
    # single-argument print(...) gives the same output on Python 2 and Python 3
    print("%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0])
def main(argv = None):
    """Parses the command line and runs the import.

    'argv' is the full argument list, program name included; sys.argv is used when it is
    missing. Returns the exit status: 0 after printing the help, 2 on wrong usage, None
    after an import was run."""

    if argv is None:
        argv = sys.argv

    try:
        # parse the arguments that were handed in, not unconditionally sys.argv
        opts, args = getopt.getopt(argv[1:], "hi:c:s:",
            ["help", "input=", "comment=", "source="] )
    except getopt.GetoptError:
        # sys.exc_info() instead of 'except X, err' / 'except X as err', so that
        # the same source is valid on every Python version
        err = sys.exc_info()[1]
        # print help information and exit:
        print(str(err))
        usage()
        return 2

    inputcsv = None
    comment = None
    source = None

    for o,a in opts:
        # getopt reports long options with their leading dashes
        if o in ("-h", "--help"):
            usage()
            return 0
        elif o in ("-i", "--input"):
            inputcsv = a
        elif o in ("-s", "--source"):
            source = a
        elif o in ("-c", "--comment"):
            comment = a

    if not inputcsv:
        print("Input csv file (-i option) is mandatory. Run the script with -h for online help.")
        return 2

    readAndProcessSirutaCsv(inputcsv, source=source, comment=comment)
# run the import only when executed as a script (not when imported as a module);
# the value returned by main() becomes the exit status of the process
if __name__ == "__main__":
    sys.exit(main())