OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   4 import re
   5 import unicodedata
   6 from xml.dom import minidom
   7
   8 import os
   9 if os.path.exists('OsmApi.py'):
  10         import sys
  11         sys.path.insert(0,'.')
  12 from OsmApi import OsmApi
  13
  14 from math import cos, radians
  15
  16 if os.path.exists('siruta.py'):
  17         if not '.' in sys.path:
  18                 sys.path.insert(0,'.')
  19 from sirutacsv import SirutaDictReader
  20
  21
  22
  23 def stripAccents(s):
  24         """strip any accents that may exist in a unicode object and leave only the base ASCII char"""
  25
  26         try:
  27                 s = unicode(s,'utf-8')
  28         except TypeError:
  29                 pass
  30         return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  31
  32
  33 def simplifyName(unicode_name):
  34         """a function to turn into lowercase ASCII any name (by stripping accents)"""
  35
  36         simplename = unicode_name.lower()
  37         simplename = stripAccents(simplename)
  38
  39         return simplename
  40
  41 def getNodesList(osmxmlcontents):
  42         """extracts only the nodes from an OSM XML and stores them in a OsmApi structure"""
  43
  44         if len(osmxmlcontents) > 1:
  45                 raise ValueError('Too many osm blocks in one XML')
  46
  47         # select only the nodes from the OSM XML
  48         xmlnodes = osmxmlcontents[0].getElementsByTagName ('node')
  49
  50         # TODO: use an internal data member for the api
  51         api = OsmApi()
  52         osmnodes = []
  53         for xmlnode in xmlnodes:
  54                 osmnode = api._DomParseNode(xmlnode)
  55                 osmnodes.append(osmnode)
  56
  57         return osmnodes
  58
  59
  60 def getMatchingPlaces(osmapielements,placename):
  61         """filters from 'osmapielements' only the places with the same simplified name as 'placename'"""
  62         list = []
  63
  64         sname = simplifyName (placename)
  65
  66         for i in osmapielements:
  67                 try:
  68                         if len( i[u"tag"][u"place"] ) > 0 and \
  69                                 sname == simplifyName(i[u"tag"][u"name"]):
  70                                 list.append(i)
  71                 except KeyError:
  72                         # that node didn't have the 'place' or a 'name' tag, so is uninteresting
  73                         pass
  74
  75         return list
  76
  77
  78 def locatePlaceInXML(xml,placename):
  79         """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
  80 into the xml document 'xml'. The input xml document is the one returned by the OSM API
  81 through a query for data within a bounding box"""
  82
  83         if os.path.exists(xml):
  84                 xmldoc = minidom.parse(xml)
  85         else:
  86                 xmldoc = minidom.parseString(xml.encode("utf-8"))
  87
  88         # each OSM XML has a single root osm element
  89         osmxmlcontents = xmldoc.getElementsByTagName('osm')
  90
  91         nodeslist = getNodesList(osmxmlcontents)
  92
  93         nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]
  94
  95         places = getMatchingPlaces(nodeswithtags, placename)
  96
  97         return places
  98
  99 def getArea ( bbox_str ):
 100         """Given the bounding box defined by the bbox_str string, return the map within that bbox"""
 101
 102         path = "/api/0.6/map?bbox=" + bbox_str
 103
 104         # TODO: use an internal data member for the api
 105         api = OsmApi()
 106         data = api._get ( path )
 107
 108         return data
 109
 110 def getMapAroundPoint(lon, lat, bbox_km = 10):
 111         """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
 112 within a bbox_km area"""
 113
 114         # one degree latitude is approximately 111km
 115         # and we want to know what's half of bbox_km in lat degrees
 116         delta_lat = bbox_km / 222.0
 117
 118         lat = float (lat)
 119         lon = float (lon)
 120         # one degree longitude is a cos(lat) * 111
 121         # and we want to know what's half of bbox_km in lon degrees
 122         delta_lon = cos( radians (lat) ) * delta_lat
 123
 124         lat_b = lat - delta_lat
 125         lat_t = lat + delta_lat
 126
 127         lon_l = lon - delta_lon
 128         lon_r = lon + delta_lon
 129
 130
 131         path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )
 132
 133         area_xml_string = getArea ( path )
 134
 135         return area_xml_string
 136
 137 def simpleName(placename):
 138         """Removes from a name of a place any prefix that indicates its clasification"""
 139         simpleplacename = placename.lstrip(u"Municipiul ")
 140         simpleplacename = simpleplacename.lstrip(u"Oraș ")
 141
 142         return simpleplacename
 143
 144 def sirutaRankToPlace(sirutarank, population):
 145         """Maps siruta ranks to proper 'place' values. The siruta types are explained bellow
 146
 147 Cod     Denumire tip de unitate administrativ teritorială
 148
 149  40     Judeţ, municipiul Bucureşti
 150   1     Municipiu  reşedinţă de judeţ, reşedinţă a municipiului Bucureşti
 151   2     Oraş ce aparţine de judeţ, altul decât oraş  reşedinţă de judeţ
 152   3     Comună
 153   4     Municipiu, altul decât reşedinţă de judeţ
 154   5     Oraş reşedinţă de judeţ
 155   6     Sector al  municipiului Bucureşti
 156   9     Localitate  componentă,  reşedinţă de  municipiu
 157  10     Localitate componentă, a unui municipiu alta decât reşedinţă de municipiu
 158  11     Sat ce aparţine de municipiu
 159  17     Localitate componentă reşedinţă a oraşului
 160  18     Localitate  componentă a unui oraş, alta decât reşedinţă de  oraş
 161  19     Sat care aparţine  unui oraş
 162  22     Sat reşedinţă de comună
 163  23     Sat ce aparţine de comună, altul  decât reşedinţă de comună
 164 """
 165
 166         rank = int(sirutarank)
 167
 168         # municipii, reședințe de județ, reședințe de municipiu
 169         if rank in [ 1, 4, 9, 40 ]:
 170                 return u"city"
 171         # orașe, orașe reședință de județ, reședințe ale orașelor
 172         if rank in [ 2, 5, 17 ]:
 173                 return u"town"
 174         # localități componente ale orașelor sau municpiilor, altele decât reședințele
 175         if rank in [ 10, 18 ]:
 176                 return u"village"
 177         # comune, sate parte din municipii, sate parte din orașe, reședințe de comună, sate non-reședință
 178         if rank in [ 3, 11, 19, 22, 23 ]:
 179                 # doar satele non-reședință ce aparțin de comune pot fi cătune (hamlet)
 180                 if rank == 23 and int(population) < 50:
 181                         return u"hamlet"
 182                 else:
 183                         return u"village"
 184         # sectoarele municipiului București
 185         if rank in [ 6 ]:
 186                 return u"sector"
 187
 188         raise ValueError, "Unexpected rank value in siruta data"
 189
 190 def nodeDictForPlace(sirutadict, oldnode = None):
 191         """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
 192 the existing data which is present in 'oldnode'"""
 193
 194         node = {}
 195         tags = {}
 196
 197         ## XXX: for some reason this doesn't seem to work, we'll use uninteresting
 198         #tags[u"name"] = sirutadict[u"name"]
 199         if oldnode:
 200                 node = oldnode
 201                 tags = oldnode[u"tag"]
 202
 203                 if not u"population" in tags:
 204                         tags[u"population"] = sirutadict[u"population2002"]
 205
 206                 if u"postal_code" in tags:
 207                         if int(tags[u"postal_code"]) == int(sirutadict[u"old_postal_code"]):
 208                                 tags.pop(u"postal_code")
 209
 210                 if u"addr:postcode" in tags:
 211                         if int(tags[u"addr:postcode"]) == int(sirutadict[u"old_postal_code"]):
 212                                 tags.pop(u"addr:postcode")
 213
 214
 215         else:
 216                 node[u"lat"] = float(sirutadict[u"lat"])
 217                 node[u"lon"] = float(sirutadict[u"lon"])
 218                 tags[u"population"] = sirutadict[u"population2002"]
 219
 220         # consistently add the 2002 census data
 221         tags[u"population:census:2002"] = sirutadict[u"population2002"]
 222
 223         # this should probably be ran even for existing nodes
 224         tags[u"place"] = sirutaRankToPlace(sirutadict[u"siruta:type"], tags[u"population"])
 225
 226         # clean up siruta:name_sup
 227         sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])
 228
 229         uninteresting = [ \
 230                                 u"lon", \
 231                                 u"lat", \
 232                                 u'siruta:rank', \
 233                                 u"population2002", \
 234                                 u"region", \
 235                                 u"siruta:region_id", \
 236                                 u"siruta:enviro_type", \
 237                                 u"siruta:sortcode" ]
 238
 239         mergetags = sirutadict.copy()
 240         for tag in sirutadict:
 241                 if tag in uninteresting:
 242                         mergetags.pop(tag)
 243
 244         tags.update(mergetags)
 245
 246         simplesup = simpleName(sirutadict[u"siruta:name_sup"])
 247         if u"is_in" in tags:
 248                 is_in = [ x.strip() for x in tags[u"is_in"].replace(",", ";").split(";") ]
 249                 nextpos = 0
 250                 if (not simplesup in is_in) and (tags[u"name"] <> simplesup):
 251                         is_in.insert(0,simplesup)
 252                         nextpos = 1
 253                 if not sirutadict[u"siruta:county"] in is_in:
 254                         is_in.insert(nextpos,sirutadict[u"siruta:county"])
 255         else:
 256                 is_in = [u"România"]
 257                 is_in.insert(0,sirutadict[u"siruta:county"])
 258                 if tags[u"name"] <> simplesup:
 259                         is_in.insert(0,simplesup)
 260
 261         tags[u"is_in"] = u";".join(is_in)
 262
 263         node[u"tag"] = tags
 264
 265         return node
 266
 267
 268 def readAndProcessSirutaCsv(file, comment = None, source = u"geo-spatial.org"):
 269         """reads the input CSV file and processes each entry"""
 270
 271         csvfile = open (file, 'r')
 272         reader = SirutaDictReader( csvfile )
 273
 274         homedir = os.environ['HOME']
 275         api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
 276
 277         if not comment:
 278                 comment = 'import places from ' + file
 279         cs_tags = { 'comment' : comment , 'source' : source }
 280
 281         api.ChangesetCreate(cs_tags)
 282
 283         for csvplace in reader:
 284
 285                 print "Processing data for %s ..." % ( csvplace[u"name"] )
 286                 map = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
 287                 existing_nodes = locatePlaceInXML ( map, csvplace[u"name"] )
 288
 289                 if len(existing_nodes) == 0:
 290                         # node doesn't exist for this place, or is far; we can create the node
 291                         nodedict = nodeDictForPlace ( csvplace )
 292
 293                         api.NodeCreate(nodedict)
 294                         print "Created new node for %s" % ( csvplace[u"name"] )
 295
 296                 elif len(existing_nodes) == 1:
 297                         # there is an existing code, so we merge with that
 298                         referencenode = existing_nodes[0].copy()
 299                         nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )
 300
 301                         if nodedict == referencenode:
 302                                 print u"Skipping: No changes needed for node %s" % ( csvplace[u"name"] )
 303                         else:
 304                                 api.NodeUpdate(nodedict)
 305                                 print "Updated existing node for %s" % ( csvplace[u"name"] )
 306
 307                 else:
 308                         # I am confused, more than one node with the same simplified name
 309
 310                         print >> sys.stderr, u"Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
 311                                         csvplace[u"name"],
 312                                         len(existing_nodes),
 313                                         csvplace[u"lat"],
 314                                         csvplace[u"lon"] )
 315
 316         api.ChangesetClose()