OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   4 import re
   5 import unicodedata
   6 from xml.dom import minidom
   7
   8 import os
   9 if os.path.exists('OsmApi.py'):
  10         import sys
  11         sys.path.insert(0,'.')
  12 from OsmApi import OsmApi
  13
  14 from math import cos, radians
  15
  16 if os.path.exists('siruta.py'):
  17         if not '.' in sys.path:
  18                 sys.path.insert(0,'.')
  19 from sirutacsv import SirutaDictReader
  20
  21
  22
  23 def stripAccents(s):
  24         """strip any accents that may exist in a unicode object and leave only the base ASCII char"""
  25
  26         try:
  27                 s = unicode(s,'utf-8')
  28         except TypeError:
  29                 pass
  30         return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  31
  32
  33 def simplifyName(unicode_name):
  34         """a function to turn into lowercase ASCII any name (by stripping accents)"""
  35
  36         simplename = unicode_name.lower()
  37         simplename = stripAccents(simplename)
  38
  39         return simplename
  40
  41 def getNodesList(osmxmlcontents):
  42         """extracts only the nodes from an OSM XML and stores them in a OsmApi structure"""
  43
  44         if len(osmxmlcontents) > 1:
  45                 raise ValueError('Too many osm blocks in one XML')
  46
  47         # select only the nodes from the OSM XML
  48         xmlnodes = osmxmlcontents[0].getElementsByTagName ('node')
  49
  50         # TODO: use an internal data member for the api
  51         api = OsmApi()
  52         osmnodes = []
  53         for xmlnode in xmlnodes:
  54                 osmnode = api._DomParseNode(xmlnode)
  55                 osmnodes.append(osmnode)
  56
  57         return osmnodes
  58
  59
  60 def getMatchingPlaces(osmapielements,placename):
  61         """filters from 'osmapielements' only the places with the same simplified name as 'placename'"""
  62         list = []
  63
  64         sname = simplifyName (placename)
  65
  66         for i in osmapielements:
  67                 try:
  68                         if len( i[u"tag"][u"place"] ) > 0 and \
  69                                 sname == simplifyName(i[u"tag"][u"name"]):
  70                                 list.append(i)
  71                 except KeyError:
  72                         # that node didn't have the 'place' or a 'name' tag, so is uninteresting
  73                         pass
  74
  75         return list
  76
  77
  78 def locatePlaceInXML(xml,placename):
  79         """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
  80 into the xml document 'xml'. The input xml document is the one returned by the OSM API
  81 through a query for data within a bounding box"""
  82
  83         if os.path.exists(xml):
  84                 xmldoc = minidom.parse(xml)
  85         else:
  86                 xmldoc = minidom.parseString(xml.encode("utf-8"))
  87
  88         # each OSM XML has a single root osm element
  89         osmxmlcontents = xmldoc.getElementsByTagName('osm')
  90
  91         nodeslist = getNodesList(osmxmlcontents)
  92
  93         nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]
  94
  95         places = getMatchingPlaces(nodeswithtags, placename)
  96
  97         return places
  98
  99 def getArea ( bbox_str ):
 100         """Given the bounding box defined by the bbox_str string, return the map within that bbox"""
 101
 102         path = "/api/0.6/map?bbox=" + bbox_str
 103
 104         # TODO: use an internal data member for the api
 105         api = OsmApi()
 106         data = api._get ( path )
 107
 108         return data
 109
 110 def getMapAroundPoint(lon, lat, bbox_km = 10):
 111         """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
 112 within a bbox_km area"""
 113
 114         # one degree latitude is approximately 111km
 115         # and we want to know what's half of bbox_km in lat degrees
 116         delta_lat = bbox_km / 222.0
 117
 118         lat = float (lat)
 119         lon = float (lon)
 120         # one degree longitude is a cos(lat) * 111
 121         # and we want to know what's half of bbox_km in lon degrees
 122         delta_lon = cos( radians (lat) ) * delta_lat
 123
 124         lat_b = lat - delta_lat
 125         lat_t = lat + delta_lat
 126
 127         lon_l = lon - delta_lon
 128         lon_r = lon + delta_lon
 129
 130
 131         path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )
 132
 133         area_xml_string = getArea ( path )
 134
 135         return area_xml_string
 136
 137 def simpleName(placename):
 138         """Removes from a name of a place any prefix that indicates its clasification"""
 139         simpleplacename = placename.lstrip(u"Municipiul ")
 140         simpleplacename = simpleplacename.lstrip(u"Oraș ")
 141
 142         return simpleplacename
 143
 144 def sirutaRankToPlace(sirutarank, population):
 145         """Maps siruta ranks to proper 'place' values. The siruta types are explained bellow
 146
 147 Cod     Denumire tip de unitate administrativ teritorială
 148
 149  40     Judeţ, municipiul Bucureşti
 150   1     Municipiu  reşedinţă de judeţ, reşedinţă a municipiului Bucureşti
 151   2     Oraş ce aparţine de judeţ, altul decât oraş  reşedinţă de judeţ
 152   3     Comună
 153   4     Municipiu, altul decât reşedinţă de judeţ
 154   5     Oraş reşedinţă de judeţ
 155   6     Sector al  municipiului Bucureşti
 156   9     Localitate  componentă,  reşedinţă de  municipiu
 157  10     Localitate componentă, a unui municipiu alta decât reşedinţă de municipiu
 158  11     Sat ce aparţine de municipiu
 159  17     Localitate componentă reşedinţă a oraşului
 160  18     Localitate  componentă a unui oraş, alta decât reşedinţă de  oraş
 161  19     Sat care aparţine  unui oraş
 162  22     Sat reşedinţă de comună
 163  23     Sat ce aparţine de comună, altul  decât reşedinţă de comună
 164 """
 165
 166         rank = int(sirutarank)
 167
 168         # municipii, reședințe de județ, reședințe de municipiu
 169         if rank in [ 1, 4, 9, 40 ]:
 170                 return u"city"
 171         # orașe, orașe reședință de județ, reședințe ale orașelor
 172         if rank in [ 2, 5, 17 ]:
 173                 return u"town"
 174         # localități componente ale orașelor sau municpiilor, altele decât reședințele
 175         if rank in [ 10, 18 ]:
 176                 return u"village"
 177         # comune, sate parte din municipii, sate parte din orașe, reședințe de comună, sate non-reședință
 178         if rank in [ 3, 11, 19, 22, 23 ]:
 179                 # reședințele de comună nu trebuie să fie cătune (hamlet)
 180                 if rank == 22:
 181                         return u"village"
 182                 if population < 51:
 183                         return u"hamlet"
 184                 else:
 185                         return u"village"
 186         # sectoarele municipiului București
 187         if rank in [ 6 ]:
 188                 return u"sector"
 189
 190         raise ValueError, "Unexpected rank value in siruta data"
 191
 192 def nodeDictForPlace(sirutadict, oldnode = None):
 193         """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
 194 the existing data which is present in 'oldnode'"""
 195
 196         node = {}
 197         tags = {}
 198
 199         tags[u"name"] = sirutadict[u"name"]
 200         if oldnode:
 201                 node = oldnode
 202                 tags = oldnode[u"tag"]
 203
 204                 if u"population" in tags:
 205                         tags[u"population:census:2002"] = sirutadict[u"population2002"]
 206                 else:
 207                         tags[u"population"] = sirutadict[u"population2002"]
 208
 209                 if u"postal_code" in tags:
 210                         if int(tags[u"postal_code"]) == int(sirutadict[u"old_postal_code"]):
 211                                 tags.pop(u"postal_code")
 212
 213                 if u"addr:postcode" in tags:
 214                         if int(tags[u"addr:postcode"]) == int(sirutadict[u"old_postal_code"]):
 215                                 tags.pop(u"addr:postcode")
 216
 217
 218         else:
 219                 node[u"lat"] = float(sirutadict[u"lat"])
 220                 node[u"lon"] = float(sirutadict[u"lon"])
 221                 tags[u"population"] = sirutadict[u"population2002"]
 222
 223         # this should probably be ran even for existing nodes
 224         tags[u"place"] = sirutaRankToPlace(sirutadict[u"siruta:type"], tags[u"population"])
 225
 226         # clean up siruta:name_sup
 227         sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])
 228
 229         uninteresting = [ \
 230                                 u"lon", \
 231                                 u"lat", \
 232                                 u"name", \
 233                                 u'siruta:rank', \
 234                                 u"population2002", \
 235                                 u"region", \
 236                                 u"siruta:region_id", \
 237                                 u"siruta:enviro_type", \
 238                                 u"siruta:sortcode" ]
 239
 240         mergetags = sirutadict.copy()
 241         for tag in sirutadict:
 242                 if tag in uninteresting:
 243                         mergetags.pop(tag)
 244
 245         tags.update(mergetags)
 246
 247         simplesup = simpleName(sirutadict[u"siruta:name_sup"])
 248         if u"is_in" in tags:
 249                 is_in = [ x.strip() for x in tags[u"is_in"].replace(",", ";").split(";") ]
 250                 nextpos = 0
 251                 if (not simplesup in is_in) and (tags[u"name"] <> simplesup):
 252                         is_in.insert(0,simplesup)
 253                         nextpos = 1
 254                 if not sirutadict[u"siruta:county"] in is_in:
 255                         is_in.insert(nextpos,sirutadict[u"siruta:county"])
 256         else:
 257                 is_in = [u"România"]
 258                 is_in.insert(0,sirutadict[u"siruta:county"])
 259                 if tags[u"name"] <> simplesup:
 260                         is_in.insert(0,simplesup)
 261
 262         tags[u"is_in"] = u";".join(is_in)
 263
 264         node[u"tag"] = tags
 265
 266         return node
 267
 268
 269 def readAndProcessSirutaCsv(file, comment = None, source = u"geo-spatial.org"):
 270         """reads the input CSV file and processes each entry"""
 271
 272         csvfile = open (file, 'r')
 273         reader = SirutaDictReader( csvfile )
 274
 275         homedir = os.environ['HOME']
 276         api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
 277
 278         if not comment:
 279                 comment = 'import places from ' + file
 280         cs_tags = { 'comment' : comment , 'source' : source }
 281
 282         api.ChangesetCreate(cs_tags)
 283
 284         for csvplace in reader:
 285
 286                 print "Processing data for %s ..." % ( csvplace[u"name"] )
 287                 map = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
 288                 existing_nodes = locatePlaceInXML ( map, csvplace[u"name"] )
 289
 290                 if len(existing_nodes) == 0:
 291                         # node doesn't exist for this place, or is far; we can create the node
 292                         nodedict = nodeDictForPlace ( csvplace )
 293
 294                         api.NodeCreate(nodedict)
 295                         print "Created new node for %s" % ( csvplace[u"name"] )
 296
 297                 elif len(existing_nodes) == 1:
 298                         # there is an existing code, so we merge with that
 299                         nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )
 300
 301                         api.NodeUpdate(nodedict)
 302                         print "Updated existing node for %s" % ( csvplace[u"name"] )
 303
 304                 else:
 305                         # I am confused, more than one node with the same simplified name
 306
 307                         print >> sys.stderr, u"Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
 308                                         csvplace[u"name"],
 309                                         len(existing_nodes),
 310                                         csvplace[u"lat"],
 311                                         csvplace[u"lon"] )
 312
 313         api.ChangesetClose()