OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
import os
import re
import sys
import unicodedata
from math import cos, radians
from xml.dom import minidom

# sys is imported unconditionally: it is needed below (and for error output
# later in the file) even when OsmApi.py is not in the current directory
if os.path.exists('OsmApi.py'):
        sys.path.insert(0,'.')
from OsmApi import OsmApi

if os.path.exists('siruta.py'):
        if not '.' in sys.path:
                sys.path.insert(0,'.')
from sirutacsv import SirutaDictReader
  20
  21
  22
  23 def stripAccents(s):
  24         """strip any accents that may exist in a unicode object and leave only the base ASCII char"""
  25
  26         try:
  27                 s = unicode(s,'utf-8')
  28         except TypeError:
  29                 pass
  30         return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  31
  32
  33 def simplifyName(unicode_name):
  34         """a function to turn into lowercase ASCII any name (by stripping accents)"""
  35
  36         simplename = unicode_name.lower()
  37         simplename = stripAccents(simplename)
  38
  39         return simplename
  40
  41 def getNodesList(osmxmlcontents):
  42         """extracts only the nodes from an OSM XML and stores them in a OsmApi structure"""
  43
  44         if len(osmxmlcontents) > 1:
  45                 raise ValueError('Too many osm blocks in one XML')
  46
  47         # select only the nodes from the OSM XML
  48         xmlnodes = osmxmlcontents[0].getElementsByTagName ('node')
  49
  50         # TODO: use an internal data member for the api
  51         api = OsmApi()
  52         osmnodes = []
  53         for xmlnode in xmlnodes:
  54                 osmnode = api._DomParseNode(xmlnode)
  55                 osmnodes.append(osmnode)
  56
  57         return osmnodes
  58
  59
  60 def getMatchingPlaces(osmapielements,placename):
  61         """filters from 'osmapielements' only the places with the same simplified name as 'placename'"""
  62         list = []
  63
  64         sname = simplifyName (placename)
  65
  66         for i in osmapielements:
  67                 try:
  68                         if len( i[u"tag"][u"place"] ) > 0 and \
  69                                 sname == simplifyName(i[u"tag"][u"name"]):
  70                                 list.append(i)
  71                 except KeyError:
  72                         # that node didn't have the 'place' or a 'name' tag, so is uninteresting
  73                         pass
  74
  75         return list
  76
  77
  78 def locatePlaceInXML(xml,placename):
  79         """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
  80 into the xml document 'xml'. The input xml document is the one returned by the OSM API
  81 through a query for data within a bounding box"""
  82
  83         if os.path.exists(xml):
  84                 xmldoc = minidom.parse(xml)
  85         else:
  86                 xmldoc = minidom.parseString(xml.encode("utf-8"))
  87
  88         # each OSM XML has a single root osm element
  89         osmxmlcontents = xmldoc.getElementsByTagName('osm')
  90
  91         nodeslist = getNodesList(osmxmlcontents)
  92
  93         nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]
  94
  95         places = getMatchingPlaces(nodeswithtags, placename)
  96
  97         return places
  98
  99 def getArea ( bbox_str ):
 100         """Given the bounding box defined by the bbox_str string, return the map within that bbox"""
 101
 102         path = "/api/0.6/map?bbox=" + bbox_str
 103
 104         # TODO: use an internal data member for the api
 105         api = OsmApi()
 106         data = api._get ( path )
 107
 108         return data
 109
 110 def getMapAroundPoint(lon, lat, bbox_km = 10):
 111         """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
 112 within a bbox_km area"""
 113
 114         # one degree latitude is approximately 111km
 115         # and we want to know what's half of bbox_km in lat degrees
 116         delta_lat = bbox_km / 222.0
 117
 118         lat = float (lat)
 119         lon = float (lon)
 120         # one degree longitude is a cos(lat) * 111
 121         # and we want to know what's half of bbox_km in lon degrees
 122         delta_lon = cos( radians (lat) ) * delta_lat
 123
 124         lat_b = lat - delta_lat
 125         lat_t = lat + delta_lat
 126
 127         lon_l = lon - delta_lon
 128         lon_r = lon + delta_lon
 129
 130
 131         path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )
 132
 133         area_xml_string = getArea ( path )
 134
 135         return area_xml_string
 136
 137 def simpleName(placename):
 138         """Removes from a name of a place any prefix that indicates its clasification"""
 139         simpleplacename = placename.lstrip(u"Municipiul ")
 140         simpleplacename = simpleplacename.lstrip(u"Oraș ")
 141
 142         return simpleplacename
 143
 144 def sirutaRankToPlace(sirutarank, population):
 145         """Maps siruta ranks to proper 'place' values. The siruta types are explained bellow
 146
 147 Cod     Denumire tip de unitate administrativ teritorială
 148
 149  40     Judeţ, municipiul Bucureşti
 150   1     Municipiu  reşedinţă de judeţ, reşedinţă a municipiului Bucureşti
 151   2     Oraş ce aparţine de judeţ, altul decât oraş  reşedinţă de judeţ
 152   3     Comună
 153   4     Municipiu, altul decât reşedinţă de judeţ
 154   5     Oraş reşedinţă de judeţ
 155   6     Sector al  municipiului Bucureşti
 156   9     Localitate  componentă,  reşedinţă de  municipiu
 157  10     Localitate componentă, a unui municipiu alta decât reşedinţă de municipiu
 158  11     Sat ce aparţine de municipiu
 159  17     Localitate componentă reşedinţă a oraşului
 160  18     Localitate  componentă a unui oraş, alta decât reşedinţă de  oraş
 161  19     Sat care aparţine  unui oraş
 162  22     Sat reşedinţă de comună
 163  23     Sat ce aparţine de comună, altul  decât reşedinţă de comună
 164 """
 165
 166         rank = int(sirutarank)
 167
 168         # municipii, reședințe de județ, reședințe de municipiu
 169         if rank in [ 1, 4, 9, 40 ]:
 170                 return u"city"
 171         # orașe, orașe reședință de județ, reședințe ale orașelor
 172         if rank in [ 2, 5, 17 ]:
 173                 return u"town"
 174         # localități componente ale orașelor sau municpiilor, altele decât reședințele
 175         if rank in [ 10, 18 ]:
 176                 return u"village"
 177         # comune, sate parte din municipii, sate parte din orașe, reședințe de comună, sate non-reședință
 178         if rank in [ 3, 11, 19, 22, 23 ]:
 179                 # reședințele de comună nu trebuie să fie cătune (hamlet)
 180                 if rank == 22:
 181                         return u"village"
 182                 if population < 51:
 183                         return u"hamlet"
 184                 else:
 185                         return u"village"
 186         # sectoarele municipiului București
 187         if rank in [ 6 ]:
 188                 return u"sector"
 189
 190         raise ValueError, "Unexpected rank value in siruta data"
 191
 192 def nodeDictForPlace(sirutadict, oldnode = None):
 193         """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
 194 the existing data which is present in 'oldnode'"""
 195
 196         node = {}
 197         tags = {}
 198
 199         tags[u"name"] = sirutadict[u"name"]
 200         if oldnode:
 201                 node = oldnode
 202                 tags = oldnode[u"tag"]
 203
 204                 if not u"population" in tags:
 205                         tags[u"population"] = sirutadict[u"population2002"]
 206
 207                 if u"postal_code" in tags:
 208                         if int(tags[u"postal_code"]) == int(sirutadict[u"old_postal_code"]):
 209                                 tags.pop(u"postal_code")
 210
 211                 if u"addr:postcode" in tags:
 212                         if int(tags[u"addr:postcode"]) == int(sirutadict[u"old_postal_code"]):
 213                                 tags.pop(u"addr:postcode")
 214
 215
 216         else:
 217                 node[u"lat"] = float(sirutadict[u"lat"])
 218                 node[u"lon"] = float(sirutadict[u"lon"])
 219                 tags[u"population"] = sirutadict[u"population2002"]
 220
 221         # consistently add the 2002 census data
 222         tags[u"population:census:2002"] = sirutadict[u"population2002"]
 223
 224         # this should probably be ran even for existing nodes
 225         tags[u"place"] = sirutaRankToPlace(sirutadict[u"siruta:type"], tags[u"population"])
 226
 227         # clean up siruta:name_sup
 228         sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])
 229
 230         uninteresting = [ \
 231                                 u"lon", \
 232                                 u"lat", \
 233                                 u"name", \
 234                                 u'siruta:rank', \
 235                                 u"population2002", \
 236                                 u"region", \
 237                                 u"siruta:region_id", \
 238                                 u"siruta:enviro_type", \
 239                                 u"siruta:sortcode" ]
 240
 241         mergetags = sirutadict.copy()
 242         for tag in sirutadict:
 243                 if tag in uninteresting:
 244                         mergetags.pop(tag)
 245
 246         tags.update(mergetags)
 247
 248         simplesup = simpleName(sirutadict[u"siruta:name_sup"])
 249         if u"is_in" in tags:
 250                 is_in = [ x.strip() for x in tags[u"is_in"].replace(",", ";").split(";") ]
 251                 nextpos = 0
 252                 if (not simplesup in is_in) and (tags[u"name"] <> simplesup):
 253                         is_in.insert(0,simplesup)
 254                         nextpos = 1
 255                 if not sirutadict[u"siruta:county"] in is_in:
 256                         is_in.insert(nextpos,sirutadict[u"siruta:county"])
 257         else:
 258                 is_in = [u"România"]
 259                 is_in.insert(0,sirutadict[u"siruta:county"])
 260                 if tags[u"name"] <> simplesup:
 261                         is_in.insert(0,simplesup)
 262
 263         tags[u"is_in"] = u";".join(is_in)
 264
 265         node[u"tag"] = tags
 266
 267         return node
 268
 269
 270 def readAndProcessSirutaCsv(file, comment = None, source = u"geo-spatial.org"):
 271         """reads the input CSV file and processes each entry"""
 272
 273         csvfile = open (file, 'r')
 274         reader = SirutaDictReader( csvfile )
 275
 276         homedir = os.environ['HOME']
 277         api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
 278
 279         if not comment:
 280                 comment = 'import places from ' + file
 281         cs_tags = { 'comment' : comment , 'source' : source }
 282
 283         api.ChangesetCreate(cs_tags)
 284
 285         for csvplace in reader:
 286
 287                 print "Processing data for %s ..." % ( csvplace[u"name"] )
 288                 map = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
 289                 existing_nodes = locatePlaceInXML ( map, csvplace[u"name"] )
 290
 291                 if len(existing_nodes) == 0:
 292                         # node doesn't exist for this place, or is far; we can create the node
 293                         nodedict = nodeDictForPlace ( csvplace )
 294
 295                         api.NodeCreate(nodedict)
 296                         print "Created new node for %s" % ( csvplace[u"name"] )
 297
 298                 elif len(existing_nodes) == 1:
 299                         # there is an existing code, so we merge with that
 300                         nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )
 301
 302                         api.NodeUpdate(nodedict)
 303                         print "Updated existing node for %s" % ( csvplace[u"name"] )
 304
 305                 else:
 306                         # I am confused, more than one node with the same simplified name
 307
 308                         print >> sys.stderr, u"Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
 309                                         csvplace[u"name"],
 310                                         len(existing_nodes),
 311                                         csvplace[u"lat"],
 312                                         csvplace[u"lon"] )
 313
 314         api.ChangesetClose()