OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
import getopt
import os
import re
import sys
import unicodedata
from math import cos, radians
from xml.dom import minidom

# prefer a copy of OsmApi.py found in the current directory
if os.path.exists('OsmApi.py'):
        import sys
        sys.path.insert(0,'.')
from OsmApi import OsmApi

if os.path.exists('siruta.py'):
        if not '.' in sys.path:
                sys.path.insert(0,'.')
from sirutacsv import SirutaDictReader
  22
  23
  24 def stripAccents(s):
  25         """strip any accents that may exist in a unicode object and leave only the base ASCII char"""
  26
  27         try:
  28                 s = unicode(s,'utf-8')
  29         except TypeError:
  30                 pass
  31         s = s.replace(u"-", u" ")
  32         return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  33
  34
  35 def simplifyName(unicode_name):
  36         """a function to turn into lowercase ASCII any name (by stripping accents)"""
  37
  38         simplename = unicode_name.lower()
  39         simplename = stripAccents(simplename)
  40
  41         return simplename
  42
  43 def getNodesList(osmxmlcontents):
  44         """extracts only the nodes from an OSM XML and stores them in a OsmApi structure"""
  45
  46         if len(osmxmlcontents) > 1:
  47                 raise ValueError('Too many osm blocks in one XML')
  48
  49         # select only the nodes from the OSM XML
  50         xmlnodes = osmxmlcontents[0].getElementsByTagName ('node')
  51
  52         # TODO: use an internal data member for the api
  53         api = OsmApi()
  54         osmnodes = []
  55         for xmlnode in xmlnodes:
  56                 osmnode = api._DomParseNode(xmlnode)
  57                 osmnodes.append(osmnode)
  58
  59         return osmnodes
  60
  61
  62 def getMatchingPlaces(osmapielements,placename, sirutacode = None):
  63         """filters from 'osmapielements' only the places with the same simplified name as 'placename'"""
  64         list = []
  65
  66         # TODO: try somehow to match î with â when is correct to do so
  67         #       should also match i to â (when is correct)
  68         # TODO: maybe calculating the lexical distance is the way to fix this
  69         #       and match also typo-ed names with correct ones
  70         sname = simplifyName (placename)
  71
  72         for i in osmapielements:
  73                 try:
  74                         if len( i[u"tag"][u"place"] ) > 0 and \
  75                                 sname == simplifyName(i[u"tag"][u"name"]):
  76                                 list.append(i)
  77                 except KeyError:
  78                         # that node didn't have the 'place' or a 'name' tag, so is uninteresting
  79                         pass
  80
  81         # try to see if there's already a siruta code attached
  82         # if it is, filter by it, and if the result is non-empty, return that list
  83         if len(list) > 1 and sirutacode:
  84                 samesiruta = getSameSiruta(list, sirutacode)
  85                 if len(samesiruta) != 0:
  86                         list = samesiruta
  87         return list
  88
  89
  90 def locatePlaceInXML(xml,placename,sirutacode=None):
  91         """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
  92 into the xml document 'xml'. The input xml document is the one returned by the OSM API
  93 through a query for data within a bounding box"""
  94
  95         if os.path.exists(xml):
  96                 xmldoc = minidom.parse(xml)
  97         else:
  98                 xmldoc = minidom.parseString(xml.encode("utf-8"))
  99
 100         # each OSM XML has a single root osm element
 101         osmxmlcontents = xmldoc.getElementsByTagName('osm')
 102
 103         nodeslist = getNodesList(osmxmlcontents)
 104
 105         nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]
 106
 107         places = getMatchingPlaces(nodeswithtags, placename, sirutacode)
 108
 109         return places
 110
 111 def getArea ( bbox_str ):
 112         """Given the bounding box defined by the bbox_str string, return the map within that bbox"""
 113
 114         path = "/api/0.6/map?bbox=" + bbox_str
 115
 116         # TODO: use an internal data member for the api
 117         api = OsmApi()
 118         data = api._get ( path )
 119
 120         return data
 121
 122 def getMapAroundPoint(lon, lat, bbox_km = 10):
 123         """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
 124 within a bbox_km area"""
 125
 126         # one degree latitude is approximately 111km
 127         # and we want to know what's half of bbox_km in lat degrees
 128         delta_lat = bbox_km / 222.0
 129
 130         lat = float (lat)
 131         lon = float (lon)
 132         # one degree longitude is a cos(lat) * 111
 133         # and we want to know what's half of bbox_km in lon degrees
 134         delta_lon = cos( radians (lat) ) * delta_lat
 135
 136         lat_b = lat - delta_lat
 137         lat_t = lat + delta_lat
 138
 139         lon_l = lon - delta_lon
 140         lon_r = lon + delta_lon
 141
 142
 143         path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )
 144
 145         area_xml_string = getArea ( path )
 146
 147         return area_xml_string
 148
 149 def simpleName(placename):
 150         """Removes from a name of a place any prefix that indicates its clasification"""
 151         simpleplacename = placename.replace(u"Municipiul ",u"",1)
 152         simpleplacename = simpleplacename.replace(u"Oraș ",u"",1)
 153
 154         return simpleplacename
 155
 156 def sirutaTypeToPlace(sirutarank, population):
 157         """Maps siruta ranks to proper 'place' values. The siruta types are explained bellow
 158
 159 Cod     Denumire tip de unitate administrativ teritorială
 160
 161  40     Judeţ, municipiul Bucureşti
 162   1     Municipiu  reşedinţă de judeţ, reşedinţă a municipiului Bucureşti
 163   2     Oraş ce aparţine de judeţ, altul decât oraş  reşedinţă de judeţ
 164   3     Comună
 165   4     Municipiu, altul decât reşedinţă de judeţ
 166   5     Oraş reşedinţă de judeţ
 167   6     Sector al  municipiului Bucureşti
 168   9     Localitate  componentă,  reşedinţă de  municipiu
 169  10     Localitate componentă, a unui municipiu alta decât reşedinţă de municipiu
 170  11     Sat ce aparţine de municipiu
 171  17     Localitate componentă reşedinţă a oraşului
 172  18     Localitate  componentă a unui oraş, alta decât reşedinţă de  oraş
 173  19     Sat care aparţine  unui oraş
 174  22     Sat reşedinţă de comună
 175  23     Sat ce aparţine de comună, altul  decât reşedinţă de comună
 176 """
 177
 178         rank = int(sirutarank)
 179
 180         # municipii, reședințe de județ, reședințe de municipiu
 181         if rank in [ 1, 4, 9, 40 ]:
 182                 return u"city"
 183         # orașe, orașe reședință de județ, reședințe ale orașelor
 184         if rank in [ 2, 5, 17 ]:
 185                 return u"town"
 186         # localități componente ale orașelor sau municpiilor, altele decât reședințele
 187         if rank in [ 10, 18 ]:
 188                 return u"village"
 189         # comune, sate parte din municipii, sate parte din orașe, reședințe de comună, sate non-reședință
 190         if rank in [ 3, 11, 19, 22, 23 ]:
 191                 # doar satele non-reședință ce aparțin de comune pot fi cătune (hamlet)
 192                 if rank == 23 and int(population) < 50:
 193                         return u"hamlet"
 194                 else:
 195                         return u"village"
 196         # sectoarele municipiului București
 197         if rank in [ 6 ]:
 198                 return u"sector"
 199
 200         raise ValueError, "Unexpected rank value in siruta data"
 201
 202 def nodeDictForPlace(sirutadict, oldnode = None):
 203         """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
 204 the existing data which is present in 'oldnode'"""
 205
 206         node = {}
 207         tags = {}
 208
 209         # it seems some of the input contains no data for the population and the rank,
 210         # which probably means 0 for population, and we don't care for rank
 211
 212         if sirutadict[u"population2002"] == u"":
 213                 sirutadict[u"population2002"] = u"0"
 214
 215         if oldnode:
 216                 node = oldnode
 217                 tags = oldnode[u"tag"]
 218
 219                 if not u"population" in tags:
 220                         tags[u"population"] = sirutadict[u"population2002"]
 221
 222                 if u"postal_code" in tags:
 223                         if int(tags[u"postal_code"]) == int(sirutadict[u"old_postal_code"]):
 224                                 tags.pop(u"postal_code")
 225
 226                 if u"addr:postcode" in tags:
 227                         if int(tags[u"addr:postcode"]) == int(sirutadict[u"old_postal_code"]):
 228                                 tags.pop(u"addr:postcode")
 229
 230
 231         else:
 232                 node[u"lat"] = float(sirutadict[u"lat"])
 233                 node[u"lon"] = float(sirutadict[u"lon"])
 234                 tags[u"population"] = sirutadict[u"population2002"]
 235
 236         # consistently add the 1992 census data
 237         tags[u"population:census:1992"] = sirutadict[u"population2002"]
 238
 239         # this should probably be ran even for existing nodes
 240         tags[u"place"] = sirutaTypeToPlace(sirutadict[u"siruta:type"], tags[u"population"])
 241
 242         # clean up siruta:name_sup
 243         sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])
 244
 245         uninteresting = [ \
 246                                 u"lon", \
 247                                 u"lat", \
 248                                 u'siruta:rank', \
 249                                 u"population2002", \
 250                                 u"region", \
 251                                 u"siruta:region_id", \
 252                                 u"siruta:enviro_type", \
 253                                 u"siruta:sortcode" ]
 254
 255         mergetags = sirutadict.copy()
 256         for tag in sirutadict:
 257                 if tag in uninteresting:
 258                         mergetags.pop(tag)
 259
 260         tags.update(mergetags)
 261
 262         simplesup = simpleName(sirutadict[u"siruta:name_sup"])
 263         is_in = [u"România"]
 264         tags[u"is_in:country"] = u"România"
 265         is_in.insert(0,sirutadict[u"siruta:county"])
 266         tags[u"is_in:county"] = sirutadict[u"siruta:county"]
 267         if tags[u"name"] <> simplesup:
 268                 is_in.insert(0,simplesup)
 269
 270         tags[u"is_in"] = u";".join(is_in)
 271
 272         # prune the created_by tag since is deprecated to have it on the
 273         # node and the changeset contains that info anyway
 274         if u"created_by" in tags:
 275                 tags.pop(u"created_by")
 276
 277         node[u"tag"] = tags
 278
 279         return node
 280
 281 def getSameSiruta(elementlist, sirutacode):
 282         """returns a list with all the elements in list which have the siruta:code == sirutacode"""
 283
 284         newlist = []
 285         for x in elementlist:
 286                 try:
 287                         if x[u"tag"][u"siruta:code"] == sirutacode:
 288                                 newlist.append(x.copy())
 289                 except KeyError:
 290                         pass
 291
 292         return newlist
 293
 294 def readAndProcessSirutaCsv(file, comment = None, source = None, bbox_km = 10):
 295         """reads the input CSV file and processes each entry"""
 296
 297         csvfile = open (file, 'r')
 298         reader = SirutaDictReader( csvfile )
 299
 300         homedir = os.environ['HOME']
 301         api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
 302
 303         if not comment:
 304                 comment = 'import places from ' + file
 305         else:
 306                 comment = unicode(comment,'utf-8')
 307         if not source:
 308                 source = u"http://geo-spatial.org siruta data import"
 309         else:
 310                 source = unicode(source,'utf-8')
 311         cs_tags = { 'comment' : comment , 'source' : source }
 312
 313         print >> sys.stderr, "New pocessing started..."
 314         api.ChangesetCreate(cs_tags)
 315
 316         for csvplace in reader:
 317
 318                 uname = csvplace[u"name"].encode("utf-8")
 319                 print "Processing data for %s ..." % ( uname )
 320                 sys.stdout.flush()
 321                 map = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"], bbox_km = bbox_km )
 322                 existing_nodes = locatePlaceInXML ( map, csvplace[u"name"], csvplace[u"siruta:code"] )
 323
 324                 if len(existing_nodes) == 0:
 325                         # node doesn't exist for this place, or is far; we can create the node
 326                         nodedict = nodeDictForPlace ( csvplace )
 327
 328                         api.NodeCreate(nodedict)
 329                         print "Created new node for %s" % ( uname )
 330
 331                 elif len(existing_nodes) == 1:
 332                         # there is an existing code, so we merge with that
 333                         referencenode = existing_nodes[0].copy()
 334                         # dictionaries don't get copied by default
 335                         referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
 336                         nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )
 337
 338                         if nodedict == referencenode:
 339                                 print "Skipping: No changes needed for node %s" % ( uname )
 340                         else:
 341                                 api.NodeUpdate(nodedict)
 342                                 print "Updated existing node for %s" % ( uname )
 343
 344                 else:
 345                         # I am confused, more than one node with the same simplified name
 346
 347                         print >> sys.stderr, "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s&lon=%s)" % (
 348                                         uname,
 349                                         len(existing_nodes),
 350                                         csvplace[u"lat"].encode("utf-8"),
 351                                         csvplace[u"lon"].encode("utf-8") )
 352
 353                 sys.stdout.flush()
 354                 sys.stderr.flush()
 355
 356         api.ChangesetClose()
 357         csvfile.close()
 358
 359 def usage():
 360         print "%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0]
 361
 362 def main(argv = None):
 363
 364         if argv is None:
 365                 argv = sys.argv
 366
 367         try:
 368                 opts, args = getopt.getopt(sys.argv[1:], "hi:c:s:b:",
 369                                 ["help", "input=", "comment=", "source=","bbox="] )
 370         except getopt.GetoptError, err:
 371                 # print help information and exit:
 372                 print str(err)
 373                 usage()
 374                 return 2
 375
 376         file = None
 377         comment = None
 378         source = None
 379         bbox_km = 10
 380
 381         for o,a in opts:
 382                 if o in ("-h", "help"):
 383                         usage()
 384                         return 0
 385                 elif o in ("-i", "--input"):
 386                         file = a
 387                 elif o in ("-s", "--source"):
 388                         source = a
 389                 elif o in ("-c", "--comment"):
 390                         comment = a
 391                 elif o in ("-b", "--bbox"):
 392                         bbox_km = float(a)
 393
 394         if not file:
 395                 print "Input csv file (-i option) is mandatory. Run the script with -h for online help."
 396                 return 2
 397
 398
 399         readAndProcessSirutaCsv(file, source=source, comment=comment, bbox_km=bbox_km)
 400
 401
 402 if __name__ == "__main__":
 403         sys.exit(main())