OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   4 import re
   5 import unicodedata
   6 from xml.dom import minidom
   7
   8 import os
   9 if os.path.exists('OsmApi.py'):
  10         import sys
  11         sys.path.insert(0,'.')
  12 from OsmApi import OsmApi
  13
  14 from math import cos, radians
  15
  16 if os.path.exists('siruta.py'):
  17         if not '.' in sys.path:
  18                 sys.path.insert(0,'.')
  19 from sirutacsv import SirutaDictReader
  20
  21 import getopt
  22
  23
  24 def stripAccents(s):
  25         """strip any accents that may exist in a unicode object and leave only the base ASCII char"""
  26
  27         try:
  28                 s = unicode(s,'utf-8')
  29         except TypeError:
  30                 pass
  31         s = s.replace(u"-", u" ")
  32         return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  33
  34
  35 def simplifyName(unicode_name):
  36         """a function to turn into lowercase ASCII any name (by stripping accents)"""
  37
  38         simplename = unicode_name.lower()
  39         simplename = stripAccents(simplename)
  40
  41         return simplename
  42
  43 def getNodesList(osmxmlcontents):
  44         """extracts only the nodes from an OSM XML and stores them in a OsmApi structure"""
  45
  46         if len(osmxmlcontents) > 1:
  47                 raise ValueError('Too many osm blocks in one XML')
  48
  49         # select only the nodes from the OSM XML
  50         xmlnodes = osmxmlcontents[0].getElementsByTagName ('node')
  51
  52         # TODO: use an internal data member for the api
  53         api = OsmApi()
  54         osmnodes = []
  55         for xmlnode in xmlnodes:
  56                 osmnode = api._DomParseNode(xmlnode)
  57                 osmnodes.append(osmnode)
  58
  59         return osmnodes
  60
  61
  62 def getMatchingPlaces(osmapielements,placename, sirutacode = None):
  63         """filters from 'osmapielements' only the places with the same simplified name as 'placename'"""
  64         list = []
  65
  66         sname = simplifyName (placename)
  67
  68         for i in osmapielements:
  69                 try:
  70                         if len( i[u"tag"][u"place"] ) > 0 and \
  71                                 sname == simplifyName(i[u"tag"][u"name"]):
  72                                 list.append(i)
  73                 except KeyError:
  74                         # that node didn't have the 'place' or a 'name' tag, so is uninteresting
  75                         pass
  76
  77         # try to see if there's already a siruta code attached
  78         # if it is, filter by it, and if the result is non-empty, return that list
  79         if len(list) > 1 and sirutacode:
  80                 samesiruta = getSameSiruta(list, sirutacode)
  81                 if len(samesiruta) != 0:
  82                         list = samesiruta
  83         return list
  84
  85
  86 def locatePlaceInXML(xml,placename,sirutacode=None):
  87         """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
  88 into the xml document 'xml'. The input xml document is the one returned by the OSM API
  89 through a query for data within a bounding box"""
  90
  91         if os.path.exists(xml):
  92                 xmldoc = minidom.parse(xml)
  93         else:
  94                 xmldoc = minidom.parseString(xml.encode("utf-8"))
  95
  96         # each OSM XML has a single root osm element
  97         osmxmlcontents = xmldoc.getElementsByTagName('osm')
  98
  99         nodeslist = getNodesList(osmxmlcontents)
 100
 101         nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]
 102
 103         places = getMatchingPlaces(nodeswithtags, placename, sirutacode)
 104
 105         return places
 106
 107 def getArea ( bbox_str ):
 108         """Given the bounding box defined by the bbox_str string, return the map within that bbox"""
 109
 110         path = "/api/0.6/map?bbox=" + bbox_str
 111
 112         # TODO: use an internal data member for the api
 113         api = OsmApi()
 114         data = api._get ( path )
 115
 116         return data
 117
 118 def getMapAroundPoint(lon, lat, bbox_km = 10):
 119         """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
 120 within a bbox_km area"""
 121
 122         # one degree latitude is approximately 111km
 123         # and we want to know what's half of bbox_km in lat degrees
 124         delta_lat = bbox_km / 222.0
 125
 126         lat = float (lat)
 127         lon = float (lon)
 128         # one degree longitude is a cos(lat) * 111
 129         # and we want to know what's half of bbox_km in lon degrees
 130         delta_lon = cos( radians (lat) ) * delta_lat
 131
 132         lat_b = lat - delta_lat
 133         lat_t = lat + delta_lat
 134
 135         lon_l = lon - delta_lon
 136         lon_r = lon + delta_lon
 137
 138
 139         path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )
 140
 141         area_xml_string = getArea ( path )
 142
 143         return area_xml_string
 144
 145 def simpleName(placename):
 146         """Removes from a name of a place any prefix that indicates its clasification"""
 147         simpleplacename = placename.replace(u"Municipiul ",u"",1)
 148         simpleplacename = simpleplacename.replace(u"Oraș ",u"",1)
 149
 150         return simpleplacename
 151
 152 def sirutaTypeToPlace(sirutarank, population):
 153         """Maps siruta ranks to proper 'place' values. The siruta types are explained bellow
 154
 155 Cod     Denumire tip de unitate administrativ teritorială
 156
 157  40     Judeţ, municipiul Bucureşti
 158   1     Municipiu  reşedinţă de judeţ, reşedinţă a municipiului Bucureşti
 159   2     Oraş ce aparţine de judeţ, altul decât oraş  reşedinţă de judeţ
 160   3     Comună
 161   4     Municipiu, altul decât reşedinţă de judeţ
 162   5     Oraş reşedinţă de judeţ
 163   6     Sector al  municipiului Bucureşti
 164   9     Localitate  componentă,  reşedinţă de  municipiu
 165  10     Localitate componentă, a unui municipiu alta decât reşedinţă de municipiu
 166  11     Sat ce aparţine de municipiu
 167  17     Localitate componentă reşedinţă a oraşului
 168  18     Localitate  componentă a unui oraş, alta decât reşedinţă de  oraş
 169  19     Sat care aparţine  unui oraş
 170  22     Sat reşedinţă de comună
 171  23     Sat ce aparţine de comună, altul  decât reşedinţă de comună
 172 """
 173
 174         rank = int(sirutarank)
 175
 176         # municipii, reședințe de județ, reședințe de municipiu
 177         if rank in [ 1, 4, 9, 40 ]:
 178                 return u"city"
 179         # orașe, orașe reședință de județ, reședințe ale orașelor
 180         if rank in [ 2, 5, 17 ]:
 181                 return u"town"
 182         # localități componente ale orașelor sau municpiilor, altele decât reședințele
 183         if rank in [ 10, 18 ]:
 184                 return u"village"
 185         # comune, sate parte din municipii, sate parte din orașe, reședințe de comună, sate non-reședință
 186         if rank in [ 3, 11, 19, 22, 23 ]:
 187                 # doar satele non-reședință ce aparțin de comune pot fi cătune (hamlet)
 188                 if rank == 23 and int(population) < 50:
 189                         return u"hamlet"
 190                 else:
 191                         return u"village"
 192         # sectoarele municipiului București
 193         if rank in [ 6 ]:
 194                 return u"sector"
 195
 196         raise ValueError, "Unexpected rank value in siruta data"
 197
 198 def nodeDictForPlace(sirutadict, oldnode = None):
 199         """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
 200 the existing data which is present in 'oldnode'"""
 201
 202         node = {}
 203         tags = {}
 204
 205         # it seems some of the input contains no data for the population and the rank,
 206         # which probably means 0 for population, and we don't care for rank
 207
 208         if sirutadict[u"population2002"] == u"":
 209                 sirutadict[u"population2002"] = u"0"
 210
 211         if oldnode:
 212                 node = oldnode
 213                 tags = oldnode[u"tag"]
 214
 215                 if not u"population" in tags:
 216                         tags[u"population"] = sirutadict[u"population2002"]
 217
 218                 if u"postal_code" in tags:
 219                         if int(tags[u"postal_code"]) == int(sirutadict[u"old_postal_code"]):
 220                                 tags.pop(u"postal_code")
 221
 222                 if u"addr:postcode" in tags:
 223                         if int(tags[u"addr:postcode"]) == int(sirutadict[u"old_postal_code"]):
 224                                 tags.pop(u"addr:postcode")
 225
 226
 227         else:
 228                 node[u"lat"] = float(sirutadict[u"lat"])
 229                 node[u"lon"] = float(sirutadict[u"lon"])
 230                 tags[u"population"] = sirutadict[u"population2002"]
 231
 232         # consistently add the 1992 census data
 233         tags[u"population:census:1992"] = sirutadict[u"population2002"]
 234
 235         # this should probably be ran even for existing nodes
 236         tags[u"place"] = sirutaTypeToPlace(sirutadict[u"siruta:type"], tags[u"population"])
 237
 238         # clean up siruta:name_sup
 239         sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])
 240
 241         uninteresting = [ \
 242                                 u"lon", \
 243                                 u"lat", \
 244                                 u'siruta:rank', \
 245                                 u"population2002", \
 246                                 u"region", \
 247                                 u"siruta:region_id", \
 248                                 u"siruta:enviro_type", \
 249                                 u"siruta:sortcode" ]
 250
 251         mergetags = sirutadict.copy()
 252         for tag in sirutadict:
 253                 if tag in uninteresting:
 254                         mergetags.pop(tag)
 255
 256         tags.update(mergetags)
 257
 258         simplesup = simpleName(sirutadict[u"siruta:name_sup"])
 259         is_in = [u"România"]
 260         tags[u"is_in:country"] = u"România"
 261         is_in.insert(0,sirutadict[u"siruta:county"])
 262         tags[u"is_in:county"] = sirutadict[u"siruta:county"]
 263         if tags[u"name"] <> simplesup:
 264                 is_in.insert(0,simplesup)
 265
 266         tags[u"is_in"] = u";".join(is_in)
 267
 268         # prune the created_by tag since is deprecated to have it on the
 269         # node and the changeset contains that info anyway
 270         if u"created_by" in tags:
 271                 tags.pop(u"created_by")
 272
 273         node[u"tag"] = tags
 274
 275         return node
 276
 277 def getSameSiruta(elementlist, sirutacode):
 278         """returns a list with all the elements in list which have the siruta:code == sirutacode"""
 279
 280         newlist = []
 281         for x in elementlist:
 282                 try:
 283                         if x[u"tag"][u"siruta:code"] == sirutacode:
 284                                 newlist.append(x.copy())
 285                 except KeyError:
 286                         pass
 287
 288         return newlist
 289
 290 def readAndProcessSirutaCsv(file, comment = None, source = None):
 291         """reads the input CSV file and processes each entry"""
 292
 293         csvfile = open (file, 'r')
 294         reader = SirutaDictReader( csvfile )
 295
 296         homedir = os.environ['HOME']
 297         api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
 298
 299         if not comment:
 300                 comment = 'import places from ' + file
 301         else:
 302                 comment = unicode(comment,'utf-8')
 303         if not source:
 304                 source = u"http://geo-spatial.org siruta data import"
 305         else:
 306                 source = unicode(source,'utf-8')
 307         cs_tags = { 'comment' : comment , 'source' : source }
 308
 309         api.ChangesetCreate(cs_tags)
 310
 311         for csvplace in reader:
 312
 313                 uname = csvplace[u"name"].encode("utf-8")
 314                 print "Processing data for %s ..." % ( uname )
 315                 sys.stdout.flush()
 316                 map = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
 317                 existing_nodes = locatePlaceInXML ( map, csvplace[u"name"], csvplace[u"siruta:code"] )
 318
 319                 if len(existing_nodes) == 0:
 320                         # node doesn't exist for this place, or is far; we can create the node
 321                         nodedict = nodeDictForPlace ( csvplace )
 322
 323                         api.NodeCreate(nodedict)
 324                         print "Created new node for %s" % ( uname )
 325
 326                 elif len(existing_nodes) == 1:
 327                         # there is an existing code, so we merge with that
 328                         referencenode = existing_nodes[0].copy()
 329                         # dictionaries don't get copied by default
 330                         referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
 331                         nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )
 332
 333                         if nodedict == referencenode:
 334                                 print "Skipping: No changes needed for node %s" % ( uname )
 335                         else:
 336                                 api.NodeUpdate(nodedict)
 337                                 print "Updated existing node for %s" % ( uname )
 338
 339                 else:
 340                         # I am confused, more than one node with the same simplified name
 341
 342                         print >> sys.stderr, "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
 343                                         uname,
 344                                         len(existing_nodes),
 345                                         csvplace[u"lat"].encode("utf-8"),
 346                                         csvplace[u"lon"].encode("utf-8") )
 347
 348                 sys.stdout.flush()
 349                 sys.stderr.flush()
 350
 351         api.ChangesetClose()
 352         csvfile.close()
 353
 354 def usage():
 355         print "%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0]
 356
 357 def main(argv = None):
 358
 359         if argv is None:
 360                 argv = sys.argv
 361
 362         try:
 363                 opts, args = getopt.getopt(sys.argv[1:], "hi:c:s:",
 364                                 ["help", "input=", "comment=", "source="] )
 365         except getopt.GetoptError, err:
 366                 # print help information and exit:
 367                 print str(err)
 368                 usage()
 369                 return 2
 370
 371         file = None
 372         comment = None
 373         source = None
 374
 375         for o,a in opts:
 376                 if o in ("-h", "help"):
 377                         usage()
 378                         return 0
 379                 elif o in ("-i", "--input"):
 380                         file = a
 381                 elif o in ("-s", "--source"):
 382                         source = a
 383                 elif o in ("-c", "--comment"):
 384                         comment = a
 385
 386         if not file:
 387                 print "Input csv file (-i option) is mandatory. Run the script with -h for online help."
 388                 return 2
 389
 390
 391         readAndProcessSirutaCsv(file, source=source, comment=comment)
 392
 393
 394 if __name__ == "__main__":
 395         sys.exit(main())