OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   4 import re
   5 import unicodedata
   6 from xml.dom import minidom
   7
   8 import os
   9 if os.path.exists('OsmApi.py'):
  10         import sys
  11         sys.path.insert(0,'.')
  12 from OsmApi import OsmApi
  13
  14 from math import cos, radians
  15
  16 if os.path.exists('siruta.py'):
  17         if not '.' in sys.path:
  18                 sys.path.insert(0,'.')
  19 from sirutacsv import SirutaDictReader
  20
  21 import getopt
  22
  23
  24 def stripAccents(s):
  25         """strip any accents that may exist in a unicode object and leave only the base ASCII char"""
  26
  27         try:
  28                 s = unicode(s,'utf-8')
  29         except TypeError:
  30                 pass
  31         s = s.replace(u"-", u" ")
  32         return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  33
  34
  35 def simplifyName(unicode_name):
  36         """a function to turn into lowercase ASCII any name (by stripping accents)"""
  37
  38         simplename = unicode_name.lower()
  39         simplename = stripAccents(simplename)
  40
  41         return simplename
  42
  43 def getNodesList(osmxmlcontents):
  44         """extracts only the nodes from an OSM XML and stores them in a OsmApi structure"""
  45
  46         if len(osmxmlcontents) > 1:
  47                 raise ValueError('Too many osm blocks in one XML')
  48
  49         # select only the nodes from the OSM XML
  50         xmlnodes = osmxmlcontents[0].getElementsByTagName ('node')
  51
  52         # TODO: use an internal data member for the api
  53         api = OsmApi()
  54         osmnodes = []
  55         for xmlnode in xmlnodes:
  56                 osmnode = api._DomParseNode(xmlnode)
  57                 osmnodes.append(osmnode)
  58
  59         return osmnodes
  60
  61
  62 def getMatchingPlaces(osmapielements,placename):
  63         """filters from 'osmapielements' only the places with the same simplified name as 'placename'"""
  64         list = []
  65
  66         sname = simplifyName (placename)
  67
  68         for i in osmapielements:
  69                 try:
  70                         if len( i[u"tag"][u"place"] ) > 0 and \
  71                                 sname == simplifyName(i[u"tag"][u"name"]):
  72                                 list.append(i)
  73                 except KeyError:
  74                         # that node didn't have the 'place' or a 'name' tag, so is uninteresting
  75                         pass
  76
  77         return list
  78
  79
  80 def locatePlaceInXML(xml,placename):
  81         """Looks for nodes which have the attribute 'place' and whose name looks like 'placename'
  82 into the xml document 'xml'. The input xml document is the one returned by the OSM API
  83 through a query for data within a bounding box"""
  84
  85         if os.path.exists(xml):
  86                 xmldoc = minidom.parse(xml)
  87         else:
  88                 xmldoc = minidom.parseString(xml.encode("utf-8"))
  89
  90         # each OSM XML has a single root osm element
  91         osmxmlcontents = xmldoc.getElementsByTagName('osm')
  92
  93         nodeslist = getNodesList(osmxmlcontents)
  94
  95         nodeswithtags = [ x for x in nodeslist if len(x[u"tag"]) > 0 ]
  96
  97         places = getMatchingPlaces(nodeswithtags, placename)
  98
  99         return places
 100
 101 def getArea ( bbox_str ):
 102         """Given the bounding box defined by the bbox_str string, return the map within that bbox"""
 103
 104         path = "/api/0.6/map?bbox=" + bbox_str
 105
 106         # TODO: use an internal data member for the api
 107         api = OsmApi()
 108         data = api._get ( path )
 109
 110         return data
 111
 112 def getMapAroundPoint(lon, lat, bbox_km = 10):
 113         """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
 114 within a bbox_km area"""
 115
 116         # one degree latitude is approximately 111km
 117         # and we want to know what's half of bbox_km in lat degrees
 118         delta_lat = bbox_km / 222.0
 119
 120         lat = float (lat)
 121         lon = float (lon)
 122         # one degree longitude is a cos(lat) * 111
 123         # and we want to know what's half of bbox_km in lon degrees
 124         delta_lon = cos( radians (lat) ) * delta_lat
 125
 126         lat_b = lat - delta_lat
 127         lat_t = lat + delta_lat
 128
 129         lon_l = lon - delta_lon
 130         lon_r = lon + delta_lon
 131
 132
 133         path = "%.6f,%.6f,%.6f,%.6f" % ( lon_l, lat_b, lon_r, lat_t )
 134
 135         area_xml_string = getArea ( path )
 136
 137         return area_xml_string
 138
 139 def simpleName(placename):
 140         """Removes from a name of a place any prefix that indicates its clasification"""
 141         simpleplacename = placename.replace(u"Municipiul ",u"",1)
 142         simpleplacename = simpleplacename.replace(u"Oraș ",u"",1)
 143
 144         return simpleplacename
 145
 146 def sirutaTypeToPlace(sirutarank, population):
 147         """Maps siruta ranks to proper 'place' values. The siruta types are explained bellow
 148
 149 Cod     Denumire tip de unitate administrativ teritorială
 150
 151  40     Judeţ, municipiul Bucureşti
 152   1     Municipiu  reşedinţă de judeţ, reşedinţă a municipiului Bucureşti
 153   2     Oraş ce aparţine de judeţ, altul decât oraş  reşedinţă de judeţ
 154   3     Comună
 155   4     Municipiu, altul decât reşedinţă de judeţ
 156   5     Oraş reşedinţă de judeţ
 157   6     Sector al  municipiului Bucureşti
 158   9     Localitate  componentă,  reşedinţă de  municipiu
 159  10     Localitate componentă, a unui municipiu alta decât reşedinţă de municipiu
 160  11     Sat ce aparţine de municipiu
 161  17     Localitate componentă reşedinţă a oraşului
 162  18     Localitate  componentă a unui oraş, alta decât reşedinţă de  oraş
 163  19     Sat care aparţine  unui oraş
 164  22     Sat reşedinţă de comună
 165  23     Sat ce aparţine de comună, altul  decât reşedinţă de comună
 166 """
 167
 168         rank = int(sirutarank)
 169
 170         # municipii, reședințe de județ, reședințe de municipiu
 171         if rank in [ 1, 4, 9, 40 ]:
 172                 return u"city"
 173         # orașe, orașe reședință de județ, reședințe ale orașelor
 174         if rank in [ 2, 5, 17 ]:
 175                 return u"town"
 176         # localități componente ale orașelor sau municpiilor, altele decât reședințele
 177         if rank in [ 10, 18 ]:
 178                 return u"village"
 179         # comune, sate parte din municipii, sate parte din orașe, reședințe de comună, sate non-reședință
 180         if rank in [ 3, 11, 19, 22, 23 ]:
 181                 # doar satele non-reședință ce aparțin de comune pot fi cătune (hamlet)
 182                 if rank == 23 and int(population) < 50:
 183                         return u"hamlet"
 184                 else:
 185                         return u"village"
 186         # sectoarele municipiului București
 187         if rank in [ 6 ]:
 188                 return u"sector"
 189
 190         raise ValueError, "Unexpected rank value in siruta data"
 191
 192 def nodeDictForPlace(sirutadict, oldnode = None):
 193         """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
 194 the existing data which is present in 'oldnode'"""
 195
 196         node = {}
 197         tags = {}
 198
 199         # it seems some of the input contains no data for the population and the rank,
 200         # which probably means 0 for population, and we don't care for rank
 201
 202         if sirutadict[u"population2002"] == u"":
 203                 sirutadict[u"population2002"] = u"0"
 204
 205         if oldnode:
 206                 node = oldnode
 207                 tags = oldnode[u"tag"]
 208
 209                 if not u"population" in tags:
 210                         tags[u"population"] = sirutadict[u"population2002"]
 211
 212                 if u"postal_code" in tags:
 213                         if int(tags[u"postal_code"]) == int(sirutadict[u"old_postal_code"]):
 214                                 tags.pop(u"postal_code")
 215
 216                 if u"addr:postcode" in tags:
 217                         if int(tags[u"addr:postcode"]) == int(sirutadict[u"old_postal_code"]):
 218                                 tags.pop(u"addr:postcode")
 219
 220
 221         else:
 222                 node[u"lat"] = float(sirutadict[u"lat"])
 223                 node[u"lon"] = float(sirutadict[u"lon"])
 224                 tags[u"population"] = sirutadict[u"population2002"]
 225
 226         # consistently add the 1992 census data
 227         tags[u"population:census:1992"] = sirutadict[u"population2002"]
 228
 229         # this should probably be ran even for existing nodes
 230         tags[u"place"] = sirutaTypeToPlace(sirutadict[u"siruta:type"], tags[u"population"])
 231
 232         # clean up siruta:name_sup
 233         sirutadict[u"siruta:name_sup"] = simpleName(sirutadict[u"siruta:name_sup"])
 234
 235         uninteresting = [ \
 236                                 u"lon", \
 237                                 u"lat", \
 238                                 u'siruta:rank', \
 239                                 u"population2002", \
 240                                 u"region", \
 241                                 u"siruta:region_id", \
 242                                 u"siruta:enviro_type", \
 243                                 u"siruta:sortcode" ]
 244
 245         mergetags = sirutadict.copy()
 246         for tag in sirutadict:
 247                 if tag in uninteresting:
 248                         mergetags.pop(tag)
 249
 250         tags.update(mergetags)
 251
 252         simplesup = simpleName(sirutadict[u"siruta:name_sup"])
 253         is_in = [u"România"]
 254         tags[u"is_in:country"] = u"România"
 255         is_in.insert(0,sirutadict[u"siruta:county"])
 256         tags[u"is_in:county"] = sirutadict[u"siruta:county"]
 257         if tags[u"name"] <> simplesup:
 258                 is_in.insert(0,simplesup)
 259
 260         tags[u"is_in"] = u";".join(is_in)
 261
 262         node[u"tag"] = tags
 263
 264         return node
 265
 266 def getSameSiruta(elementlist, sirutacode):
 267         """returns a list with all the elements in list which have the siruta:code == sirutacode"""
 268
 269         newlist = []
 270         for x in elementlist:
 271                 try:
 272                         if x[u"tag"][u"siruta:code"] == sirutacode:
 273                                 newlist.append(x.copy())
 274                 except KeyError:
 275                         pass
 276
 277         return newlist
 278
 279 def readAndProcessSirutaCsv(file, comment = None, source = None):
 280         """reads the input CSV file and processes each entry"""
 281
 282         csvfile = open (file, 'r')
 283         reader = SirutaDictReader( csvfile )
 284
 285         homedir = os.environ['HOME']
 286         api = OsmApi(passwordfile = homedir + '/.config/osm-import-loc/osm-auth', appid = 'RoOsmLocImporter')
 287
 288         if not comment:
 289                 comment = 'import places from ' + file
 290         else:
 291                 comment = unicode(comment,'utf-8')
 292         if not source:
 293                 source = u"http://geo-spatial.org siruta data import"
 294         else:
 295                 source = unicode(source,'utf-8')
 296         cs_tags = { 'comment' : comment , 'source' : source }
 297
 298         api.ChangesetCreate(cs_tags)
 299
 300         for csvplace in reader:
 301
 302                 uname = csvplace[u"name"].encode("utf-8")
 303                 print "Processing data for %s ..." % ( uname )
 304                 sys.stdout.flush()
 305                 map = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
 306                 existing_nodes = locatePlaceInXML ( map, csvplace[u"name"] )
 307
 308                 if len(existing_nodes) == 0:
 309                         # node doesn't exist for this place, or is far; we can create the node
 310                         nodedict = nodeDictForPlace ( csvplace )
 311
 312                         api.NodeCreate(nodedict)
 313                         print "Created new node for %s" % ( uname )
 314
 315                 elif len(existing_nodes) > 1:
 316                         # I am confused, more than one node with the same simplified name
 317                         # try to see if there's already a siruta code attached
 318
 319                         newlist = getSameSiruta( existing_nodes, csvplace[u"siruta:code"] )
 320                         if len(newlist) == 1:
 321                                 existing_nodes = newlist
 322                         else:
 323                                 print >> sys.stderr, "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
 324                                                 uname,
 325                                                 len(existing_nodes),
 326                                                 csvplace[u"lat"].encode("utf-8"),
 327                                                 csvplace[u"lon"].encode("utf-8") )
 328
 329                 if len(existing_nodes) == 1:
 330                         # there is an existing code, so we merge with that
 331                         referencenode = existing_nodes[0].copy()
 332                         # dictionaries don't get copied by default
 333                         referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
 334                         nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )
 335
 336                         if nodedict == referencenode:
 337                                 print "Skipping: No changes needed for node %s" % ( uname )
 338                         else:
 339                                 api.NodeUpdate(nodedict)
 340                                 print "Updated existing node for %s" % ( uname )
 341
 342                 sys.stdout.flush()
 343                 sys.stderr.flush()
 344
 345         api.ChangesetClose()
 346         csvfile.close()
 347
 348 def usage():
 349         print "%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0]
 350
 351 def main(argv = None):
 352
 353         if argv is None:
 354                 argv = sys.argv
 355
 356         try:
 357                 opts, args = getopt.getopt(sys.argv[1:], "hi:c:s:",
 358                                 ["help", "input=", "comment=", "source="] )
 359         except getopt.GetoptError, err:
 360                 # print help information and exit:
 361                 print str(err)
 362                 usage()
 363                 return 2
 364
 365         file = None
 366         comment = None
 367         source = None
 368
 369         for o,a in opts:
 370                 if o in ("-h", "help"):
 371                         usage()
 372                         return 0
 373                 elif o in ("-i", "--input"):
 374                         file = a
 375                 elif o in ("-s", "--source"):
 376                         source = a
 377                 elif o in ("-c", "--comment"):
 378                         comment = a
 379
 380         if not file:
 381                 print "Input csv file (-i option) is mandatory. Run the script with -h for online help."
 382                 return 2
 383
 384
 385         readAndProcessSirutaCsv(file, source=source, comment=comment)
 386
 387
 388 if __name__ == "__main__":
 389         sys.exit(main())