OsmLocImport.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3
   import getopt
   import os
   import re
   # sys is needed unconditionally: for the sys.path tweaks below and by the
   # command line code (sys.argv, sys.stdout, sys.stderr, sys.exit)
   import sys
   import unicodedata
   from math import cos, radians
   from xml.dom import minidom

   # prefer a copy of OsmApi.py found in the current directory
   if os.path.exists('OsmApi.py'):
       sys.path.insert(0,'.')
   from OsmApi import OsmApi

   # NOTE(review): this tests for 'siruta.py' while the module imported below
   # is 'sirutacsv' -- confirm which file name is meant
   if os.path.exists('siruta.py'):
       if not '.' in sys.path:
           sys.path.insert(0,'.')
   from sirutacsv import SirutaDictReader
  22
  23
  def stripAccents(s):
      """turns hyphens into spaces and strips any accents that may exist in 's'

      's' is either unicode text or a UTF-8 encoded byte string (which is decoded first).
      The text is decomposed (NFD) and every combining mark (category 'Mn') is dropped,
      so only the base characters are left. The result is always unicode text.
      """

      # 'bytes' is an alias of 'str' on Python 2, so plain Python 2 strings are decoded too
      if isinstance(s, (bytes, bytearray)):
          s = s.decode('utf-8')

      s = s.replace(u"-", u" ")
      decomposed = unicodedata.normalize('NFD', s)
      return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
  33
  34
  def simplifyName(unicode_name):
      """lowercases 'unicode_name' and then passes it through stripAccents()"""

      return stripAccents(unicode_name.lower())
  42
  def getNodesList(osmxmlcontents):
      """parses every 'node' element of the single 'osm' element in 'osmxmlcontents'

      Each element is converted with OsmApi._DomParseNode and the results are returned
      as a list. A ValueError is raised when more than one 'osm' element is passed in.
      """

      if len(osmxmlcontents) > 1:
          raise ValueError('Too many osm blocks in one XML')

      # only the nodes of the OSM XML are of interest
      node_elements = osmxmlcontents[0].getElementsByTagName ('node')

      # TODO: use an internal data member for the api
      parser = OsmApi()
      return [ parser._DomParseNode(element) for element in node_elements ]
  60
  61
  def getMatchingPlaces(osmapielements,placename):
      """returns the elements of 'osmapielements' that have a non-empty 'place' tag and whose
      'name' tag has the same simplified form as 'placename'"""

      wanted = simplifyName (placename)
      matches = []

      for element in osmapielements:
          try:
              tags = element[u"tag"]
              is_match = len( tags[u"place"] ) > 0 and \
                      wanted == simplifyName(tags[u"name"])
          except KeyError:
              # no 'place' or no 'name' tag on this element, so it is uninteresting
              continue

          if is_match:
              matches.append(element)

      return matches
  78
  79
  def locatePlaceInXML(xml,placename):
      """Returns the nodes of the OSM XML document 'xml' that carry a 'place' tag and whose
      name looks like 'placename'.

      'xml' is either the path of an existing file or the document text itself, in the
      form returned by the OSM API for a bounding box query."""

      if not os.path.exists(xml):
          # not a file on disk, so treat the argument as the XML text
          document = minidom.parseString(xml.encode("utf-8"))
      else:
          document = minidom.parse(xml)

      # each OSM XML has a single root osm element
      nodes = getNodesList(document.getElementsByTagName('osm'))

      # untagged nodes can not be places
      tagged = [ node for node in nodes if len(node[u"tag"]) > 0 ]

      return getMatchingPlaces(tagged, placename)
 100
 def getArea ( bbox_str ):
     """Asks the API for the map inside the bounding box described by the string 'bbox_str'
     and returns whatever the request yields"""

     request_path = "/api/0.6/map?bbox=" + bbox_str

     # TODO: use an internal data member for the api
     return OsmApi()._get ( request_path )
 111
 def getMapAroundPoint(lon, lat, bbox_km = 10):
     """Given the latitude 'lat' and longitude 'lon', get from the API the map around that point
     within a square of roughly 'bbox_km' kilometres per side.

     'lon' and 'lat' are in degrees and may be numbers or numeric strings.
     Returns what getArea() returns for the computed bounding box."""

     lat = float (lat)
     lon = float (lon)

     # one degree latitude is approximately 111km
     # and we want to know what's half of bbox_km in lat degrees
     delta_lat = bbox_km / 222.0

     # one degree longitude only spans cos(lat) * 111km, so covering the same
     # distance takes 1 / cos(lat) times as many degrees of longitude
     shrink = cos( radians (lat) )
     if shrink > 1e-6:
         delta_lon = min( delta_lat / shrink, 180.0 )
     else:
         # at the poles every longitude is within reach
         delta_lon = 180.0

     # the API expects the order left, bottom, right, top
     bbox = "%.6f,%.6f,%.6f,%.6f" % (
             lon - delta_lon,
             lat - delta_lat,
             lon + delta_lon,
             lat + delta_lat )

     return getArea ( bbox )
 138
 def simpleName(placename):
     """Drops from the name of a place the words that indicate its classification.

     The first occurrence of each classification word is removed."""
     stripped = placename
     for classifier in ( u"Municipiul ", u"Oraș " ):
         stripped = stripped.replace(classifier, u"", 1)

     return stripped
 145
 def sirutaTypeToPlace(sirutarank, population):
     """Maps siruta ranks to proper 'place' values. The siruta types are explained below

     Code    Name of the type of administrative territorial unit

      40     County, the municipality of Bucharest
       1     Municipality that is a county seat, seat of the municipality of Bucharest
       2     Town belonging to a county, other than a county seat town
       3     Commune
       4     Municipality, other than a county seat
       5     Town that is a county seat
       6     Sector of the municipality of Bucharest
       9     Component locality, seat of a municipality
      10     Component locality of a municipality, other than the seat of the municipality
      11     Village belonging to a municipality
      17     Component locality, seat of a town
      18     Component locality of a town, other than the seat of the town
      19     Village belonging to a town
      22     Village that is the seat of a commune
      23     Village belonging to a commune, other than the seat of the commune

     'sirutarank' and 'population' may be numbers or numeric strings; 'population' is only
     looked at for rank 23. Raises ValueError for a rank that is not in the table above.
     """

     rank = int(sirutarank)

     # municipalities, county seats, seats of municipalities
     if rank in ( 1, 4, 9, 40 ):
         return u"city"
     # towns, towns that are county seats, seats of towns
     if rank in ( 2, 5, 17 ):
         return u"town"
     # component localities of towns or municipalities, other than their seats
     if rank in ( 10, 18 ):
         return u"village"
     # only non-seat villages belonging to communes can be hamlets
     if rank == 23:
         if int(population) < 50:
             return u"hamlet"
         return u"village"
     # communes, villages of municipalities, villages of towns, commune seats
     if rank in ( 3, 11, 19, 22 ):
         return u"village"
     # sectors of the municipality of Bucharest
     if rank == 6:
         return u"sector"

     raise ValueError("Unexpected rank value in siruta data")
 191
 def nodeDictForPlace(sirutadict, oldnode = None):
     """Creates a proper dictionary structure for the node defined in 'sirutadict' taking into account
     the existing data which is present in 'oldnode'.

     'sirutadict' is one row of the siruta data and is not modified. When 'oldnode' is
     given it is updated in place and returned: its 'population' tag is kept if present,
     and its 'postal_code' / 'addr:postcode' tags are dropped when they equal the row's
     'old_postal_code'. Without 'oldnode' a new node is built from the row's 'lat' and 'lon'.
     Raises KeyError when an expected column is missing from 'sirutadict'."""

     # work on a private copy so that the caller's row is left untouched
     sirutadict = dict(sirutadict)

     # it seems some of the input contains no data for the population and the rank,
     # which probably means 0 for population, and we don't care for rank
     if sirutadict[u"population2002"] == u"":
         sirutadict[u"population2002"] = u"0"

     if oldnode:
         node = oldnode
         tags = oldnode[u"tag"]

         if u"population" not in tags:
             tags[u"population"] = sirutadict[u"population2002"]

         for key in ( u"postal_code", u"addr:postcode" ):
             if key in tags and _samePostalCode(tags[key], sirutadict.get(u"old_postal_code")):
                 tags.pop(key)
     else:
         node = {
                 u"lat" : float(sirutadict[u"lat"]),
                 u"lon" : float(sirutadict[u"lon"]) }
         tags = { u"population" : sirutadict[u"population2002"] }

     # NOTE(review): the key says 1992 but the value comes from the 'population2002'
     # column -- confirm which census is meant before changing either
     tags[u"population:census:1992"] = sirutadict[u"population2002"]

     # this should probably be ran even for existing nodes
     tags[u"place"] = sirutaTypeToPlace(sirutadict[u"siruta:type"], tags[u"population"])

     # clean up siruta:name_sup
     name_sup = simpleName(sirutadict[u"siruta:name_sup"])
     sirutadict[u"siruta:name_sup"] = name_sup

     # columns that must not end up as tags on the node
     uninteresting = (
             u"lon",
             u"lat",
             u'siruta:rank',
             u"population2002",
             u"region",
             u"siruta:region_id",
             u"siruta:enviro_type",
             u"siruta:sortcode" )

     tags.update( dict( (key, value) for key, value in sirutadict.items()
                        if key not in uninteresting ) )

     county = sirutadict[u"siruta:county"]
     tags[u"is_in:country"] = u"România"
     tags[u"is_in:county"] = county

     # most specific place first; the superior unit is skipped when it is the place itself
     is_in = [ county, u"România" ]
     if tags[u"name"] != name_sup:
         is_in.insert(0, name_sup)
     tags[u"is_in"] = u";".join(is_in)

     node[u"tag"] = tags

     return node


 def _samePostalCode(first, second):
     """Tells whether two postal codes are numerically equal; values that are not numbers never match"""
     try:
         return int(first) == int(second)
     except (TypeError, ValueError):
         return False
 265
 266
 def readAndProcessSirutaCsv(file, comment = None, source = None):
     """reads the input CSV file and processes each entry

     'file' is the path of the CSV file. 'comment' and 'source' become the tags of the
     changeset that is opened for the import; UTF-8 byte strings are decoded, and defaults
     are used when they are empty. For every entry the map around its coordinates is
     fetched and searched for a place with the same name: none found creates a new node,
     exactly one found is updated if anything changes, more than one is reported on
     standard error and skipped. Relies on the Python 2 text model for its messages."""

     # expanduser() also works when the HOME environment variable is not set
     passwordfile = os.path.expanduser('~') + '/.config/osm-import-loc/osm-auth'

     if not comment:
         comment = 'import places from ' + file
     elif isinstance(comment, bytes):
         comment = comment.decode('utf-8')

     if not source:
         source = u"http://geo-spatial.org siruta data import"
     elif isinstance(source, bytes):
         source = source.decode('utf-8')

     # the 'with' block closes the input file even when an entry fails
     with open (file, 'r') as csvfile:
         reader = SirutaDictReader( csvfile )

         api = OsmApi(passwordfile = passwordfile, appid = 'RoOsmLocImporter')
         api.ChangesetCreate({ 'comment' : comment , 'source' : source })

         for csvplace in reader:

             # messages carry the name as UTF-8 bytes
             uname = csvplace[u"name"].encode("utf-8")
             print("Processing data for %s ..." % ( uname ))
             sys.stdout.flush()

             area_xml = getMapAroundPoint ( lat = csvplace[u"lat"], lon = csvplace[u"lon"] )
             existing_nodes = locatePlaceInXML ( area_xml, csvplace[u"name"] )

             if len(existing_nodes) == 0:
                 # node doesn't exist for this place, or is far; we can create the node
                 nodedict = nodeDictForPlace ( csvplace )

                 api.NodeCreate(nodedict)
                 print("Created new node for %s" % ( uname ))

             elif len(existing_nodes) == 1:
                 # there is an existing node, so we merge with that; keep a snapshot
                 # of it first because the merge updates the node in place
                 referencenode = existing_nodes[0].copy()
                 # nested dictionaries are not duplicated by copy()
                 referencenode[u"tag"] = existing_nodes[0][u"tag"].copy()
                 nodedict = nodeDictForPlace ( csvplace, existing_nodes[0] )

                 if nodedict == referencenode:
                     print("Skipping: No changes needed for node %s" % ( uname ))
                 else:
                     api.NodeUpdate(nodedict)
                     print("Updated existing node for %s" % ( uname ))

             else:
                 # more than one node with the same simplified name; can not choose
                 message = "Skipping %s: Too many (%d) existing nodes with the same name at (lat=%s,lon=%s)" % (
                         uname,
                         len(existing_nodes),
                         csvplace[u"lat"],
                         csvplace[u"lon"] )
                 sys.stderr.write(message + "\n")

             sys.stdout.flush()
             sys.stderr.flush()

         api.ChangesetClose()
 330
 def usage():
     """prints the command line synopsis of the script on standard output"""
     # a single parenthesised argument prints the same text on Python 2 and Python 3
     print("%s [-c|--comment <comment>] [-s|--source <source>] -i <inputcsv>" % sys.argv[0])
 333
 def main(argv = None):
     """parses the command line in 'argv' (sys.argv when omitted) and runs the import

     'argv' includes the program name as its first item. Returns 0 after a successful
     import or after printing the help, and 2 when the options are wrong or the input
     file is missing."""

     if argv is None:
         argv = sys.argv

     try:
         # parse the arguments that were handed in, not unconditionally sys.argv
         opts, args = getopt.getopt(argv[1:], "hi:c:s:",
                 ["help", "input=", "comment=", "source="] )
     except getopt.GetoptError as err:
         # print help information and exit:
         print(str(err))
         usage()
         return 2

     inputcsv = None
     comment = None
     source = None

     for o,a in opts:
         # getopt reports long options with their leading dashes
         if o in ("-h", "--help"):
             usage()
             return 0
         elif o in ("-i", "--input"):
             inputcsv = a
         elif o in ("-s", "--source"):
             source = a
         elif o in ("-c", "--comment"):
             comment = a

     if not inputcsv:
         print("Input csv file (-i option) is mandatory. Run the script with -h for online help.")
         return 2

     readAndProcessSirutaCsv(inputcsv, source=source, comment=comment)
     return 0
 369
 370
 if __name__ == "__main__":
     # only when run as a script: the value returned by main() becomes the exit status
     exit_status = main()
     sys.exit(exit_status)