#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Requirements: Imposm (Omniscale)
#               sirutalib (http://proiecte.strainu.ro/siruta/)
#               localitati2008_utf8.csv (geo-spatial.org)
#
# This script compares the places in OSM with the siruta database. It checks
# for places without siruta:code, for contradictions between names in OSM and
# the siruta database, for places with equal siruta:code and for places which
# are missing in OSM. With the help of the data from geo-spatial.org a .osm
# file is created which can be used to reimport missing places.
# Note: With the extracts from Geofabrik some false positives are reported
# close to the border because they use a simplified polygon for cutting.
#
# © Michael Häckel, GPLv3
#
# Provenance: contrib/searchplaces.py as added by commit
# d1f9086013da37e0913404fdaa63c0c856e2fc40 (Michael Häckel,
# Sun, 2 Dec 2012 10:36:06 +0100),
# "add script for comparing places in OSM with siruta".
#
# NOTE(review): this text was recovered from a copy of that patch in which
# line breaks and indentation were collapsed and everything between '<' and
# '>' was removed.  Block nesting below is reconstructed from the syntax, and
# the string literals that are flagged further down have lost content.
# Compare with the upstream file before relying on the generated .osm output.

import codecs
import csv
import re
import subprocess

from imposm.parser import OSMParser
import sirutalib

siruta = sirutalib.SirutaDatabase()
nodeCounter = 1
# Set as a side effect of writePlaceInformation() for the last place written.
is_in = u''
county = u''


def formatName(input):
    """Return *input* with every character lower-cased except word initials.

    A character is left untouched when it is the first one or directly
    follows a space, a hyphen or a dot; all other characters are lower-cased.
    Afterwards the particles " De ", " La " and " Din " are lower-cased.
    """
    pieces = []
    first = True
    for c in input:
        pieces.append(c if first else c.lower())
        first = c in (' ', '-', '.')
    output = u''.join(pieces)
    # NOTE(review): as extracted, the last replace() maps a single space to a
    # single space, which does nothing.  The upstream literal was presumably
    # a run of several spaces -- confirm before changing it.
    return output.replace(u' De ', u' de ').replace(u' La ', u' la ') \
        .replace(u' Din ', u' din ').replace(u' ', ' ')


placeList = []


def makeList():
    """Parse 'romania.osm.pbf' and fill the module-level ``placeList``.

    Every node, way and relation that carries a ``place`` tag becomes one
    dict in ``placeList`` with the keys 'id', 'place', 'siruta' (0 when the
    object has no siruta:code) and, when available, 'siruta_name' and 'name'.
    Objects whose OSM name (and alt_name, if present) differs from the name
    stored in the siruta database are reported in
    'places-contradictions.txt'.
    """
    class NameSearch(object):
        """Callback holder handed to the OSM parser."""

        def writeEntry(self, objecttype, osmid, tags, refs):
            """Record one OSM object; *objecttype* is 'n', 'w' or 'r'."""
            # Every object with a place tag is accepted; objects that also
            # carry boundary/admin_level tags, or that are not nodes, are
            # deliberately not filtered out.
            if u'place' not in tags:
                return
            listEntry = {'id': objecttype + str(osmid),
                         'place': tags[u'place'],
                         'siruta': 0}
            if u'siruta:code' in tags:
                code = tags[u'siruta:code']
                # int() yields the same value as Python 2's long() for any
                # magnitude and also exists on Python 3.
                sirutaCode = int(code)
                sirutaName = formatName(siruta.get_name(sirutaCode))
                listEntry['siruta'] = sirutaCode
                listEntry['siruta_name'] = sirutaName
                if u'name' in tags:
                    nameDiffers = \
                        sirutaName.lower() != tags[u'name'].lower()
                    altNameDiffers = \
                        (u'alt_name' not in tags) or \
                        sirutaName.lower() != tags[u'alt_name'].lower()
                    if nameDiffers and altNameDiffers:
                        f.write(code + ' ' + sirutaName + ' != ' + objecttype)
                        f.write(str(osmid) + ' ' + tags[u'name'] + '\n')
            if u'name' in tags:
                listEntry['name'] = tags[u'name']
            placeList.append(listEntry)

        def nodes(self, nodes):
            for osmid, tags, refs in nodes:
                self.writeEntry('n', osmid, tags, refs)

        def ways(self, ways):
            for osmid, tags, refs in ways:
                self.writeEntry('w', osmid, tags, refs)

        def relations(self, relations):
            for osmid, tags, refs in relations:
                self.writeEntry('r', osmid, tags, refs)

    # The with-block closes the report even when parsing raises.
    with codecs.open('places-contradictions.txt', 'w', 'utf8') as f:
        search = NameSearch()
        p = OSMParser(concurrency=4,
                      nodes_callback=search.nodes,
                      ways_callback=search.ways,
                      relations_callback=search.relations)
        p.parse('romania.osm.pbf')


def toAscii(input):
    """Return a lower-case, diacritic-free key used to compare place names.

    Romanian diacritics are replaced by their base letters, the result is
    lower-cased, and the prefixes "oras " and "municipiul " are removed.
    """
    return input.replace(u'ș', u's').replace(u'ț', u't') \
        .replace(u'â', u'a').replace(u'ă', u'a').replace(u'î', u'i') \
        .replace(u'Ș', u's').replace(u'Ț', u't').replace(u'Â', u'a') \
        .replace(u'Ă', u'a').replace(u'Î', u'i').lower() \
        .replace(u'oras ', u'').replace(u'municipiul ', u'')


def writePlaceInformation(f, code):
    """Write the tag proposal for siruta *code* to the text file *f*.

    Emits name, siruta:code, postal_code, is_in:county (for everything
    below county level), is_in:country and is_in, followed by a blank line.
    As a side effect the module-level ``is_in`` and ``county`` are set to
    the values computed for this place.
    """
    global is_in, county
    f.write(u'name = ' + formatName(siruta.get_name(code)) + '\n')
    f.write(u'siruta:code = ' + str(code) + '\n')
    f.write(u'postal_code = ' + str(siruta.get_postal_code(code)) + '\n')
    is_in = u''
    county = u''
    if siruta.get_type(code) == 40:
        # Type 40 is the level whose name is written as is_in:county below,
        # so such an entry only lies within the country.
        is_in = u'România'
    else:
        # Walk up the hierarchy until the type-40 ancestor is reached,
        # collecting every ancestor name on the way.
        # NOTE(review): this loop only ends when a type-40 ancestor exists.
        supcode = code
        while True:
            supcode = siruta.get_sup_code(supcode)
            if is_in != u'':
                is_in += u';'
            supname = formatName(
                siruta.get_name(supcode)
                .replace(u'JUDEȚUL ', u'')
                .replace(u'ORAȘ ', u'')
                .replace(u'MUNICIPIUL ', u''))
            is_in += supname
            if siruta.get_type(supcode) == 40:
                f.write(u'is_in:county = ' + supname + '\n')
                county = supname
                is_in += u';România'
                break
    f.write(u'is_in:country = România' + '\n')
    f.write(u'is_in = ' + is_in + '\n')
    f.write('\n')


def searchSirutaMatches():
    """List OSM places without siruta:code together with siruta candidates.

    For every named entry of ``placeList`` whose 'siruta' is 0, one header
    line (id, place type, name) is written to 'places-without-siruta.txt',
    followed by the tag proposal of every siruta entry whose normalised name
    (see toAscii) equals the normalised OSM name.
    """
    # Normalise each siruta name once instead of once per OSM place.  The
    # per-name lists keep the iteration order of siruta._data.values(), so
    # candidates are written in the same order as a nested scan would give.
    codesByAsciiName = {}
    for entry in siruta._data.values():
        codesByAsciiName.setdefault(
            toAscii(entry['name']), []).append(entry['siruta'])

    with codecs.open('places-without-siruta.txt', 'w', 'utf8') as f:
        for place in placeList:
            if place['siruta'] != 0 or 'name' not in place:
                continue
            f.write(place['id'] + '\t')
            if 'place' in place:
                f.write(place['place'])
            f.write('\t' + place['name'] + '\n')
            for code in codesByAsciiName.get(toAscii(place['name']), ()):
                writePlaceInformation(f, code)
            f.write('\n')


def searchDoubleEntries():
    """Report OSM places that share one siruta code in 'place-doubles.txt'.

    Only neighbouring entries of ``placeList`` are compared, so the list
    must already be sorted by 'siruta' when this function is called.
    """
    with codecs.open('place-doubles.txt', 'w', 'utf8') as f:
        lastSiruta = 0
        lastId = ''
        for place in placeList:
            if place['siruta'] == 0:
                continue
            if place['siruta'] == lastSiruta:
                f.write(lastId + ', ' + place['id'] + '\n')
                if 'name' in place:
                    f.write(place['name'] + '\n')
                f.write('\n')
            lastSiruta = place['siruta']
            lastId = place['id']


def writePlaceNode(f, x, y, code):
    """Write one node for a missing place to the .osm file *f*.

    Increments the module-level ``nodeCounter`` once per call.

    NOTE(review): every literal below is reproduced exactly as it survived
    extraction.  As extracted they contain only whitespace and *x*, *y* and
    *code* are unused; the upstream literals presumably held the node and
    tag markup built from those arguments and from ``is_in`` / ``county``.
    Restore them from the upstream file -- they cannot be recovered here.
    """
    global nodeCounter
    f.write(u" \n")
    nodeCounter += 1
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")
    f.write(u" \n")


def searchMissingPlaces():
    """Report siruta entries that no OSM place refers to.

    Reads coordinates from 'localitati2008_utf8.csv' (column 3 holds the
    siruta code, columns 0 and 1 are passed on as x and y; the row whose
    column 3 is the literal 'SIRUTA' is the header).  For every siruta code
    in ascending order that is absent from ``placeList`` and whose type is
    not one of 1-6 or 40, the tag proposal is appended to
    'places-missing.txt' and, when coordinates are known, a node is
    appended to 'places-missing.osm'.
    """
    geodict = {}
    with open('localitati2008_utf8.csv', 'r') as geofile:
        for row in csv.reader(geofile):
            if row[3] != 'SIRUTA':
                geodict[int(row[3])] = [row[0], row[1]]

    # A membership set replaces stepping through the sorted placeList by
    # hand: it also works when placeList is empty and does not depend on
    # the list having been sorted beforehand.
    presentCodes = set(place['siruta'] for place in placeList)
    codes = sorted(entry['siruta'] for entry in siruta._data.values())

    with codecs.open('places-missing.txt', 'w', 'utf8') as f:
        with codecs.open('places-missing.osm', 'w', 'utf8') as g:
            # NOTE(review): the two header literals and the trailer literal
            # lost their content during extraction (presumably the XML
            # declaration and the opening/closing root element) -- restore
            # them from the upstream file.
            g.write("\n")
            g.write("\n")
            for code in codes:
                if code in presentCodes:
                    continue
                # Type 40 is the county level (see writePlaceInformation);
                # NOTE(review): types 1-6 are presumably the other units
                # that are not mapped as single place nodes -- confirm.
                if siruta.get_type(code) in (1, 2, 3, 4, 5, 6, 40):
                    continue
                coords = geodict.get(code)
                if coords is not None:
                    f.write(coords[0] + ',' + coords[1] + '\n')
                writePlaceInformation(f, code)
                if coords is not None:
                    writePlaceNode(g, coords[0], coords[1], code)
            g.write("")


print('Making list of places.')
makeList()

print('Sorting.')
placeList.sort(key=lambda entry: entry['siruta'])

print('Search matches in Siruta database.')
searchSirutaMatches()

print('Search double places in OSM.')
searchDoubleEntries()

print('Search missing places.')
searchMissingPlaces()

print('Finished.')