Add a facility to ignore sections of the file and ignore [IMG ID] ...
[ump2osm.git] / osm-merge-places.py
blobb1d1a2237f40ba1653a3436e9ded96236e27dd34
1 #!/usr/bin/env python
2 # vim: set fileencoding=utf-8 et :
3 # txt2osm, an UnofficialMapProject .txt to OpenStreetMap .osm converter.
4 # Copyright (C) 2008 Mariusz Adamski, rhn
5 # Copyright (C) 2009 Andrzej Zaborowski
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 2
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 # MA 02110-1301, USA.
22 import sys
23 import time
24 import math
25 from xml.sax import saxutils
__version__ = '0.1.1'

# Main program.

# Place nodes read from the first input file, keyed by their node id.
# Each value is a dict holding at least 'id', 'lat', 'lon' and 'att'.
pts = dict()

# Upper bound on the separation between two places for them to be
# treated as the same place.
maxdist = 0.10  # degree

# places = [ 'hamlet', 'village', 'region', 'town', 'city', 'locality' ]
def distance(id, lat, lon):
    """Return the distance, in degrees, between a stored node and a point.

    id  -- key of an entry in the module-level ``pts`` table
    lat -- latitude of the point to compare against
    lon -- longitude of the point to compare against

    The result is the plain Euclidean distance between the raw
    coordinate pairs; no spherical correction is applied.  Raises
    KeyError when ``id`` is not present in ``pts``.
    """
    node = pts[id]
    delta_lat = node['lat'] - lat
    delta_lon = node['lon'] - lon
    return math.hypot(delta_lat, delta_lon)
# Canonical token paired with every spelling or abbreviation that is
# folded onto it.  No spelling appears under more than one token.
_MEXPAND_GROUPS = (
    (u'kolonia', (u'kol', u'kol.')),
    (u'i', (u'pierwszy', u'pierwsze', u'pierwsza', u'1')),
    (u'ii', (u'drugi', u'drugie', u'druga', u'2')),
    (u'iii', (u'trzeci', u'trzecie', u'trzecia', u'3')),
    (u'wlk', (u'wlk.', u'wielkie', u'wielka', u'wielki')),
    (u'dz', (u'dz.', u'duze', u'duza', u'duzy', u'duże', u'duża', u'duży')),
    (u'ml', (u'ml.', u'male', u'mala', u'maly', u'małe', u'mała', u'mały')),
    (u'str', (u'str.', u'stare', u'stara', u'stary', u'st', u'str')),
    (u'nw', (u'nw.', u'nowe', u'nowa', u'nowy', u'n', u'n.')),
    (u'dln', (u'dln.', u'dolne', u'dolna', u'dolny', u'dol', u'dol.')),
    (u'grn', (u'grn.', u'gorne', u'gorna', u'gorny', u'gor', u'gor.',
              u'gór', u'gór.', u'górne', u'górna', u'górny')),
    (u'kolo', (u'k.', u'k', u'koło')),
    (u'mazowiecki', (u'maz', u'maz.', u'mazowieckie', u'mazowiecka')),
    (u'wielkopolski', (u'wlkp', u'wlkp.', u'wielkopolskie',
                       u'wielkopolska')),
    (u'slaski', (u'sl', u'sl.', u'śl', u'śl.', u'slaskie', u'slaska',
                 u'śląskie', u'śląska', u'śląski')),
)

# Flattened view of the table above: spelling -> canonical token.
_MEXPAND_ALIASES = dict(
    (alias, canonical)
    for canonical, aliases in _MEXPAND_GROUPS
    for alias in aliases
)

# Accented letter -> plain replacement, applied to unrecognised words.
_MEXPAND_DIACRITICS = (
    (u'ó', 'o'), (u'ł', 'l'), (u'ź', 'z'),
    (u'ż', 'z'), (u'ą', 'a'), (u'ę', 'e'),
    (u'ć', 'c'), (u'ś', 's'), (u'ń', 'n'),
)


def mexpand(piece):
    """Return the canonical form of a single lower-cased name word.

    Known abbreviations and inflected forms are mapped onto one fixed
    token (for example u'kol.' -> u'kolonia', u'2' -> u'ii').  The
    lookup is done on the word exactly as given, before any accent
    stripping.  Any other word is returned with the accented letters
    listed in ``_MEXPAND_DIACRITICS`` replaced by their plain
    counterparts.
    """
    canonical = _MEXPAND_ALIASES.get(piece)
    if canonical is not None:
        return canonical
    for accented, plain in _MEXPAND_DIACRITICS:
        piece = piece.replace(accented, plain)
    return piece
def msplit(name):
    """Break a place name into a list of canonical lower-case words.

    The name is lower-cased, hyphens are treated as spaces, and the
    result is cut at every space character.  Each non-empty piece is
    stripped of surrounding whitespace and passed through ``mexpand``.
    """
    words = []
    for chunk in name.lower().replace('-', ' ').split(' '):
        word = chunk.strip()
        if word != '':
            words.append(mexpand(word))
    return words
def mnamematch(a, b):
    """Return 1 when two names consist of the same canonical words, else 0.

    Both names are tokenised with ``msplit``.  Word order and repeated
    words are ignored: the names match when every word of one occurs in
    the other and vice versa.
    """
    # TODO generate the splits on load
    words_a = set(msplit(a))
    words_b = set(msplit(b))
    return int(words_a == words_b)
def isround(num):
    """Return 1 when ``num`` is a whole number of sixtieths, else 0.

    The check looks at the fractional part of ``num * 10`` and accepts
    it when it lies within a small tolerance of a multiple of 1/6,
    which is the same as ``num`` being a multiple of 1/60.

    The sign of ``num`` is ignored, so negative coordinates are judged
    like their positive counterparts.  The tolerance also wraps around
    at 1.0, so a value a hair *below* a round one is accepted just like
    a value a hair above it.
    """
    # modf keeps the sign of its argument; the set of accepted
    # fractions is symmetric, so comparing the magnitude is enough.
    fraction = abs(math.modf(num * 10.0)[0])
    epsilon = 0.00001
    # 1.0 is listed so that fractions such as 0.999995 count as "0".
    for target in (0.0, 0.166667, 0.333333, 0.5, 0.666667, 0.833333, 1.0):
        if abs(fraction - target) <= epsilon:
            return 1
    return 0
def trydelete(name, lat, lon, place, id):
    """Flag every stored node that looks like the given place.

    name, place -- 'name' and 'place' tag values of the candidate
    lat, lon    -- position of the candidate as floats
    id          -- node id of the candidate

    A node in ``pts`` is a hit when its name matches according to
    ``mnamematch`` and it lies closer than ``maxdist``.  Each hit gets
    'matches' set to 1 and 'newpos' set to ``(lat, lon, id)``.  Its
    'dont' entry is then set to 3 when either stored coordinate fails
    ``isround``, and afterwards to 1 when the stored name or place
    value is not literally equal to the candidate's; when neither
    applies, 'dont' is left as it was.
    """
    for osmid, node in pts.items():
        tags = node['att']
        is_hit = (mnamematch(tags['name'], name)
                  and distance(osmid, lat, lon) < maxdist)
        if not is_hit:
            continue

        node['matches'] = 1
        node['newpos'] = (lat, lon, id)

        # The later assignment wins, so a tag mismatch overrides the
        # coordinate check.
        if not (isround(node['lat']) and isround(node['lon'])):
            node['dont'] = 3
        if tags['name'] != name or tags['place'] != place:
            node['dont'] = 1
import codecs
import locale

# Select a UTF-8 locale and route stdout/stderr through an encoder
# for that locale's charset, replacing unencodable characters.
try:
    locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
except locale.Error:
    # The named locale is not installed everywhere; carry on with the
    # current one instead of aborting before any work has been done.
    pass
# getlocale() reports no encoding (None) under e.g. the "C" locale;
# fall back to the charset this script was asking for in the first place.
encoding = locale.getlocale()[1] or "UTF-8"
# NOTE(review): wrapping the streams like this presumes they accept
# encoded bytes (Python 2 behaviour) - confirm before running elsewhere.
sys.stdout = codecs.getwriter(encoding)(sys.stdout, errors = "replace")
sys.stderr = codecs.getwriter(encoding)(sys.stderr, errors = "replace")
# Tag keys that do not stop a stored place node from being replaced.
ignore = [ 'is_in', 'name', 'place', 'created_by', 'source' ]


def _attr_value(line, attr):
    """Return the token following ``attr`` (e.g. "ref=") on ``line``.

    Single quotes are turned into spaces first, so for a single-quoted
    attribute the name and its value become adjacent whitespace
    separated tokens.  Raises ValueError when ``attr`` is absent.
    """
    el = line.replace("'", ' ').split()
    return el[el.index(attr) + 1]


def _node_header(line):
    """Return the (id, lat, lon) strings of a ``<node ...`` line."""
    el = line.replace("'", ' ').split()
    return (el[el.index("id=") + 1],
            el[el.index("lat=") + 1],
            el[el.index("lon=") + 1])


def _tag_pair(line):
    """Return (key, value) of a ``k='..' v='..'`` tag line."""
    el = line.split("'")
    return el[1], el[el.index(" v=") + 1]


def _load_places(lines):
    """Store every named place node found in ``lines`` in ``pts``.

    The input is scanned line by line with a small state machine:
    0 = outside any element of interest, 1 = inside a <node>,
    2 = inside a <way>, 3 = inside a <relation>.
    """
    state = 0
    node_id = 0
    lat = ""
    lon = ""
    attrs = {}
    for line in lines:
        if state == 0 and "<node" in line:
            node_id, lat, lon = _node_header(line)
            attrs = {}
            state = 1
            if " />" in line:
                # Self-closing node: there is no body to read.
                state = 0
        elif state == 1 and "k='" in line:
            key, value = _tag_pair(line)
            attrs[key] = value
        elif state == 1 and "</node" in line:
            state = 0
            if 'place' in attrs:
                if 'name' not in attrs:
                    sys.stderr.write("No name for node " + str(node_id) + "\n")
                    continue
                pts[node_id] = {
                    'c': 0,
                    'id': node_id,
                    'lat': float(lat),
                    'lon': float(lon),
                    'att': attrs
                }
                # Any tag outside the ignore list marks the node with
                # 'dont' = 2.
                for key in attrs:
                    if key not in ignore:
                        pts[node_id]['dont'] = 2

        if state == 0 and "<way" in line:
            state = 2
        elif state == 2 and "</way" in line:
            state = 0
        elif state == 2 and "<nd" in line:
            ref = _attr_value(line, "ref=")
            if ref in pts:
                # NOTE(review): this stores the flag inside the tag
                # dict ('att'), where nothing in this script reads it;
                # possibly pts[ref]['dont'] was intended - confirm.
                pts[ref]['att']['dont'] = 1

        if state == 0 and "<relation" in line:
            state = 3
        elif state == 3 and "</relation" in line:
            state = 0
        elif state == 3 and "<member type='node'" in line:
            ref = _attr_value(line, "ref=")
            if ref in pts:
                # NOTE(review): same remark as for way members above.
                pts[ref]['att']['dont'] = 1


def _match_places(lines):
    """Run ``trydelete`` for every named place node found in ``lines``."""
    state = 0
    node_id = 0
    lat = ""
    lon = ""
    attrs = {}
    for line in lines:
        if state == 0 and "<node" in line:
            node_id, lat, lon = _node_header(line)
            attrs = {}
            state = 1
            if " />" in line:
                state = 0
        elif state == 1 and "k='" in line:
            key, value = _tag_pair(line)
            attrs[key] = value
        elif state == 1 and "</node" in line:
            state = 0
            if 'place' in attrs:
                if 'name' not in attrs:
                    sys.stderr.write("No name for node " + str(node_id) + "\n")
                    continue
                trydelete(attrs['name'], float(lat), float(lon),
                          attrs['place'], node_id)


# Pass 1 reads the first file into pts, pass 2 matches the places of
# the second file against it.  Both files are opened up front and are
# closed again even when parsing fails part-way through.
with codecs.open(sys.argv[1], "r", "utf-8") as afile:
    with codecs.open(sys.argv[2], "r", "utf-8") as bfile:
        _load_places(afile)
        _match_places(bfile)
def _with_position(line, el, lat_idx, lon_idx, new_lat, new_lon):
    """Return a <node> line marked as modified and moved to a new spot.

    ``el`` is the tokenised line and ``lat_idx`` / ``lon_idx`` are the
    positions of the current latitude and longitude values within it.
    An existing action attribute is left alone.
    """
    if 'action=' not in el:
        line = line.replace('<node', "<node action='modify'", 1)
    line = line.replace("lat='" + el[lat_idx], "lat='" + str(new_lat))
    line = line.replace("lon='" + el[lon_idx], "lon='" + str(new_lon))
    return line


def _with_delete(line, el):
    """Return a <node> line whose action is 'delete'.

    Raises ValueError when the line already carries an action other
    than 'modify' or 'delete'.
    """
    if 'action=' not in el:
        return line.replace('<node', "<node action='delete'", 1)
    action = el[el.index('action=') + 1]
    if action == 'modify':
        return line.replace('modify', 'delete', 1)
    if action == 'delete':
        return line
    raise ValueError("unexpected node action: " + action)


# Pass 3: copy the first file to stdout, editing the opening line of
# every place node that was matched during the previous pass.
state = 0
out = 1
newline = ""
# Ids of second-file nodes that must not be emitted later on.
deleteme = {}
with codecs.open(sys.argv[1], "r", "utf-8") as afile:
    for line in afile:
        if state == 0 and "<node" in line:
            el = line.replace("'", ' ').split()
            node_id = el[el.index("id=") + 1]
            lat_idx = el.index("lat=") + 1
            lon_idx = el.index("lon=") + 1
            state = 1
            if " />" in line:
                state = 0
            elif node_id in pts and 'matches' in pts[node_id]:
                pt = pts[node_id]
                new_lat, new_lon, new_id = pt['newpos']
                if 'dont' not in pt:
                    # Nothing speaks against dropping this node.
                    if int(node_id) < 0:
                        # Negative id: leave the node out of the
                        # output altogether.
                        out = 0
                    else:
                        line = _with_delete(line, el)
                elif pt['dont'] == 2:
                    # Keep the node but move it onto the matched
                    # position; the matching node is dropped instead.
                    line = _with_position(line, el, lat_idx, lon_idx,
                                          new_lat, new_lon)
                    deleteme[new_id] = 1
                elif pt['dont'] == 3:
                    # Move the node 30% of the way towards the matched
                    # position.
                    vlat = float(el[lat_idx])
                    vlat += 0.3 * (new_lat - vlat)
                    vlon = float(el[lon_idx])
                    vlon += 0.3 * (new_lon - vlon)
                    line = _with_position(line, el, lat_idx, lon_idx,
                                          vlat, vlon)
                    deleteme[new_id] = 1
                else:
                    # Leave both nodes in place and tag this one.
                    newline = " <tag k='mergeme' v='yes' />\n"

        if out:
            sys.stdout.write(line)
            sys.stdout.write(newline)
            newline = ""

        if state == 1 and "</node" in line:
            state = 0
            out = 1
# Pass 4: copy the second file to stdout, leaving out every node whose
# id was recorded in deleteme.
state = 0
out = 1
with codecs.open(sys.argv[2], "r", "utf-8") as bfile:
    for line in bfile:
        if state == 0 and "<node" in line:
            el = line.replace("'", ' ').split()
            node_id = el[el.index("id=") + 1]
            state = 1
            if " />" in line:
                state = 0
            elif node_id in deleteme:
                # Suppress output until the closing </node> is seen.
                out = 0

        if out:
            sys.stdout.write(line)

        if state == 1 and "</node" in line:
            state = 0
            out = 1
299 #for pt in pts:
300 # print "(%s, %s) is from %s\n" % (pt[0], pt[1], pts[pt])