TortoiseGitMerge: Updated libsvn stuff
[TortoiseGit.git] / src / TortoiseMerge / libsvn_diff / utf_validate.c
blob86ccbc8fb1c7fb8b8dbe624b3dbba456fe711b5b
1 /*
2 * utf_validate.c: Validate a UTF-8 string
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
20 * under the License.
21 * ====================================================================
24 /* Validate a UTF-8 string according to the rules in
26 * Table 3-6. Well-Formed UTF-8 Bytes Sequences
28 * in
30 * The Unicode Standard, Version 4.0
32 * which is available at
34 * http://www.unicode.org/
36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
37 * is a subset of that enconding. The Unicode enconding prohibits things
38 * like non-shortest encodings (some characters can be represented by more
39 * than one multi-byte encoding) and the encodings for the surrogate code
40 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed
41 * rules as Unicode. This is the ABNF in RFC-3629 that describes
42 * well-formed UTF-8 rules:
44 * UTF8-octets = *( UTF8-char )
45 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
46 * UTF8-1 = %x00-7F
47 * UTF8-2 = %xC2-DF UTF8-tail
48 * UTF8-3 = %xE0 %xA0-BF UTF8-tail /
49 * %xE1-EC 2( UTF8-tail ) /
50 * %xED %x80-9F UTF8-tail /
51 * %xEE-EF 2( UTF8-tail )
52 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
53 * %xF1-F3 3( UTF8-tail ) /
54 * %xF4 %x80-8F 2( UTF8-tail )
55 * UTF8-tail = %x80-BF
59 #include "private/svn_utf_private.h"
60 #include "private/svn_eol_private.h"
61 #include "private/svn_dep_compat.h"
63 /* Lookup table to categorise each octet in the string. */
64 static const char octet_category[256] = {
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */
75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */
76 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
77 4, 4, /* 0xc0-0xc1 */
78 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */
79 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
80 6, /* 0xe0 */
81 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */
82 8, /* 0xed */
83 9, 9, /* 0xee-0xef */
84 10, /* 0xf0 */
85 11, 11, 11, /* 0xf1-0xf3 */
86 12, /* 0xf4 */
87 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
90 /* Machine states */
91 #define FSM_START 0
92 #define FSM_80BF 1
93 #define FSM_A0BF 2
94 #define FSM_80BF80BF 3
95 #define FSM_809F 4
96 #define FSM_90BF 5
97 #define FSM_80BF80BF80BF 6
98 #define FSM_808F 7
99 #define FSM_ERROR 8
101 /* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
102 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if
103 there is any great benefit in combining categories? It would reduce the
104 memory footprint of the transition table by 16 bytes, but might it be
105 harder to understand? */
107 /* Machine transition table */
108 static const char machine [9][14] = {
109 /* FSM_START */
110 {FSM_START, /* 0x00-0x7f */
111 FSM_ERROR, /* 0x80-0x8f */
112 FSM_ERROR, /* 0x90-0x9f */
113 FSM_ERROR, /* 0xa0-0xbf */
114 FSM_ERROR, /* 0xc0-0xc1 */
115 FSM_80BF, /* 0xc2-0xdf */
116 FSM_A0BF, /* 0xe0 */
117 FSM_80BF80BF, /* 0xe1-0xec */
118 FSM_809F, /* 0xed */
119 FSM_80BF80BF, /* 0xee-0xef */
120 FSM_90BF, /* 0xf0 */
121 FSM_80BF80BF80BF, /* 0xf1-0xf3 */
122 FSM_808F, /* 0xf4 */
123 FSM_ERROR}, /* 0xf5-0xff */
125 /* FSM_80BF */
126 {FSM_ERROR, /* 0x00-0x7f */
127 FSM_START, /* 0x80-0x8f */
128 FSM_START, /* 0x90-0x9f */
129 FSM_START, /* 0xa0-0xbf */
130 FSM_ERROR, /* 0xc0-0xc1 */
131 FSM_ERROR, /* 0xc2-0xdf */
132 FSM_ERROR, /* 0xe0 */
133 FSM_ERROR, /* 0xe1-0xec */
134 FSM_ERROR, /* 0xed */
135 FSM_ERROR, /* 0xee-0xef */
136 FSM_ERROR, /* 0xf0 */
137 FSM_ERROR, /* 0xf1-0xf3 */
138 FSM_ERROR, /* 0xf4 */
139 FSM_ERROR}, /* 0xf5-0xff */
141 /* FSM_A0BF */
142 {FSM_ERROR, /* 0x00-0x7f */
143 FSM_ERROR, /* 0x80-0x8f */
144 FSM_ERROR, /* 0x90-0x9f */
145 FSM_80BF, /* 0xa0-0xbf */
146 FSM_ERROR, /* 0xc0-0xc1 */
147 FSM_ERROR, /* 0xc2-0xdf */
148 FSM_ERROR, /* 0xe0 */
149 FSM_ERROR, /* 0xe1-0xec */
150 FSM_ERROR, /* 0xed */
151 FSM_ERROR, /* 0xee-0xef */
152 FSM_ERROR, /* 0xf0 */
153 FSM_ERROR, /* 0xf1-0xf3 */
154 FSM_ERROR, /* 0xf4 */
155 FSM_ERROR}, /* 0xf5-0xff */
157 /* FSM_80BF80BF */
158 {FSM_ERROR, /* 0x00-0x7f */
159 FSM_80BF, /* 0x80-0x8f */
160 FSM_80BF, /* 0x90-0x9f */
161 FSM_80BF, /* 0xa0-0xbf */
162 FSM_ERROR, /* 0xc0-0xc1 */
163 FSM_ERROR, /* 0xc2-0xdf */
164 FSM_ERROR, /* 0xe0 */
165 FSM_ERROR, /* 0xe1-0xec */
166 FSM_ERROR, /* 0xed */
167 FSM_ERROR, /* 0xee-0xef */
168 FSM_ERROR, /* 0xf0 */
169 FSM_ERROR, /* 0xf1-0xf3 */
170 FSM_ERROR, /* 0xf4 */
171 FSM_ERROR}, /* 0xf5-0xff */
173 /* FSM_809F */
174 {FSM_ERROR, /* 0x00-0x7f */
175 FSM_80BF, /* 0x80-0x8f */
176 FSM_80BF, /* 0x90-0x9f */
177 FSM_ERROR, /* 0xa0-0xbf */
178 FSM_ERROR, /* 0xc0-0xc1 */
179 FSM_ERROR, /* 0xc2-0xdf */
180 FSM_ERROR, /* 0xe0 */
181 FSM_ERROR, /* 0xe1-0xec */
182 FSM_ERROR, /* 0xed */
183 FSM_ERROR, /* 0xee-0xef */
184 FSM_ERROR, /* 0xf0 */
185 FSM_ERROR, /* 0xf1-0xf3 */
186 FSM_ERROR, /* 0xf4 */
187 FSM_ERROR}, /* 0xf5-0xff */
189 /* FSM_90BF */
190 {FSM_ERROR, /* 0x00-0x7f */
191 FSM_ERROR, /* 0x80-0x8f */
192 FSM_80BF80BF, /* 0x90-0x9f */
193 FSM_80BF80BF, /* 0xa0-0xbf */
194 FSM_ERROR, /* 0xc0-0xc1 */
195 FSM_ERROR, /* 0xc2-0xdf */
196 FSM_ERROR, /* 0xe0 */
197 FSM_ERROR, /* 0xe1-0xec */
198 FSM_ERROR, /* 0xed */
199 FSM_ERROR, /* 0xee-0xef */
200 FSM_ERROR, /* 0xf0 */
201 FSM_ERROR, /* 0xf1-0xf3 */
202 FSM_ERROR, /* 0xf4 */
203 FSM_ERROR}, /* 0xf5-0xff */
205 /* FSM_80BF80BF80BF */
206 {FSM_ERROR, /* 0x00-0x7f */
207 FSM_80BF80BF, /* 0x80-0x8f */
208 FSM_80BF80BF, /* 0x90-0x9f */
209 FSM_80BF80BF, /* 0xa0-0xbf */
210 FSM_ERROR, /* 0xc0-0xc1 */
211 FSM_ERROR, /* 0xc2-0xdf */
212 FSM_ERROR, /* 0xe0 */
213 FSM_ERROR, /* 0xe1-0xec */
214 FSM_ERROR, /* 0xed */
215 FSM_ERROR, /* 0xee-0xef */
216 FSM_ERROR, /* 0xf0 */
217 FSM_ERROR, /* 0xf1-0xf3 */
218 FSM_ERROR, /* 0xf4 */
219 FSM_ERROR}, /* 0xf5-0xff */
221 /* FSM_808F */
222 {FSM_ERROR, /* 0x00-0x7f */
223 FSM_80BF80BF, /* 0x80-0x8f */
224 FSM_ERROR, /* 0x90-0x9f */
225 FSM_ERROR, /* 0xa0-0xbf */
226 FSM_ERROR, /* 0xc0-0xc1 */
227 FSM_ERROR, /* 0xc2-0xdf */
228 FSM_ERROR, /* 0xe0 */
229 FSM_ERROR, /* 0xe1-0xec */
230 FSM_ERROR, /* 0xed */
231 FSM_ERROR, /* 0xee-0xef */
232 FSM_ERROR, /* 0xf0 */
233 FSM_ERROR, /* 0xf1-0xf3 */
234 FSM_ERROR, /* 0xf4 */
235 FSM_ERROR}, /* 0xf5-0xff */
237 /* FSM_ERROR */
238 {FSM_ERROR, /* 0x00-0x7f */
239 FSM_ERROR, /* 0x80-0x8f */
240 FSM_ERROR, /* 0x90-0x9f */
241 FSM_ERROR, /* 0xa0-0xbf */
242 FSM_ERROR, /* 0xc0-0xc1 */
243 FSM_ERROR, /* 0xc2-0xdf */
244 FSM_ERROR, /* 0xe0 */
245 FSM_ERROR, /* 0xe1-0xec */
246 FSM_ERROR, /* 0xed */
247 FSM_ERROR, /* 0xee-0xef */
248 FSM_ERROR, /* 0xf0 */
249 FSM_ERROR, /* 0xf1-0xf3 */
250 FSM_ERROR, /* 0xf4 */
251 FSM_ERROR}, /* 0xf5-0xff */
254 /* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
255 * category 0 (FSM_START). Return the position of the first such char
256 * or DATA + MAX_LEN if all were cat 0.
258 static const char *
259 first_non_fsm_start_char(const char *data, apr_size_t max_len)
261 #if !SVN_UNALIGNED_ACCESS_IS_OK
263 /* On some systems, we need to make sure that buf is properly aligned
264 * for chunky data access.
266 if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
268 apr_size_t len = (~(apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
269 if (len > max_len)
270 len = max_len;
271 max_len -= len;
273 for (; len > 0; ++data, --len)
274 if (*data < 0 || *data >= 0x80)
275 return data;
278 #endif
280 /* Scan the input one machine word at a time. */
281 for (; max_len > sizeof(apr_uintptr_t)
282 ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
283 if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
284 break;
286 /* The remaining odd bytes will be examined the naive way: */
287 for (; max_len > 0; ++data, --max_len)
288 if (*data < 0 || *data >= 0x80)
289 break;
291 return data;
294 /* Scan the C string in *DATA for chars that are not in the octet
295 * category 0 (FSM_START). Return the position of either the such
296 * char or of the terminating NUL.
298 static const char *
299 first_non_fsm_start_char_cstring(const char *data)
301 /* We need to make sure that BUF is properly aligned for chunky data
302 * access because we don't know the string's length. Unaligned chunk
303 * read access beyond the NUL terminator could therefore result in a
304 * segfault.
306 for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
307 if (*data <= 0 || *data >= 0x80)
308 return data;
310 /* Scan the input one machine word at a time. */
311 #ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
312 /* This may read allocated but initialised bytes beyond the
313 terminating null. Any such bytes are always readable and this
314 code operates correctly whatever the uninitialised values happen
315 to be. However memory checking tools such as valgrind and GCC
316 4.8's address santitizer will object so this bit of code can be
317 disabled at compile time. */
318 for (; ; data += sizeof(apr_uintptr_t))
320 /* Check for non-ASCII chars: */
321 apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
322 if (chunk & SVN__BIT_7_SET)
323 break;
325 /* This is the well-known strlen test: */
326 chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
327 if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
328 break;
330 #endif
332 /* The remaining odd bytes will be examined the naive way: */
333 for (; ; ++data)
334 if (*data <= 0 || *data >= 0x80)
335 break;
337 return data;
340 const char *
341 svn_utf__last_valid(const char *data, apr_size_t len)
343 const char *start = first_non_fsm_start_char(data, len);
344 const char *end = data + len;
345 int state = FSM_START;
347 data = start;
348 while (data < end)
350 unsigned char octet = *data++;
351 int category = octet_category[octet];
352 state = machine[state][category];
353 if (state == FSM_START)
354 start = data;
356 return start;
359 svn_boolean_t
360 svn_utf__cstring_is_valid(const char *data)
362 int state = FSM_START;
364 if (!data)
365 return FALSE;
367 data = first_non_fsm_start_char_cstring(data);
369 while (*data)
371 unsigned char octet = *data++;
372 int category = octet_category[octet];
373 state = machine[state][category];
375 return state == FSM_START;
378 svn_boolean_t
379 svn_utf__is_valid(const char *data, apr_size_t len)
381 const char *end = data + len;
382 int state = FSM_START;
384 if (!data)
385 return FALSE;
387 data = first_non_fsm_start_char(data, len);
389 while (data < end)
391 unsigned char octet = *data++;
392 int category = octet_category[octet];
393 state = machine[state][category];
395 return state == FSM_START;
398 const char *
399 svn_utf__last_valid2(const char *data, apr_size_t len)
401 const char *start = first_non_fsm_start_char(data, len);
402 const char *end = data + len;
403 int state = FSM_START;
405 data = start;
406 while (data < end)
408 unsigned char octet = *data++;
409 switch (state)
411 case FSM_START:
412 if (octet <= 0x7F)
413 break;
414 else if (octet <= 0xC1)
415 state = FSM_ERROR;
416 else if (octet <= 0xDF)
417 state = FSM_80BF;
418 else if (octet == 0xE0)
419 state = FSM_A0BF;
420 else if (octet <= 0xEC)
421 state = FSM_80BF80BF;
422 else if (octet == 0xED)
423 state = FSM_809F;
424 else if (octet <= 0xEF)
425 state = FSM_80BF80BF;
426 else if (octet == 0xF0)
427 state = FSM_90BF;
428 else if (octet <= 0xF3)
429 state = FSM_80BF80BF80BF;
430 else if (octet <= 0xF4)
431 state = FSM_808F;
432 else
433 state = FSM_ERROR;
434 break;
435 case FSM_80BF:
436 if (octet >= 0x80 && octet <= 0xBF)
437 state = FSM_START;
438 else
439 state = FSM_ERROR;
440 break;
441 case FSM_A0BF:
442 if (octet >= 0xA0 && octet <= 0xBF)
443 state = FSM_80BF;
444 else
445 state = FSM_ERROR;
446 break;
447 case FSM_80BF80BF:
448 if (octet >= 0x80 && octet <= 0xBF)
449 state = FSM_80BF;
450 else
451 state = FSM_ERROR;
452 break;
453 case FSM_809F:
454 if (octet >= 0x80 && octet <= 0x9F)
455 state = FSM_80BF;
456 else
457 state = FSM_ERROR;
458 break;
459 case FSM_90BF:
460 if (octet >= 0x90 && octet <= 0xBF)
461 state = FSM_80BF80BF;
462 else
463 state = FSM_ERROR;
464 break;
465 case FSM_80BF80BF80BF:
466 if (octet >= 0x80 && octet <= 0xBF)
467 state = FSM_80BF80BF;
468 else
469 state = FSM_ERROR;
470 break;
471 case FSM_808F:
472 if (octet >= 0x80 && octet <= 0x8F)
473 state = FSM_80BF80BF;
474 else
475 state = FSM_ERROR;
476 break;
477 default:
478 case FSM_ERROR:
479 return start;
481 if (state == FSM_START)
482 start = data;
484 return start;