2 * utf_validate.c: Validate a UTF-8 string
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
21 * ====================================================================
24 /* Validate a UTF-8 string according to the rules in
26 * Table 3-6. Well-Formed UTF-8 Bytes Sequences
30 * The Unicode Standard, Version 4.0
32 * which is available at
34 * http://www.unicode.org/
36 * UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
37 * is a subset of that enconding. The Unicode enconding prohibits things
38 * like non-shortest encodings (some characters can be represented by more
39 * than one multi-byte encoding) and the encodings for the surrogate code
40 * points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed
41 * rules as Unicode. This is the ABNF in RFC-3629 that describes
42 * well-formed UTF-8 rules:
44 * UTF8-octets = *( UTF8-char )
45 * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
47 * UTF8-2 = %xC2-DF UTF8-tail
48 * UTF8-3 = %xE0 %xA0-BF UTF8-tail /
49 * %xE1-EC 2( UTF8-tail ) /
50 * %xED %x80-9F UTF8-tail /
51 * %xEE-EF 2( UTF8-tail )
52 * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
53 * %xF1-F3 3( UTF8-tail ) /
54 * %xF4 %x80-8F 2( UTF8-tail )
59 #include "private/svn_utf_private.h"
60 #include "private/svn_eol_private.h"
61 #include "private/svn_dep_compat.h"
63 /* Lookup table to categorise each octet in the string. */
64 static const char octet_category
[256] = {
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */
75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */
76 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
78 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */
79 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
81 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */
85 11, 11, 11, /* 0xf1-0xf3 */
87 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
94 #define FSM_80BF80BF 3
97 #define FSM_80BF80BF80BF 6
101 /* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
102 same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if
103 there is any great benefit in combining categories? It would reduce the
104 memory footprint of the transition table by 16 bytes, but might it be
105 harder to understand? */
107 /* Machine transition table */
108 static const char machine
[9][14] = {
110 {FSM_START
, /* 0x00-0x7f */
111 FSM_ERROR
, /* 0x80-0x8f */
112 FSM_ERROR
, /* 0x90-0x9f */
113 FSM_ERROR
, /* 0xa0-0xbf */
114 FSM_ERROR
, /* 0xc0-0xc1 */
115 FSM_80BF
, /* 0xc2-0xdf */
117 FSM_80BF80BF
, /* 0xe1-0xec */
119 FSM_80BF80BF
, /* 0xee-0xef */
121 FSM_80BF80BF80BF
, /* 0xf1-0xf3 */
123 FSM_ERROR
}, /* 0xf5-0xff */
126 {FSM_ERROR
, /* 0x00-0x7f */
127 FSM_START
, /* 0x80-0x8f */
128 FSM_START
, /* 0x90-0x9f */
129 FSM_START
, /* 0xa0-0xbf */
130 FSM_ERROR
, /* 0xc0-0xc1 */
131 FSM_ERROR
, /* 0xc2-0xdf */
132 FSM_ERROR
, /* 0xe0 */
133 FSM_ERROR
, /* 0xe1-0xec */
134 FSM_ERROR
, /* 0xed */
135 FSM_ERROR
, /* 0xee-0xef */
136 FSM_ERROR
, /* 0xf0 */
137 FSM_ERROR
, /* 0xf1-0xf3 */
138 FSM_ERROR
, /* 0xf4 */
139 FSM_ERROR
}, /* 0xf5-0xff */
142 {FSM_ERROR
, /* 0x00-0x7f */
143 FSM_ERROR
, /* 0x80-0x8f */
144 FSM_ERROR
, /* 0x90-0x9f */
145 FSM_80BF
, /* 0xa0-0xbf */
146 FSM_ERROR
, /* 0xc0-0xc1 */
147 FSM_ERROR
, /* 0xc2-0xdf */
148 FSM_ERROR
, /* 0xe0 */
149 FSM_ERROR
, /* 0xe1-0xec */
150 FSM_ERROR
, /* 0xed */
151 FSM_ERROR
, /* 0xee-0xef */
152 FSM_ERROR
, /* 0xf0 */
153 FSM_ERROR
, /* 0xf1-0xf3 */
154 FSM_ERROR
, /* 0xf4 */
155 FSM_ERROR
}, /* 0xf5-0xff */
158 {FSM_ERROR
, /* 0x00-0x7f */
159 FSM_80BF
, /* 0x80-0x8f */
160 FSM_80BF
, /* 0x90-0x9f */
161 FSM_80BF
, /* 0xa0-0xbf */
162 FSM_ERROR
, /* 0xc0-0xc1 */
163 FSM_ERROR
, /* 0xc2-0xdf */
164 FSM_ERROR
, /* 0xe0 */
165 FSM_ERROR
, /* 0xe1-0xec */
166 FSM_ERROR
, /* 0xed */
167 FSM_ERROR
, /* 0xee-0xef */
168 FSM_ERROR
, /* 0xf0 */
169 FSM_ERROR
, /* 0xf1-0xf3 */
170 FSM_ERROR
, /* 0xf4 */
171 FSM_ERROR
}, /* 0xf5-0xff */
174 {FSM_ERROR
, /* 0x00-0x7f */
175 FSM_80BF
, /* 0x80-0x8f */
176 FSM_80BF
, /* 0x90-0x9f */
177 FSM_ERROR
, /* 0xa0-0xbf */
178 FSM_ERROR
, /* 0xc0-0xc1 */
179 FSM_ERROR
, /* 0xc2-0xdf */
180 FSM_ERROR
, /* 0xe0 */
181 FSM_ERROR
, /* 0xe1-0xec */
182 FSM_ERROR
, /* 0xed */
183 FSM_ERROR
, /* 0xee-0xef */
184 FSM_ERROR
, /* 0xf0 */
185 FSM_ERROR
, /* 0xf1-0xf3 */
186 FSM_ERROR
, /* 0xf4 */
187 FSM_ERROR
}, /* 0xf5-0xff */
190 {FSM_ERROR
, /* 0x00-0x7f */
191 FSM_ERROR
, /* 0x80-0x8f */
192 FSM_80BF80BF
, /* 0x90-0x9f */
193 FSM_80BF80BF
, /* 0xa0-0xbf */
194 FSM_ERROR
, /* 0xc0-0xc1 */
195 FSM_ERROR
, /* 0xc2-0xdf */
196 FSM_ERROR
, /* 0xe0 */
197 FSM_ERROR
, /* 0xe1-0xec */
198 FSM_ERROR
, /* 0xed */
199 FSM_ERROR
, /* 0xee-0xef */
200 FSM_ERROR
, /* 0xf0 */
201 FSM_ERROR
, /* 0xf1-0xf3 */
202 FSM_ERROR
, /* 0xf4 */
203 FSM_ERROR
}, /* 0xf5-0xff */
205 /* FSM_80BF80BF80BF */
206 {FSM_ERROR
, /* 0x00-0x7f */
207 FSM_80BF80BF
, /* 0x80-0x8f */
208 FSM_80BF80BF
, /* 0x90-0x9f */
209 FSM_80BF80BF
, /* 0xa0-0xbf */
210 FSM_ERROR
, /* 0xc0-0xc1 */
211 FSM_ERROR
, /* 0xc2-0xdf */
212 FSM_ERROR
, /* 0xe0 */
213 FSM_ERROR
, /* 0xe1-0xec */
214 FSM_ERROR
, /* 0xed */
215 FSM_ERROR
, /* 0xee-0xef */
216 FSM_ERROR
, /* 0xf0 */
217 FSM_ERROR
, /* 0xf1-0xf3 */
218 FSM_ERROR
, /* 0xf4 */
219 FSM_ERROR
}, /* 0xf5-0xff */
222 {FSM_ERROR
, /* 0x00-0x7f */
223 FSM_80BF80BF
, /* 0x80-0x8f */
224 FSM_ERROR
, /* 0x90-0x9f */
225 FSM_ERROR
, /* 0xa0-0xbf */
226 FSM_ERROR
, /* 0xc0-0xc1 */
227 FSM_ERROR
, /* 0xc2-0xdf */
228 FSM_ERROR
, /* 0xe0 */
229 FSM_ERROR
, /* 0xe1-0xec */
230 FSM_ERROR
, /* 0xed */
231 FSM_ERROR
, /* 0xee-0xef */
232 FSM_ERROR
, /* 0xf0 */
233 FSM_ERROR
, /* 0xf1-0xf3 */
234 FSM_ERROR
, /* 0xf4 */
235 FSM_ERROR
}, /* 0xf5-0xff */
238 {FSM_ERROR
, /* 0x00-0x7f */
239 FSM_ERROR
, /* 0x80-0x8f */
240 FSM_ERROR
, /* 0x90-0x9f */
241 FSM_ERROR
, /* 0xa0-0xbf */
242 FSM_ERROR
, /* 0xc0-0xc1 */
243 FSM_ERROR
, /* 0xc2-0xdf */
244 FSM_ERROR
, /* 0xe0 */
245 FSM_ERROR
, /* 0xe1-0xec */
246 FSM_ERROR
, /* 0xed */
247 FSM_ERROR
, /* 0xee-0xef */
248 FSM_ERROR
, /* 0xf0 */
249 FSM_ERROR
, /* 0xf1-0xf3 */
250 FSM_ERROR
, /* 0xf4 */
251 FSM_ERROR
}, /* 0xf5-0xff */
254 /* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
255 * category 0 (FSM_START). Return the position of the first such char
256 * or DATA + MAX_LEN if all were cat 0.
259 first_non_fsm_start_char(const char *data
, apr_size_t max_len
)
261 #if !SVN_UNALIGNED_ACCESS_IS_OK
263 /* On some systems, we need to make sure that buf is properly aligned
264 * for chunky data access.
266 if ((apr_uintptr_t
)data
& (sizeof(apr_uintptr_t
)-1))
268 apr_size_t len
= (~(apr_uintptr_t
)data
) & (sizeof(apr_uintptr_t
)-1);
273 for (; len
> 0; ++data
, --len
)
274 if (*data
< 0 || *data
>= 0x80)
280 /* Scan the input one machine word at a time. */
281 for (; max_len
> sizeof(apr_uintptr_t
)
282 ; data
+= sizeof(apr_uintptr_t
), max_len
-= sizeof(apr_uintptr_t
))
283 if (*(const apr_uintptr_t
*)data
& SVN__BIT_7_SET
)
286 /* The remaining odd bytes will be examined the naive way: */
287 for (; max_len
> 0; ++data
, --max_len
)
288 if (*data
< 0 || *data
>= 0x80)
294 /* Scan the C string in *DATA for chars that are not in the octet
295 * category 0 (FSM_START). Return the position of either the such
296 * char or of the terminating NUL.
299 first_non_fsm_start_char_cstring(const char *data
)
301 /* We need to make sure that BUF is properly aligned for chunky data
302 * access because we don't know the string's length. Unaligned chunk
303 * read access beyond the NUL terminator could therefore result in a
306 for (; (apr_uintptr_t
)data
& (sizeof(apr_uintptr_t
)-1); ++data
)
307 if (*data
<= 0 || *data
>= 0x80)
310 /* Scan the input one machine word at a time. */
311 #ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
312 /* This may read allocated but initialised bytes beyond the
313 terminating null. Any such bytes are always readable and this
314 code operates correctly whatever the uninitialised values happen
315 to be. However memory checking tools such as valgrind and GCC
316 4.8's address santitizer will object so this bit of code can be
317 disabled at compile time. */
318 for (; ; data
+= sizeof(apr_uintptr_t
))
320 /* Check for non-ASCII chars: */
321 apr_uintptr_t chunk
= *(const apr_uintptr_t
*)data
;
322 if (chunk
& SVN__BIT_7_SET
)
325 /* This is the well-known strlen test: */
326 chunk
|= (chunk
& SVN__LOWER_7BITS_SET
) + SVN__LOWER_7BITS_SET
;
327 if ((chunk
& SVN__BIT_7_SET
) != SVN__BIT_7_SET
)
332 /* The remaining odd bytes will be examined the naive way: */
334 if (*data
<= 0 || *data
>= 0x80)
341 svn_utf__last_valid(const char *data
, apr_size_t len
)
343 const char *start
= first_non_fsm_start_char(data
, len
);
344 const char *end
= data
+ len
;
345 int state
= FSM_START
;
350 unsigned char octet
= *data
++;
351 int category
= octet_category
[octet
];
352 state
= machine
[state
][category
];
353 if (state
== FSM_START
)
360 svn_utf__cstring_is_valid(const char *data
)
362 int state
= FSM_START
;
367 data
= first_non_fsm_start_char_cstring(data
);
371 unsigned char octet
= *data
++;
372 int category
= octet_category
[octet
];
373 state
= machine
[state
][category
];
375 return state
== FSM_START
;
379 svn_utf__is_valid(const char *data
, apr_size_t len
)
381 const char *end
= data
+ len
;
382 int state
= FSM_START
;
387 data
= first_non_fsm_start_char(data
, len
);
391 unsigned char octet
= *data
++;
392 int category
= octet_category
[octet
];
393 state
= machine
[state
][category
];
395 return state
== FSM_START
;
399 svn_utf__last_valid2(const char *data
, apr_size_t len
)
401 const char *start
= first_non_fsm_start_char(data
, len
);
402 const char *end
= data
+ len
;
403 int state
= FSM_START
;
408 unsigned char octet
= *data
++;
414 else if (octet
<= 0xC1)
416 else if (octet
<= 0xDF)
418 else if (octet
== 0xE0)
420 else if (octet
<= 0xEC)
421 state
= FSM_80BF80BF
;
422 else if (octet
== 0xED)
424 else if (octet
<= 0xEF)
425 state
= FSM_80BF80BF
;
426 else if (octet
== 0xF0)
428 else if (octet
<= 0xF3)
429 state
= FSM_80BF80BF80BF
;
430 else if (octet
<= 0xF4)
436 if (octet
>= 0x80 && octet
<= 0xBF)
442 if (octet
>= 0xA0 && octet
<= 0xBF)
448 if (octet
>= 0x80 && octet
<= 0xBF)
454 if (octet
>= 0x80 && octet
<= 0x9F)
460 if (octet
>= 0x90 && octet
<= 0xBF)
461 state
= FSM_80BF80BF
;
465 case FSM_80BF80BF80BF
:
466 if (octet
>= 0x80 && octet
<= 0xBF)
467 state
= FSM_80BF80BF
;
472 if (octet
>= 0x80 && octet
<= 0x8F)
473 state
= FSM_80BF80BF
;
481 if (state
== FSM_START
)