RIP, Vernon...
[ttfautohint.git] / lib / afblue.pl
blob39fe5c584a85c014a820eb86271b9de1f112969e
1 #! /usr/bin/perl -w
2 # -*- Perl -*-
4 # afblue.pl
6 # Process a blue zone character data file.
8 # Copyright 2013-2016 by
9 # David Turner, Robert Wilhelm, and Werner Lemberg.
11 # This file is part of the FreeType project, and may only be used,
12 # modified, and distributed under the terms of the FreeType project
13 # license, LICENSE.TXT. By continuing to use, modify, or distribute
14 # this file you indicate that you have read the license and
15 # understand and accept it fully.
17 use strict;
18 use warnings;
19 use English '-no_match_vars';
20 use open ':std', ':encoding(UTF-8)';
23 my $prog = $PROGRAM_NAME;
24 $prog =~ s| .* / ||x; # Remove path.
26 die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0;
29 my $datafile = $ARGV[0];
31 my %diversions; # The extracted and massaged data from `datafile'.
32 my @else_stack; # Booleans to track else-clauses.
33 my @name_stack; # Stack of integers used for names of aux. variables.
35 my $curr_enum; # Name of the current enumeration.
36 my $curr_array; # Name of the current array.
37 my $curr_max; # Name of the current maximum value.
39 my $curr_enum_element; # Name of the current enumeration element.
40 my $curr_offset; # The offset relative to current aux. variable.
41 my $curr_elem_size; # The number of non-space characters in the current string or
42 # the number of elements in the current block.
44 my $have_sections = 0; # Boolean; set if start of a section has been seen.
45 my $have_strings; # Boolean; set if current section contains strings.
46 my $have_blocks; # Boolean; set if current section contains blocks.
48 my $have_enum_element; # Boolean; set if we have an enumeration element.
49 my $in_string; # Boolean; set if a string has been parsed.
51 my $num_sections = 0; # Number of sections seen so far.
53 my $last_aux; # Name of last auxiliary variable.
56 # Regular expressions.
58 # [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n'
59 my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x;
61 # [<ws>] <enum_element_name> [<ws>] '\n'
62 my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x;
64 # '#' <preprocessor directive> '\n'
65 my $preprocessor_re = qr/ ^ \# /x;
67 # [<ws>] '/' '/' <comment> '\n'
68 my $comment_re = qr| ^ \s* // |x;
70 # empty line
71 my $whitespace_only_re = qr/ ^ \s* $ /x;
73 # [<ws>] '"' <string> '"' [<ws>] '\n' (<string> doesn't contain newlines)
74 my $string_re = qr/ ^ \s*
75 " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) "
76 \s* $ /x;
78 # [<ws>] '{' <block> '}' [<ws>] '\n' (<block> can contain newlines)
79 my $block_start_re = qr/ ^ \s* \{ /x;
81 # We need the capturing group for `split' to make it return the separator
82 # tokens (i.e., the opening and closing brace) also.
83 my $brace_re = qr/ ( [{}] ) /x;
86 sub Warn
88 my $message = shift;
89 warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n";
93 sub Die
95 my $message = shift;
96 die "$datafile:$INPUT_LINE_NUMBER: error: $message\n";
100 my $warned_before = 0;
102 sub warn_before
104 Warn("data before first section gets ignored") unless $warned_before;
105 $warned_before = 1;
109 sub strip_newline
111 chomp;
112 s/ \x0D $ //x;
116 sub end_curr_string
118 # Append final null byte to string.
119 if ($have_strings)
121 push @{$diversions{$curr_array}}, " '\\0',\n" if $in_string;
123 $curr_offset++;
124 $in_string = 0;
129 sub update_max_elem_size
131 if ($curr_elem_size)
133 my $max = pop @{$diversions{$curr_max}};
134 $max = $curr_elem_size if $curr_elem_size > $max;
135 push @{$diversions{$curr_max}}, $max;
140 sub convert_non_ascii_char
142 # A UTF-8 character outside of the printable ASCII range, with possibly a
143 # leading backslash character.
144 my $s = shift;
146 # Here we count characters, not bytes.
147 $curr_elem_size += length $s;
149 utf8::encode($s);
150 $s = uc unpack 'H*', $s;
152 $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg;
154 return $s;
158 sub convert_ascii_chars
160 # A series of ASCII characters in the printable range.
161 my $s = shift;
163 # We reduce multiple space characters to a single one.
164 $s =~ s/ +/ /g;
166 # Count all non-space characters. Note that `()' applies a list context
167 # to the capture that is used to count the elements.
168 $curr_elem_size += () = $s =~ /[^ ]/g;
170 $curr_offset += $s =~ s/\G(.)/'$1', /g;
172 return $s;
176 sub convert_literal
178 my $s = shift;
179 my $orig = $s;
181 # ASCII printables and space
182 my $safe_re = '\x20-\x7E';
183 # ASCII printables and space, no backslash
184 my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E';
186 $s =~ s{
187 (?: \\? ( [^$safe_re] )
188 | ( (?: [$safe_no_backslash_re]
189 | \\ [$safe_re] )+ ) )
192 defined($1) ? convert_non_ascii_char($1)
193 : convert_ascii_chars($2)
194 }egx;
196 # We assume that `$orig' doesn't contain `*/'
197 return $s . " /* $orig */";
201 sub aux_name
203 return "af_blue_" . $num_sections. "_" . join('_', @name_stack);
207 sub aux_name_next
209 $name_stack[$#name_stack]++;
210 my $name = aux_name();
211 $name_stack[$#name_stack]--;
213 return $name;
217 sub enum_val_string
219 # Build string that holds code to save the current offset in an
220 # enumeration element.
221 my $aux = shift;
223 my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" )
224 ? ""
225 : "$last_aux + ";
227 return " $aux = $add$curr_offset,\n";
232 # Process data file.
234 open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n";
236 while (<DATA>)
238 strip_newline();
240 next if /$comment_re/;
241 next if /$whitespace_only_re/;
243 if (/$section_re/)
245 Warn("previous section is empty") if ($have_sections
246 && !$have_strings
247 && !$have_blocks);
249 end_curr_string();
250 update_max_elem_size();
252 # Save captured groups from `section_re'.
253 $curr_enum = $1;
254 $curr_array = $2;
255 $curr_max = $3;
257 $curr_enum_element = "";
258 $curr_offset = 0;
260 Warn("overwriting already defined enumeration \`$curr_enum'")
261 if exists($diversions{$curr_enum});
262 Warn("overwriting already defined array \`$curr_array'")
263 if exists($diversions{$curr_array});
264 Warn("overwriting already defined maximum value \`$curr_max'")
265 if exists($diversions{$curr_max});
267 $diversions{$curr_enum} = [];
268 $diversions{$curr_array} = [];
269 $diversions{$curr_max} = [];
271 push @{$diversions{$curr_max}}, 0;
273 @name_stack = ();
274 push @name_stack, 0;
276 $have_sections = 1;
277 $have_strings = 0;
278 $have_blocks = 0;
280 $have_enum_element = 0;
281 $in_string = 0;
283 $num_sections++;
284 $curr_elem_size = 0;
286 $last_aux = aux_name();
288 next;
291 if (/$preprocessor_re/)
293 if ($have_sections)
295 # Having preprocessor conditionals complicates the computation of
296 # correct offset values. We have to introduce auxiliary enumeration
297 # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store
298 # offsets to be used in conditional clauses. `<s>' is the number of
299 # sections seen so far, `<n1>' is the number of `#if' and `#endif'
300 # conditionals seen so far in the topmost level, `<n2>' the number of
301 # `#if' and `#endif' conditionals seen so far one level deeper, etc.
302 # As a consequence, uneven values are used within a clause, and even
303 # values after a clause, since the C standard doesn't allow the
304 # redefinition of an enumeration value. For example, the name
305 # `af_blue_5_1_6' is used to construct enumeration values in the fifth
306 # section after the third (second-level) if-clause within the first
307 # (top-level) if-clause. After the first top-level clause has
308 # finished, `af_blue_5_2' is used. The current offset is then
309 # relative to the value stored in the current auxiliary element.
311 if (/ ^ \# \s* if /x)
313 push @else_stack, 0;
315 $name_stack[$#name_stack]++;
317 push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
318 $last_aux = aux_name();
320 push @name_stack, 0;
322 $curr_offset = 0;
324 elsif (/ ^ \# \s* elif /x)
326 Die("unbalanced #elif") unless @else_stack;
328 pop @name_stack;
330 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
331 $last_aux = aux_name();
333 push @name_stack, 0;
335 $curr_offset = 0;
337 elsif (/ ^ \# \s* else /x)
339 my $prev_else = pop @else_stack;
340 Die("unbalanced #else") unless defined($prev_else);
341 Die("#else already seen") if $prev_else;
342 push @else_stack, 1;
344 pop @name_stack;
346 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
347 $last_aux = aux_name();
349 push @name_stack, 0;
351 $curr_offset = 0;
353 elsif (/ ^ (\# \s*) endif /x)
355 my $prev_else = pop @else_stack;
356 Die("unbalanced #endif") unless defined($prev_else);
358 pop @name_stack;
360 # If there is no else-clause for an if-clause, we add one. This is
361 # necessary to have correct offsets.
362 if (!$prev_else)
364 # Use amount of whitespace from `endif'.
365 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next())
366 . $1 . "else\n";
367 $last_aux = aux_name();
369 $curr_offset = 0;
372 $name_stack[$#name_stack]++;
374 push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
375 $last_aux = aux_name();
377 $curr_offset = 0;
380 # Handle (probably continued) preprocessor lines.
381 CONTINUED_LOOP:
385 strip_newline();
387 push @{$diversions{$curr_enum}}, $ARG . "\n";
388 push @{$diversions{$curr_array}}, $ARG . "\n";
390 last CONTINUED_LOOP unless / \\ $ /x;
392 } while (<DATA>);
395 else
397 warn_before();
400 next;
403 if (/$enum_element_re/)
405 end_curr_string();
406 update_max_elem_size();
408 $curr_enum_element = $1;
409 $have_enum_element = 1;
410 $curr_elem_size = 0;
412 next;
415 if (/$string_re/)
417 if ($have_sections)
419 Die("strings and blocks can't be mixed in a section") if $have_blocks;
421 # Save captured group from `string_re'.
422 my $string = $1;
424 if ($have_enum_element)
426 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
427 $have_enum_element = 0;
430 $string = convert_literal($string);
432 push @{$diversions{$curr_array}}, " $string\n";
434 $have_strings = 1;
435 $in_string = 1;
437 else
439 warn_before();
442 next;
445 if (/$block_start_re/)
447 if ($have_sections)
449 Die("strings and blocks can't be mixed in a section") if $have_strings;
451 my $depth = 0;
452 my $block = "";
453 my $block_end = 0;
455 # Count braces while getting the block.
456 BRACE_LOOP:
460 strip_newline();
462 foreach my $substring (split(/$brace_re/))
464 if ($block_end)
466 Die("invalid data after last matching closing brace")
467 if $substring !~ /$whitespace_only_re/;
470 $block .= $substring;
472 if ($substring eq '{')
474 $depth++;
476 elsif ($substring eq '}')
478 $depth--;
480 $block_end = 1 if $depth == 0;
484 # If we are here, we have run out of substrings, so get next line
485 # or exit.
486 last BRACE_LOOP if $block_end;
488 $block .= "\n";
490 } while (<DATA>);
493 if ($have_enum_element)
495 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
496 $have_enum_element = 0;
499 push @{$diversions{$curr_array}}, $block . ",\n";
501 $curr_offset++;
502 $curr_elem_size++;
504 $have_blocks = 1;
506 else
508 warn_before();
511 next;
514 # Garbage. We weren't able to parse the data.
515 Die("syntax error");
518 # Finalize data.
519 end_curr_string();
520 update_max_elem_size();
523 # Filter stdin to stdout, replacing `@...@' templates.
525 sub emit_diversion
527 my $diversion_name = shift;
528 return (exists($diversions{$1})) ? "@{$diversions{$1}}"
529 : "@" . $diversion_name . "@";
533 $LIST_SEPARATOR = '';
535 my $s1 = "This file has been generated by the Perl script \`$prog',";
536 my $s1len = length $s1;
537 my $s2 = "using data from file \`$datafile'.";
538 my $s2len = length $s2;
539 my $slen = ($s1len > $s2len) ? $s1len : $s2len;
541 print "/* " . $s1 . " " x ($slen - $s1len) . " */\n"
542 . "/* " . $s2 . " " x ($slen - $s2len) . " */\n"
543 . "\n";
545 while (<STDIN>)
547 s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx;
548 print;
551 # EOF