Update procedures
[shapes.git] / source / pagescanner.ll
blobebc99e7d09c0a860eff77edb41b03643a15e9a63
1 /* This file is part of Shapes.
2  *
3  * Shapes is free software: you can redistribute it and/or modify
4  * it under the terms of the GNU General Public License as published by
5  * the Free Software Foundation, either version 3 of the License, or
6  * any later version.
7  *
8  * Shapes is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with Shapes.  If not, see <http://www.gnu.org/licenses/>.
15  *
16  * Copyright 2008, 2010 Henrik Tidefelt
17  */
20  * File:        pagescanner.ll
21  * ----------------
22  * Lex input file to generate the scanner for the scanning of length data stored in a page's content stream.
23  */
27 /* The text within this first region delimited by %{ and %} is assumed to
28  * be C/C++ code and will be copied verbatim to the lex.pdf.c file ahead
29  * of the definitions of the pdflex() function. Add other header file inclusions
30  * or C++ variable declarations/prototypes that are needed by your code here.
31  */
33 #include "pdfstructure.h"
34 #include "pdfscanner.h"
35 #include "pdfparser.tab.h"
37 #include <string.h>
38 #include <iostream>
39 #include <iomanip>
40 #include <cstdio> // This is a workaround for a bug in Flex.
/* Byte-conversion helpers; their definitions are in the user subroutines
 * section at the end of this file.
 */
unsigned char hexToChar( char c1, char c2 );
unsigned char octalToChar( char c1, char c2, char c3 );

/* Parenthesis nesting depth inside a literal string: set to 1 when the
 * opening "(" is seen in the INITIAL state, then adjusted by the
 * StringState rules for every further "(" and ")".
 */
size_t stringParenDepth;
48  * Global variable: pdflval
49  * -----------------------
50  * This global variable is how we get attribute information about the token
51  * just scanned to the client. The scanner sets the global variable
52  * appropriately and since it's global the client can just read it. In the
53  * future, this variable will be declared for us in the y.tab.c file
54  * produced by Yacc, but for now, we declare it manually.
55  */
56 //YYSTYPE pdflval;      // manually declared for pp1, later Yacc provides
59  * Global variable: pdflloc
60  * -----------------------
61  * This global variable is how we get position information about the token
62  * just scanned to the client. (Operates similarly to pdflval above)
63  */
64 //struct pdfltype pdflloc; // manually declared for pp1, later Yacc provides
68  /*
69         * The section before the first %% is the Definitions section of the lex
70         * input file. Here is where you set options for the scanner, define lex
71         * states, and can set up definitions to give names to regular expressions
72         * as a simple substitution mechanism that allows for more readable
73         * entries in the Rules section later. 
74         */
WhiteSpace [ \t\n\r]
NonWhiteSpace [^ \t\n\r]

 /* A regular character is anything that is neither white space (as defined
  * above) nor one of the PDF delimiters ( ) < > [ ] { } / %.  Names are built
  * from regular characters only, so that a name ends where the next token
  * begins: "/Type/Page" scans as two names, and "/Kids[" as a name followed
  * by "[", instead of being swallowed into a single name.
  */
RegularChar [^ \t\n\r()<>\[\]{}/%]

PlainInteger [0-9]+
 /* NOTE(review): DecInteger is not used by any rule in the rules section as
  * it stands, so a signed integer without a decimal point is not a token.
  */
DecInteger [+-]?[0-9]+

 /* A decimal point is mandatory; digits on either side of it are optional. */
Float [+-]?[0-9]*[.][0-9]*

Name [/]{RegularChar}*

HexString [<]({WhiteSpace}|[0-9A-Fa-f])*[>]

 /* Any run of characters free of unescaped parentheses; a backslash takes
  * the character after it (newline included) along with it.
  */
ButParentheses ([^()]|(\\(.|\n)))*

%option c++
%option noyywrap

%x StringState
{PlainInteger} {
        /* A run of decimal digits without sign; the converted value itself is the
         * token's attribute.  The end-of-number position is not needed, since the
         * pattern guarantees that all of yytext consists of digits.
         */
        pdflval.plainInt = strtol( yytext, NULL, 10 );
        return T_Int;
}
{Float} {
        /* An optionally signed number containing a decimal point, wrapped in a
         * newly allocated PDF_Float that is handed over to the parser.
         */
        pdflval.pdfObj = new PDF_Float( strtod( yytext, NULL ) );
        return T_Constant;
}
{HexString} {
        /* The bracketed text is stored exactly as scanned, "<" and ">" included.
         * Decoding it (skipping blanks, pairing hex digits into bytes and dealing
         * with a trailing unpaired digit) is deliberately left out, since this
         * would only require more work when writing back to a pdf file.
         */
        pdflval.pdfObj = new PDF_HexString( yytext );
        return T_Constant;
}
<INITIAL>[\(] {
        /* Opening parenthesis of a literal string: enter the exclusive string
         * state with one open parenthesis to balance.  The "(" itself is returned
         * as a single-character token.
         */
        BEGIN( StringState );
        stringParenDepth = 1;
        return yytext[ 0 ];
}
<StringState>[\(] {
        /* A "(" inside a string is ordinary string text, but it has to be
         * balanced by a later ")" before the string can end.  The text is passed
         * on as a strdup'ed copy of yytext.
         */
        pdflval.str = strdup( yytext );
        ++stringParenDepth;
        return T_String;
}
<StringState>[\)] {
        /* A ")" either ends the string (depth drops to zero) or closes a nested
         * pair, in which case it is ordinary string text.
         */
        if( --stringParenDepth == 0 )
                {
                        BEGIN( INITIAL );
                        return yytext[ 0 ];
                }
        pdflval.str = strdup( yytext );
        return T_String;
}
<StringState>{ButParentheses} {
        /* String text between parentheses is passed on verbatim as a strdup'ed
         * copy of yytext.  Escape sequences within the string (backslash followed
         * by n, r, t, b, f, a parenthesis, a backslash or octal digits) are not
         * decoded here, since this would only require more work when writing back
         * to a pdf file.
         */
        pdflval.str = strdup( yytext );
        return T_String;
}
"true" {
        /* The three constant keywords each yield a freshly allocated object. */
        pdflval.pdfObj = new PDF_Boolean( true );
        return T_Constant;
}
"false" {
        pdflval.pdfObj = new PDF_Boolean( false );
        return T_Constant;
}
"null" {
        pdflval.pdfObj = new PDF_Null( );
        return T_Constant;
}

 /* Structural keywords carry no attribute.  The "stream" keyword is only
  * recognized together with the line end (CR LF or LF) that follows it.
  */
"obj"           { return T_obj; }
"endobj"        { return T_endobj; }
"stream"\r?\n   { return T_stream; }
"endstream"     { return T_endstream; }
"R"             { return T_R; }

 /* Dictionary delimiters have tokens of their own; array brackets are
  * returned as single-character tokens.
  */
"<<"            { return T_OpenDic; }
">>"            { return T_CloseDic; }
"["|"]"         { return yytext[ 0 ]; }
{Name} {
        /* The name is stored without its leading "/".  #-sequences within the
         * name are kept as written rather than decoded, since this would only
         * require more work when writing back to a pdf file.
         */
        pdflval.pdfObj = new PDF_Name( yytext + 1 );
        return T_Name;
}
<INITIAL>{WhiteSpace}+ {
        /* White space between tokens is dropped without producing a token. */
}

. {
        /* Any other character is reported by throwing a string literal. */
        throw( "Page scanner found unrecognized token" );
}
309 /* The closing %% above marks the end of the Rules section and the beginning
310  * of the User Subroutines section. All text from here to the end of the
311  * file is copied verbatim to the end of the generated lex.pdf.c file.
312  * This section is where you put definitions of helper functions.
313  */
/* Value of a single hexadecimal digit character.  The input is not
 * validated: characters below 'A' are taken to be decimal digits, those
 * below 'a' to be upper-case letters, and all others lower-case letters.
 */
static unsigned char hexDigitValue( char digit )
{
        if( digit < 'A' )
                {
                        return digit - '0';
                }
        return digit - ( digit < 'a' ? 'A' : 'a' ) + 10;
}

/* Combine two hexadecimal digit characters into one byte, with c1 giving
 * the high four bits and c2 the low four bits.
 */
unsigned char hexToChar( char c1, char c2 )
{
        const unsigned char high = hexDigitValue( c1 );
        const unsigned char low = hexDigitValue( c2 );
        return 16 * high + low;
}
/* Combine three octal digit characters into one byte, c1 being the most
 * significant digit.  The digits are not validated, and a value that does
 * not fit in an unsigned char is reduced to its low bits by the return
 * conversion.
 */
unsigned char octalToChar( char c1, char c2, char c3 )
{
        const unsigned char d1 = c1 - '0';
        const unsigned char d2 = c2 - '0';
        const unsigned char d3 = c3 - '0';
        return ( d1 * 8 + d2 ) * 8 + d3;
}