Updating the changelog in the VERSION file, and version_sync.
[shapes.git] / source / pagescanner.ll
blobe31773c73c16ad6858f65f0835fc1ba4734c4001
1 /* This file is part of Shapes.
2  *
3  * Shapes is free software: you can redistribute it and/or modify
4  * it under the terms of the GNU General Public License as published by
5  * the Free Software Foundation, either version 3 of the License, or
6  * any later version.
7  *
8  * Shapes is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with Shapes.  If not, see <http://www.gnu.org/licenses/>.
15  *
16  * Copyright 2008 Henrik Tidefelt
17  */
20  * File:        pagescanner.l
21  * ----------------
22  * Lex input file to generate the scanner for the scanning of length data stored in a page's content stream.
23  */
27 /* The text within this first region delimited by %{ and %} is assumed to
28  * be C/C++ code and will be copied verbatim to the lex.pdf.c file ahead
29  * of the definitions of the pdflex() function. Add other header file inclusions
30  * or C++ variable declarations/prototypes that are needed by your code here.
31  */
33 #include "pdfstructure.h"
34 #include "pdfscanner.h"
35 #include "pdfparser.tab.h"
36 #include <string.h>
37 #include <iostream>
38 #include <iomanip>
40 unsigned char hexToChar( char c1, char c2 );
41 unsigned char octalToChar( char c1, char c2, char c3 );
43 size_t stringParenDepth;
46  * Global variable: pdflval
47  * -----------------------
48  * This global variable is how we get attribute information about the token
49  * just scanned to the client. The scanner sets the global variable
50  * appropriately and since it's global the client can just read it.     In the
51  * future, this variable will be declared for us in the y.tab.c file
52  * produced by Yacc, but for now, we declare it manually.
53  */
54 //YYSTYPE pdflval;      // manually declared for pp1, later Yacc provides
57  * Global variable: pdflloc
58  * -----------------------
59  * This global variable is how we get position information about the token
60  * just scanned to the client. (Operates similarly to pdflval above)
61  */
62 //struct pdfltype pdflloc; // manually declared for pp1, later Yacc provides
66  /*
67         * The section before the first %% is the Definitions section of the lex
68         * input file. Here is where you set options for the scanner, define lex
69         * states, and can set up definitions to give names to regular expressions
70         * as a simple substitution mechanism that allows for more readable
71         * entries in the Rules section later. 
72         */
WhiteSpace               [ \t\n\r]
NonWhiteSpace [^ \t\n\r]
PlainInteger [0-9]+
DecInteger [+-]?[0-9]+
Float [+-]?[0-9]*[.][0-9]*
 /* A name ends at white space or at any PDF delimiter character, so that
  * input such as "/Type/Page" or "[/Name]" is not swallowed as a single name.
  */
Name [/][^ \t\n\r()<>\[\]\{\}/%]*
HexString [<]({WhiteSpace}|[0-9A-Fa-f])*[>]
 /* A backslash is only accepted as the start of a two-character escape.
  * If it could also match as an ordinary character, an escaped backslash
  * followed by a real parenthesis could be scanned as a plain backslash
  * followed by an escaped parenthesis, hiding the parenthesis from the
  * nesting count kept in stringParenDepth.
  */
ButParentheses ([^()\\]|(\\(.|\n)))*
87 %option c++
88 %option noyywrap
90 %x StringState
94 {PlainInteger} {
95         char * end;
96         pdflval.plainInt = strtol( yytext, & end, 10 );
97         return T_Int;
100 {Float} {
101         char * end;
102         pdflval.pdfObj = new PDF_Float( strtod( yytext, & end ) );
103         return T_Constant;
106 {HexString} {
107         /* Note that we don't parse the internal meaning of the bracketed contents, since this
108          * would only require more work when writing back to a pdf file.
109          */
110         /*
111         const char * src( yytext + 1 );
112         char c1;
113         char c2;
114         string str;
116         for( ; isblank( *src ); ++src )
117                 ;
118         c1 = *src;
119         ++src;
120         while( c1 != '>' )
121                 {
122                         for( ; isblank( *src ); ++src )
123                                 ;
124                         c2 = *src;
125                         ++src;
126                         if( c2 == '>' )
127                                 {
128                                         str += hexToChar( c1, 0 );
129                                         break;
130                                 }
131                         str += hexToChar( c1, c2 );
132                         for( ; isblank( *src ); ++src )
133                                 ;
134                         c1 = *src;
135                         ++src;
136                 }
138         pdflval.pdfObj = new PDF_String( str );
139         */
140         pdflval.pdfObj = new PDF_HexString( yytext );
141         return T_Constant;
144 <INITIAL>[\(] {
145         stringParenDepth = 1;
146         BEGIN( StringState );
147         return yytext[0];
149 <StringState>[\(] {
150         ++stringParenDepth;
151         pdflval.str = strdup( yytext );
152         return T_String;
154 <StringState>[\)] {
155         --stringParenDepth;
156         if( stringParenDepth > 0 )
157                 {
158                         pdflval.str = strdup( yytext );
159                         return T_String;
160                 }
161         else
162                 {
163                         BEGIN( INITIAL );
164                         return yytext[ 0 ];
165                 }
168 <StringState>{ButParentheses} {
169         /* Note that we don't parse the internal meaning of escape sequences within the string, since this
170          * would only require more work when writing back to a pdf file.
171          */
172         /*
173         char * res( new char[ pdfleng + 1 ] );
174         char * dst( res.getPtr( ) );
175         const char * src( yytext );
176         char c;
177         while( true )
178                 {
179                         c = *(src++);
180                         if( c == '\\' )
181                                 {
182                                         char c1( *(src++) );
183                                         switch( c1 )
184                                                 {
185                                                 case 'n':
186                                                         *(dst++) = '\n';
187                                                         break;
188                                                 case 'r':
189                                                         *(dst++) = '\r';
190                                                         break;
191                                                 case 't':
192                                                         *(dst++) = '\t';
193                                                         break;
194                                                 case 'b':
195                                                         *(dst++) = '\b';
196                                                         break;
197                                                 case 'f':
198                                                         *(dst++) = '\f';
199                                                         break;
200                                                 case '(':
201                                                         *(dst++) = '(';
202                                                         break;
203                                                 case ')':
204                                                         *(dst++) = ')';
205                                                         break;
206                                                 case '\\':
207                                                         *(dst++) = '\\';
208                                                         break;
209                                                 default:
210                                                         if( isdigit( c1 ) )
211                                                                 {
212                                                                         char c2;
213                                                                         isPtr->get( c2 );
214                                                                         if( isdigit( c2 ) )
215                                                                                 {
216                                                                                         char c3;
217                                                                                         isPtr->get( c3 );
218                                                                                         if( isdigit( c3 ) )
219                                                                                                 {
220                                                                                                         *(dst++) = octalToChar( c1, c2, c3 );
221                                                                                                 }
222                                                                                         else
223                                                                                                 {
224                                                                                                         *(dst++) = octalToChar( 0, c1, c2 );
225                                                                                                         *(dst++) = c3;
226                                                                                                 }
227                                                                                 }
228                                                                         else
229                                                                                 {
230                                                                                         *(dst++) = octalToChar( 0, 0, c1 );
231                                                                                         *(dst++) = c2;
232                                                                                 }
233                                                                 }
234                                                         else
235                                                                 {
236                                                                         *(dst++) = c1;
237                                                                 }
238                                                         break;
239                                                 }
240                                 }
241                         else
242                                 {
243                                         *(dst++) = c;
244                                 }
245                 }
247  done:
248         *dst = '\0';
249         pdflval.str = res;
250         */
251         pdflval.str = strdup( yytext );
252         return T_String;
255 true { pdflval.pdfObj = new PDF_Boolean( true ); return T_Constant; }
256 false { pdflval.pdfObj = new PDF_Boolean( false ); return T_Constant; }
257 null { pdflval.pdfObj = new PDF_Null( ); return T_Constant; }
258 obj { return T_obj; }
259 endobj { return T_endobj; }
260 stream("\r\n"|"\n") { return T_stream; }
261 endstream { return T_endstream; }
262 R { return T_R; }
264 "<<" { return T_OpenDic; }
265 ">>" { return T_CloseDic; }
267 [\[\]] {
268         return yytext[0];
271 {Name} {
272         /* Note that we don't parse the internal meaning of #-sequences within the name, since this
273          * would only require more work when writing back to a pdf file.
274          */
275         /*
276         const char * src( yytext + 1 );
277         RefCountPtr< char > dstMem( new char[ pdflength + 1 ] );
278         char * dst( dstMem.getPtr( ) );
279         char c;
280         for( c = *(src++); ! isblank( c ); c = *(src++) )
281                 {
282                         if( c == '#' )
283                                 {
284                                         char c1;
285                                         char c2;
286                                         c1 = *(src++);
287                                         c2 = *(src++);
288                                         (dst++) = hexToChar( c1, c2 );
289                                 }
290                         else
291                                 {
292                                         (dst++) = c;
293                                 }
294                 }
295         *dst = '\0';
296         pdflval.pdfObj = new PDF_Name( dstMem.getPtr( ) );
297         */
298         pdflval.pdfObj = new PDF_Name( yytext + 1 );
299         return T_Name;
302 <INITIAL>[ \t\n\r]+ ;
304 . { throw( "Page scanner found unrecognized token" ); }
307 /* The closing %% above marks the end of the Rules section and the beginning
308  * of the User Subroutines section. All text from here to the end of the
309  * file is copied verbatim to the end of the generated lex.pdf.c file.
310  * This section is where you put definitions of helper functions.
311  */
/* Returns the numeric value (0..15) of the hexadecimal digit c.
 * Any character that is not a hexadecimal digit counts as zero.  In
 * particular, a NUL passed in place of a missing digit counts as zero.
 */
static int hexDigitValue( char c )
{
	if( '0' <= c && c <= '9' )
		{
			return c - '0';
		}
	if( 'A' <= c && c <= 'F' )
		{
			return c - 'A' + 10;
		}
	if( 'a' <= c && c <= 'f' )
		{
			return c - 'a' + 10;
		}
	return 0;
}

/* Combines two hexadecimal digits into one byte.
 *
 * c1 is the high-order digit and c2 the low-order digit; both upper and
 * lower case letters are accepted.  A digit that is missing (passed as 0)
 * or that is not a hexadecimal digit contributes zero, so that the odd
 * final digit of a hex string can be converted with hexToChar( c1, 0 ).
 */
unsigned char hexToChar( char c1, char c2 )
{
	/* Each digit value is at most 15, so the sum always fits in a byte. */
	return 16 * hexDigitValue( c1 ) + hexDigitValue( c2 );
}
/* Returns the numeric value (0..7) of the octal digit c.
 * Any other character counts as zero.  In particular, a NUL passed in place
 * of an absent leading digit counts as zero.
 */
static int octalDigitValue( char c )
{
	if( '0' <= c && c <= '7' )
		{
			return c - '0';
		}
	return 0;
}

/* Combines up to three octal digits into one byte.
 *
 * c1 is the most significant digit and c3 the least significant.  A digit
 * that is absent (passed as 0) contributes zero, so a one- or two-digit
 * escape can be converted by padding on the left with 0.  Values above 255
 * keep only their low eight bits.
 */
unsigned char octalToChar( char c1, char c2, char c3 )
{
	const int value = 64 * octalDigitValue( c1 ) + 8 * octalDigitValue( c2 ) + octalDigitValue( c3 );
	/* Three octal digits can encode up to 511; drop the high-order overflow. */
	return value & 0xFF;
}