source/pagescanner.ll

   1 /* This file is part of Shapes.
   2  *
   3  * Shapes is free software: you can redistribute it and/or modify
   4  * it under the terms of the GNU General Public License as published by
   5  * the Free Software Foundation, either version 3 of the License, or
   6  * any later version.
   7  *
   8  * Shapes is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License
  14  * along with Shapes.  If not, see <http://www.gnu.org/licenses/>.
  15  *
  16  * Copyright 2008, 2010 Henrik Tidefelt
  17  */
  18
  19 /*
  20  * File:        pagescanner.l
  21  * ----------------
  22  * Lex input file to generate the scanner for the scanning of length data stored in a page's content stream.
  23  */
  24
  25 %{
  26
  27 /* The text within this first region delimited by %{ and %} is assumed to
  28  * be C/C++ code and will be copied verbatim to the lex.pdf.c file ahead
  29  * of the definitions of the pdflex() function. Add other header file inclusions
  30  * or C++ variable declarations/prototypes that are needed by your code here.
  31  */
  32
  33 #include "pdfstructure.h"
  34 #include "pdfscanner.h"
  35 #include "pdfparser.tab.h"
  36
  37 #include <string.h>
  38 #include <iostream>
  39 #include <iomanip>
  40 #include <cstdio> // This is a workaround for a bug in Flex.
  41
  42 unsigned char hexToChar( char c1, char c2 );
  43 unsigned char octalToChar( char c1, char c2, char c3 );
  44
  45 size_t stringParenDepth;
  46
  47 /*
  48  * Global variable: pdflval
  49  * -----------------------
  50  * This global variable is how we get attribute information about the token
  51  * just scanned to the client. The scanner sets the global variable
  52  * appropriately and since it's global the client can just read it.     In the
  53  * future, this variable will be declared for us in the y.tab.c file
  54  * produced by Yacc, but for now, we declare it manually.
  55  */
  56 //YYSTYPE pdflval;      // manually declared for pp1, later Yacc provides
  57
  58 /*
  59  * Global variable: pdflloc
  60  * -----------------------
  61  * This global variable is how we get position information about the token
  62  * just scanned to the client. (Operates similarly to pdflval above)
  63  */
  64 //struct pdfltype pdflloc; // manually declared for pp1, later Yacc provides
  65
  66 %}
  67
  68  /*
  69         * The section before the first %% is the Definitions section of the lex
  70         * input file. Here is where you set options for the scanner, define lex
  71         * states, and can set up definitions to give names to regular expressions
  72         * as a simple substitution mechanism that allows for more readable
  73         * entries in the Rules section later.
  74         */
  75
  76 WhiteSpace               [ \t\n\r]
  77 NonWhiteSpace [^ \t\n\r]
  78
  79 PlainInteger [0-9]+
  80 DecInteger [+-]?[0-9]+
  81
  82 Float [+-]?[0-9]*[.][0-9]*
  83
  84 Name [/]{NonWhiteSpace}*
  85
  86 HexString [<]({WhiteSpace}|[0-9A-Fa-f])*[>]
  87 ButParentheses ([^()]|(\\(.|\n)))*
  88
  89 %option c++
  90 %option noyywrap
  91
  92 %x StringState
  93
  94 %%
  95
  96 {PlainInteger} {
  97         char * end;
  98         pdflval.plainInt = strtol( yytext, & end, 10 );
  99         return T_Int;
 100 }
 101
 102 {Float} {
 103         char * end;
 104         pdflval.pdfObj = new PDF_Float( strtod( yytext, & end ) );
 105         return T_Constant;
 106 }
 107
 108 {HexString} {
 109         /* Note that we don't parse the internal meaning of the bracketed contents, since this
 110          * would only require more work when writing back to a pdf file.
 111          */
 112         /*
 113         const char * src( yytext + 1 );
 114         char c1;
 115         char c2;
 116         string str;
 117
 118         for( ; isblank( *src ); ++src )
 119                 ;
 120         c1 = *src;
 121         ++src;
 122         while( c1 != '>' )
 123                 {
 124                         for( ; isblank( *src ); ++src )
 125                                 ;
 126                         c2 = *src;
 127                         ++src;
 128                         if( c2 == '>' )
 129                                 {
 130                                         str += hexToChar( c1, 0 );
 131                                         break;
 132                                 }
 133                         str += hexToChar( c1, c2 );
 134                         for( ; isblank( *src ); ++src )
 135                                 ;
 136                         c1 = *src;
 137                         ++src;
 138                 }
 139
 140         pdflval.pdfObj = new PDF_String( str );
 141         */
 142         pdflval.pdfObj = new PDF_HexString( yytext );
 143         return T_Constant;
 144 }
 145
 146 <INITIAL>[\(] {
 147         stringParenDepth = 1;
 148         BEGIN( StringState );
 149         return yytext[0];
 150 }
 151 <StringState>[\(] {
 152         ++stringParenDepth;
 153         pdflval.str = strdup( yytext );
 154         return T_String;
 155 }
 156 <StringState>[\)] {
 157         --stringParenDepth;
 158         if( stringParenDepth > 0 )
 159                 {
 160                         pdflval.str = strdup( yytext );
 161                         return T_String;
 162                 }
 163         else
 164                 {
 165                         BEGIN( INITIAL );
 166                         return yytext[ 0 ];
 167                 }
 168 }
 169
 170 <StringState>{ButParentheses} {
 171         /* Note that we don't parse the internal meaning of escape sequences within the string, since this
 172          * would only require more work when writing back to a pdf file.
 173          */
 174         /*
 175         char * res( new char[ pdfleng + 1 ] );
 176         char * dst( res.getPtr( ) );
 177         const char * src( yytext );
 178         char c;
 179         while( true )
 180                 {
 181                         c = *(src++);
 182                         if( c == '\\' )
 183                                 {
 184                                         char c1( *(src++) );
 185                                         switch( c1 )
 186                                                 {
 187                                                 case 'n':
 188                                                         *(dst++) = '\n';
 189                                                         break;
 190                                                 case 'r':
 191                                                         *(dst++) = '\r';
 192                                                         break;
 193                                                 case 't':
 194                                                         *(dst++) = '\t';
 195                                                         break;
 196                                                 case 'b':
 197                                                         *(dst++) = '\b';
 198                                                         break;
 199                                                 case 'f':
 200                                                         *(dst++) = '\f';
 201                                                         break;
 202                                                 case '(':
 203                                                         *(dst++) = '(';
 204                                                         break;
 205                                                 case ')':
 206                                                         *(dst++) = ')';
 207                                                         break;
 208                                                 case '\\':
 209                                                         *(dst++) = '\\';
 210                                                         break;
 211                                                 default:
 212                                                         if( isdigit( c1 ) )
 213                                                                 {
 214                                                                         char c2;
 215                                                                         isPtr->get( c2 );
 216                                                                         if( isdigit( c2 ) )
 217                                                                                 {
 218                                                                                         char c3;
 219                                                                                         isPtr->get( c3 );
 220                                                                                         if( isdigit( c3 ) )
 221                                                                                                 {
 222                                                                                                         *(dst++) = octalToChar( c1, c2, c3 );
 223                                                                                                 }
 224                                                                                         else
 225                                                                                                 {
 226                                                                                                         *(dst++) = octalToChar( 0, c1, c2 );
 227                                                                                                         *(dst++) = c3;
 228                                                                                                 }
 229                                                                                 }
 230                                                                         else
 231                                                                                 {
 232                                                                                         *(dst++) = octalToChar( 0, 0, c1 );
 233                                                                                         *(dst++) = c2;
 234                                                                                 }
 235                                                                 }
 236                                                         else
 237                                                                 {
 238                                                                         *(dst++) = c1;
 239                                                                 }
 240                                                         break;
 241                                                 }
 242                                 }
 243                         else
 244                                 {
 245                                         *(dst++) = c;
 246                                 }
 247                 }
 248
 249  done:
 250         *dst = '\0';
 251         pdflval.str = res;
 252         */
 253         pdflval.str = strdup( yytext );
 254         return T_String;
 255 }
 256 ;;
 257 true { pdflval.pdfObj = new PDF_Boolean( true ); return T_Constant; }
 258 false { pdflval.pdfObj = new PDF_Boolean( false ); return T_Constant; }
 259 null { pdflval.pdfObj = new PDF_Null( ); return T_Constant; }
 260 obj { return T_obj; }
 261 endobj { return T_endobj; }
 262 stream("\r\n"|"\n") { return T_stream; }
 263 endstream { return T_endstream; }
 264 R { return T_R; }
 265
 266 "<<" { return T_OpenDic; }
 267 ">>" { return T_CloseDic; }
 268
 269 [\[\]] {
 270         return yytext[0];
 271 }
 272
 273 {Name} {
 274         /* Note that we don't parse the internal meaning of #-sequences within the name, since this
 275          * would only require more work when writing back to a pdf file.
 276          */
 277         /*
 278         const char * src( yytext + 1 );
 279         RefCountPtr< char > dstMem( new char[ pdflength + 1 ] );
 280         char * dst( dstMem.getPtr( ) );
 281         char c;
 282         for( c = *(src++); ! isblank( c ); c = *(src++) )
 283                 {
 284                         if( c == '#' )
 285                                 {
 286                                         char c1;
 287                                         char c2;
 288                                         c1 = *(src++);
 289                                         c2 = *(src++);
 290                                         (dst++) = hexToChar( c1, c2 );
 291                                 }
 292                         else
 293                                 {
 294                                         (dst++) = c;
 295                                 }
 296                 }
 297         *dst = '\0';
 298         pdflval.pdfObj = new PDF_Name( dstMem.getPtr( ) );
 299         */
 300         pdflval.pdfObj = new PDF_Name( yytext + 1 );
 301         return T_Name;
 302 }
 303
 304 <INITIAL>[ \t\n\r]+ ;
 305
 306 . { throw( "Page scanner found unrecognized token" ); }
 307
 308 %%
 309 /* The closing %% above marks the end of the Rules section and the beginning
 310  * of the User Subroutines section. All text from here to the end of the
 311  * file is copied verbatim to the end of the generated lex.pdf.c file.
 312  * This section is where you put definitions of helper functions.
 313  */
 314
 315
 316 unsigned char hexToChar( char c1, char c2 )
 317 {
 318         unsigned char res( 0 );
 319         if( c1 < 'A' )
 320                 {
 321                         res += 16 * static_cast< unsigned char >( c1 - '0' );
 322                 }
 323         else if( c1 < 'a' )
 324                 {
 325                         res += 16 * static_cast< unsigned char >( c1 - 'A' + 10 );
 326                 }
 327         else
 328                 {
 329                         res += 16 * static_cast< unsigned char >( c1 - 'a' + 10 );
 330                 }
 331
 332         if( c2 < 'A' )
 333                 {
 334                         res += static_cast< unsigned char >( c2 - '0' );
 335                 }
 336         else if( c2 < 'a' )
 337                 {
 338                         res += static_cast< unsigned char >( c2 - 'A' + 10 );
 339                 }
 340         else
 341                 {
 342                         res += static_cast< unsigned char >( c2 - 'a' + 10 );
 343                 }
 344
 345                 return res;
 346 }
 347
 348 unsigned char octalToChar( char c1, char c2, char c3 )
 349 {
 350         return 64 * static_cast< unsigned char >( c1 - '0' ) + 8 * static_cast< unsigned char >( c2 - '0' ) + static_cast< unsigned char >( c3 - '0' );
 351 }