source/pagescanner.ll

   1 /* This file is part of Shapes.
   2  *
   3  * Shapes is free software: you can redistribute it and/or modify
   4  * it under the terms of the GNU General Public License as published by
   5  * the Free Software Foundation, either version 3 of the License, or
   6  * any later version.
   7  *
   8  * Shapes is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License
  14  * along with Shapes.  If not, see <http://www.gnu.org/licenses/>.
  15  *
  16  * Copyright 2008 Henrik Tidefelt
  17  */
  18
/*
 * File:        pagescanner.l
 * ----------------
 * Lex input file used to generate the scanner for the length data stored in a page's content stream.
 */
  24
  25 %{
  26
  27 /* The text within this first region delimited by %{ and %} is assumed to
  28  * be C/C++ code and will be copied verbatim to the lex.pdf.c file ahead
  29  * of the definitions of the pdflex() function. Add other header file inclusions
  30  * or C++ variable declarations/prototypes that are needed by your code here.
  31  */
  32
  33 #include "pdfstructure.h"
  34 #include "pdfscanner.h"
  35 #include "pdfparser.tab.h"
  36 #include <string.h>
  37 #include <iostream>
  38 #include <iomanip>
  39
  40 unsigned char hexToChar( char c1, char c2 );
  41 unsigned char octalToChar( char c1, char c2, char c3 );
  42
  43 size_t stringParenDepth;
  44
/*
 * Global variable: pdflval
 * -----------------------
 * This global variable is how attribute information about the token just
 * scanned reaches the client: the scanner sets it, and since it is global
 * the client can simply read it. The manual declaration below is commented
 * out, so the variable has to be declared elsewhere (presumably by the
 * Yacc-generated parser).
 */
//YYSTYPE pdflval;      // manually declared for pp1, later Yacc provides
  55
/*
 * Global variable: pdflloc
 * -----------------------
 * This global variable is how position information about the token just
 * scanned reaches the client. (It operates similarly to pdflval above.)
 */
//struct pdfltype pdflloc; // manually declared for pp1, later Yacc provides
  63
  64 %}
  65
  66  /*
  67         * The section before the first %% is the Definitions section of the lex
  68         * input file. Here is where you set options for the scanner, define lex
  69         * states, and can set up definitions to give names to regular expressions
  70         * as a simple substitution mechanism that allows for more readable
  71         * entries in the Rules section later.
  72         */
  73
  74 WhiteSpace               [ \t\n\r]
  75 NonWhiteSpace [^ \t\n\r]
  76
  77 PlainInteger [0-9]+
  78 DecInteger [+-]?[0-9]+
  79
  80 Float [+-]?[0-9]*[.][0-9]*
  81
  82 Name [/]{NonWhiteSpace}*
  83
  84 HexString [<]({WhiteSpace}|[0-9A-Fa-f])*[>]
  85 ButParentheses ([^()]|(\\(.|\n)))*
  86
  87 %option c++
  88 %option noyywrap
  89
  90 %x StringState
  91
  92 %%
  93
  94 {PlainInteger} {
  95         char * end;
  96         pdflval.plainInt = strtol( yytext, & end, 10 );
  97         return T_Int;
  98 }
  99
 100 {Float} {
 101         char * end;
 102         pdflval.pdfObj = new PDF_Float( strtod( yytext, & end ) );
 103         return T_Constant;
 104 }
 105
 106 {HexString} {
 107         /* Note that we don't parse the internal meaning of the bracketed contents, since this
 108          * would only require more work when writing back to a pdf file.
 109          */
 110         /*
 111         const char * src( yytext + 1 );
 112         char c1;
 113         char c2;
 114         string str;
 115
 116         for( ; isblank( *src ); ++src )
 117                 ;
 118         c1 = *src;
 119         ++src;
 120         while( c1 != '>' )
 121                 {
 122                         for( ; isblank( *src ); ++src )
 123                                 ;
 124                         c2 = *src;
 125                         ++src;
 126                         if( c2 == '>' )
 127                                 {
 128                                         str += hexToChar( c1, 0 );
 129                                         break;
 130                                 }
 131                         str += hexToChar( c1, c2 );
 132                         for( ; isblank( *src ); ++src )
 133                                 ;
 134                         c1 = *src;
 135                         ++src;
 136                 }
 137
 138         pdflval.pdfObj = new PDF_String( str );
 139         */
 140         pdflval.pdfObj = new PDF_HexString( yytext );
 141         return T_Constant;
 142 }
 143
 144 <INITIAL>[\(] {
 145         stringParenDepth = 1;
 146         BEGIN( StringState );
 147         return yytext[0];
 148 }
 149 <StringState>[\(] {
 150         ++stringParenDepth;
 151         pdflval.str = strdup( yytext );
 152         return T_String;
 153 }
 154 <StringState>[\)] {
 155         --stringParenDepth;
 156         if( stringParenDepth > 0 )
 157                 {
 158                         pdflval.str = strdup( yytext );
 159                         return T_String;
 160                 }
 161         else
 162                 {
 163                         BEGIN( INITIAL );
 164                         return yytext[ 0 ];
 165                 }
 166 }
 167
 168 <StringState>{ButParentheses} {
 169         /* Note that we don't parse the internal meaning of escape sequences within the name, since this
 170          * would only require more work when writing back to a pdf file.
 171          */
 172         /*
 173         char * res( new char[ pdfleng + 1 ] );
 174         char * dst( res.getPtr( ) );
 175         const char * src( yytext );
 176         char c;
 177         while( true )
 178                 {
 179                         c = *(src++);
 180                         if( c == '\\' )
 181                                 {
 182                                         char c1( *(src++) );
 183                                         switch( c1 )
 184                                                 {
 185                                                 case 'n':
 186                                                         *(dst++) = '\n';
 187                                                         break;
 188                                                 case 'r':
 189                                                         *(dst++) = '\r';
 190                                                         break;
 191                                                 case 't':
 192                                                         *(dst++) = '\t';
 193                                                         break;
 194                                                 case 'b':
 195                                                         *(dst++) = '\b';
 196                                                         break;
 197                                                 case 'f':
 198                                                         *(dst++) = '\f';
 199                                                         break;
 200                                                 case '(':
 201                                                         *(dst++) = '(';
 202                                                         break;
 203                                                 case ')':
 204                                                         *(dst++) = ')';
 205                                                         break;
 206                                                 case '\\':
 207                                                         *(dst++) = '\\';
 208                                                         break;
 209                                                 default:
 210                                                         if( isdigit( c1 ) )
 211                                                                 {
 212                                                                         char c2;
 213                                                                         isPtr->get( c2 );
 214                                                                         if( isdigit( c2 ) )
 215                                                                                 {
 216                                                                                         char c3;
 217                                                                                         isPtr->get( c3 );
 218                                                                                         if( isdigit( c3 ) )
 219                                                                                                 {
 220                                                                                                         *(dst++) = octalToChar( c1, c2, c3 );
 221                                                                                                 }
 222                                                                                         else
 223                                                                                                 {
 224                                                                                                         *(dst++) = octalToChar( 0, c1, c2 );
 225                                                                                                         *(dst++) = c3;
 226                                                                                                 }
 227                                                                                 }
 228                                                                         else
 229                                                                                 {
 230                                                                                         *(dst++) = octalToChar( 0, 0, c1 );
 231                                                                                         *(dst++) = c2;
 232                                                                                 }
 233                                                                 }
 234                                                         else
 235                                                                 {
 236                                                                         *(dst++) = c1;
 237                                                                 }
 238                                                         break;
 239                                                 }
 240                                 }
 241                         else
 242                                 {
 243                                         *(dst++) = c;
 244                                 }
 245                 }
 246
 247  done:
 248         *dst = '\0';
 249         pdflval.str = res;
 250         */
 251         pdflval.str = strdup( yytext );
 252         return T_String;
 253 }
 254 ;;
 255 true { pdflval.pdfObj = new PDF_Boolean( true ); return T_Constant; }
 256 false { pdflval.pdfObj = new PDF_Boolean( false ); return T_Constant; }
 257 null { pdflval.pdfObj = new PDF_Null( ); return T_Constant; }
 258 obj { return T_obj; }
 259 endobj { return T_endobj; }
 260 stream("\r\n"|"\n") { return T_stream; }
 261 endstream { return T_endstream; }
 262 R { return T_R; }
 263
 264 "<<" { return T_OpenDic; }
 265 ">>" { return T_CloseDic; }
 266
 267 [\[\]] {
 268         return yytext[0];
 269 }
 270
 271 {Name} {
 272         /* Note that we don't parse the internal meaning of #-sequences within the name, since this
 273          * would only require more work when writing back to a pdf file.
 274          */
 275         /*
 276         const char * src( yytext + 1 );
 277         RefCountPtr< char > dstMem( new char[ pdflength + 1 ] );
 278         char * dst( dstMem.getPtr( ) );
 279         char c;
 280         for( c = *(src++); ! isblank( c ); c = *(src++) )
 281                 {
 282                         if( c == '#' )
 283                                 {
 284                                         char c1;
 285                                         char c2;
 286                                         c1 = *(src++);
 287                                         c2 = *(src++);
 288                                         (dst++) = hexToChar( c1, c2 );
 289                                 }
 290                         else
 291                                 {
 292                                         (dst++) = c;
 293                                 }
 294                 }
 295         *dst = '\0';
 296         pdflval.pdfObj = new PDF_Name( dstMem.getPtr( ) );
 297         */
 298         pdflval.pdfObj = new PDF_Name( yytext + 1 );
 299         return T_Name;
 300 }
 301
 302 <INITIAL>[ \t\n\r]+ ;
 303
 304 . { throw( "Page scanner found unrecognized token" ); }
 305
 306 %%
 307 /* The closing %% above marks the end of the Rules section and the beginning
 308  * of the User Subroutines section. All text from here to the end of the
 309  * file is copied verbatim to the end of the generated lex.pdf.c file.
 310  * This section is where you put definitions of helper functions.
 311  */
 312
 313
 314 unsigned char hexToChar( char c1, char c2 )
 315 {
 316         unsigned char res( 0 );
 317         if( c1 < 'A' )
 318                 {
 319                         res += 16 * static_cast< unsigned char >( c1 - '0' );
 320                 }
 321         else if( c1 < 'a' )
 322                 {
 323                         res += 16 * static_cast< unsigned char >( c1 - 'A' + 10 );
 324                 }
 325         else
 326                 {
 327                         res += 16 * static_cast< unsigned char >( c1 - 'a' + 10 );
 328                 }
 329
 330         if( c2 < 'A' )
 331                 {
 332                         res += static_cast< unsigned char >( c2 - '0' );
 333                 }
 334         else if( c2 < 'a' )
 335                 {
 336                         res += static_cast< unsigned char >( c2 - 'A' + 10 );
 337                 }
 338         else
 339                 {
 340                         res += static_cast< unsigned char >( c2 - 'a' + 10 );
 341                 }
 342
 343                 return res;
 344 }
 345
 346 unsigned char octalToChar( char c1, char c2, char c3 )
 347 {
 348         return 64 * static_cast< unsigned char >( c1 - '0' ) + 8 * static_cast< unsigned char >( c2 - '0' ) + static_cast< unsigned char >( c3 - '0' );
 349 }