rtf: add parser function
[siplcs.git] / src / core / sipe-rtf.l
blob4df67fa97d1e290b05741c1cd3443b39bcd7db79
1 /************************ tell Emacs this is a -*-C-*- file *************
2  * @file sipe-rtf.l
3  *
4  * pidgin-sipe
5  *
6  * Copyright (C) 2018 SIPE Project <http://sipe.sourceforge.net/>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21  *
22  *
23  * Simple parser to extract plain text from RTF and transform it to HTML
24  */
26 /* generated module */
27 %option outfile="sipe-rtf.c"
28 %option prefix="sipe_rtf_lexer_"
30 /* flex configuration options */
31 %option 8bit
32 %option bison-bridge
33 %option full
34 %option nodefault
35 %option noinput
36 %option nointeractive
37 %option pointer
38 %option reentrant
39 %option nounput
40 %option warn
41 %option noyyalloc
42 %option noyyfree
43 %option noyyrealloc
44 %option noyywrap
47 #include <glib.h>
49 #include "sipe-common.h"
50 #include "sipe-backend.h"
51 #include "sipe-rtf.h"
/*
 * small string buffer to avoid memory allocations
 *
 * Size is made up of:
 *
 *   3   length of the longest interesting keyword, currently "par"
 * + 1   one extra character: a longer keyword such as "pard" is then
 *       copied as "pard" instead of being truncated to something that
 *       compares equal to "par"
 * + 1   terminating NUL
 *
 * The expression is parenthesized so that the macro expands correctly
 * inside any surrounding expression, not only as an array size.
 */
#define SIPE_RTF_LEXER_KEYWORD_SIZE (3 + 1 + 1)
61 /* lexer token value type: values the lexer rules store for the parser */
62 struct parser_lval_type {
63         guint number; /* numeric parameter, stored when KEYWORD_PARAMETER is returned */
64         gchar keyword_buffer[SIPE_RTF_LEXER_KEYWORD_SIZE]; /* bounded copy of the keyword name, stored when KEYWORD is returned */
66 #define YYSTYPE struct parser_lval_type
68 /* lexer tokens returned by the scanner rules */
69 #define KEYWORD           256 /* backslash followed by letters             */
70 #define KEYWORD_PARAMETER 257 /* digits directly following a keyword       */
71 #define KEYWORD_END       258 /* keyword finished, scanner back in INITIAL */
72 #define LEXER_ERROR       259 /* input that no other rule matched          */
74 /* parser state shared between the scanner actions and the parser */
75 struct parser_state {
76         GString                 *text; /* output buffer the extracted text is appended to */
77         guint                    unicode_ignore_length; /* parameter of the last "uc" keyword */
78         guint                    ignore; /* characters still to be dropped after a unicode sequence */
79         struct parser_lval_type  lval; /* token value storage handed to the lexer */
82 static void sipe_rtf_add_char(struct parser_state *state, gchar c);
83 static void sipe_rtf_add_text(struct parser_state *state, const gchar *text);
84 static void sipe_rtf_add_unichar(struct parser_state *state, gunichar c);
87 DIGIT  [0-9]
88 HEX    [0-9a-fA-F]
89 LETTER [a-zA-Z]
91 %x RTF_KEYWORD
94                       /* stuff that we simply throw away   */
95 [\r\n]+               /* line endings                      */
96 "{"\\\*[^}]*"}"       /* comments                          */
97                       /* font definitions                  */
98 "{"\\f{DIGIT}+\\[^;]+;"}"
99 "{"                   /* section start                     */
100 "}"                   /* section end                       */
102                       /* pass plain text to output buffer  */
103                       /* escaped special characters        */
104 \\\\                  { sipe_rtf_add_char(yyextra, '\\'); }
105 \\"{"                 { sipe_rtf_add_char(yyextra, '{');  }
106 \\"}"                 { sipe_rtf_add_char(yyextra, '}');  }
107 \\\'{HEX}{2}          { /* 2 digit hex to 8-bit character  */
108                         unsigned int c;
109                         sscanf(yytext + 2, "%x", &c);
110                         sipe_rtf_add_char(yyextra, c);
111                       }
112 \\u{DIGIT}+           { /* Unicode character               */
113                         gunichar c;
114                         sscanf(yytext + 2, "%d", &c);
115                         sipe_rtf_add_unichar(yyextra, c);
116                       }
117                       /* all other plain text              */
118 [^{}\\\n\r]+          { sipe_rtf_add_text(yyextra, yytext); }
120                       /* stuff passed to parser for further processing */
121 \\{LETTER}+           {
122                         BEGIN(RTF_KEYWORD);
123                         g_strlcpy(yylval->keyword_buffer,
124                                   yytext + 1,
125                                   SIPE_RTF_LEXER_KEYWORD_SIZE);
126                         return(KEYWORD);
127                       }
128 <RTF_KEYWORD>{DIGIT}+     {
129                         sscanf(yytext, "%d", &yylval->number);
130                         return(KEYWORD_PARAMETER);
131                       }
132 <RTF_KEYWORD>(;|[^0-9][^;\\]*;|[ ])? {
133                         /* reset <keyword start condition and throw away */
134                         BEGIN(INITIAL);
135                         return(KEYWORD_END);
136                       }
137 <RTF_KEYWORD>.|\n     { /* reset <RTF_KEYWORD> start condition */
138                         BEGIN(INITIAL);
139                         yyless(0);
140                         return(KEYWORD_END);
141                       }
143                       /* indicate anything else as error to parser */
144 <INITIAL,RTF_KEYWORD>.|\n {
145                         return(LEXER_ERROR);
146                       }
149 /* memory allocation for flex code */
150 void *sipe_rtf_lexer_alloc(yy_size_t size,
151                            SIPE_UNUSED_PARAMETER yyscan_t yyscanner)
153         return g_malloc(size);
156 void *sipe_rtf_lexer_realloc(void *ptr, yy_size_t size,
157                              SIPE_UNUSED_PARAMETER yyscan_t yyscanner)
159         return g_realloc(ptr, size);
162 void sipe_rtf_lexer_free(void *ptr,
163                          SIPE_UNUSED_PARAMETER yyscan_t yyscanner)
165         g_free(ptr);
168 /* add text to buffer */
169 static void sipe_rtf_add_char(struct parser_state *state, gchar c)
171   /* ignored characters after unicode sequence */
172   if (state->ignore) {
173     state->ignore--;
174   } else {
175     g_string_append_c(state->text, c);
176   }
179 static void sipe_rtf_add_text(struct parser_state *state, const gchar *text)
181   /* ignored characters after unicode sequence */
182   if (state->ignore) {
183     while (*text && state->ignore--) text++;
184   }
185   if (!*text)
186     return;
188   /* add the remainder to the text buffer */
189   g_string_append(state->text, text);
192 static void sipe_rtf_add_unichar(struct parser_state *state, gunichar c)
194   /* ignored characters after unicode sequence */
195   state->ignore = state->unicode_ignore_length;
197   g_string_append_unichar(state->text, c);
200 static void sipe_rtf_parse_keyword(struct parser_state *state,
201                                    const gchar *keyword) {
202         if (strcmp(keyword, "par") == 0) {
203                 sipe_rtf_add_text(state, "<br/>");
204         }
207 static void sipe_rtf_parse_keyword_parameter(struct parser_state *state,
208                                              const gchar *keyword,
209                                              unsigned int parameter) {
210         if (strcmp(keyword, "uc") == 0) {
211                 state->unicode_ignore_length = parameter;
212         }
215 /****************************************************************************
217  * RTF parser
219  * based on Bison parser
221  * %output  "sipe-rtf-parser.c"
222  * %defines "sipe-rtf-parser.h"
224  * %define api.pure   full
225  * %define api.prefix {sipe_rtf_parser_}
227  * %param       {yyscan_t scanner}
228  * %parse-param {struct parser_state *state}
230  * %{
231  * #include ...
232  * %}
234  * %union {
235  *   gchar keyword_buffer[SIPE_RTF_LEXER_KEYWORD_SIZE];
236  *   guint number;
237  * }
239  * %token <keyword_buffer> KEYWORD
240  * %token <number>         KEYWORD_PARAMETER
241  * %token                  KEYWORD_END
242  * %token                  LEXER_ERROR
244  * %%
245  * sequence:
246  *           %empty
247  *      | sequence KEYWORD KEYWORD_PARAMETER KEYWORD_END {
248  *          const char   *keyword   = $2;
249  *          unsigned int  parameter = $3;
250  *          sipe_rtf_parse_keyword_parameter(state, keyword, parameter);
251  *         }
252  *      | sequence KEYWORD KEYWORD_END {
253  *          const char   *keyword   = $2;
254  *          sipe_rtf_parse_keyword(state, keyword);
255  *         }
256  *         | sequence LEXER_ERROR {
257  *          yyerror(scanner, state, "lexer error");
258  *          YYERROR;
259  *         }
260  *         ;
261  * %%
262  */
263 static void sipe_rtf_parser_error(const gchar *msg)
265         SIPE_DEBUG_ERROR("sipe_rtf_parser_error: %s", msg);
268 static int sipe_rtf_parser_get_token(yyscan_t scanner,
269                                      struct parser_state *state,
270                                      gboolean required)
272         int token = sipe_rtf_lexer_lex(&state->lval, scanner);
274         if (required && (token < 1)) {
275                 sipe_rtf_parser_error("unexpected end of RTF");
276                 return -1;
277         }
279         return token;
282 static gboolean sipe_rtf_parser(yyscan_t scanner,
283                                 struct parser_state *state)
285         struct parser_lval_type *lval = &state->lval;
286         int token;
288         /* read tokens from parser until it returns EOF */
289         while (TRUE) {
290                 if ((token = sipe_rtf_parser_get_token(scanner,
291                                                        state,
292                                                        FALSE)) < 0)
293                         return TRUE;
295                 switch (token) {
296                 case 0: /* parse succeeded */
297                         return FALSE;
298                         break;
300                 case KEYWORD:
301                         {
302                                 const gchar *keyword = lval->keyword_buffer;
304                                 if ((token = sipe_rtf_parser_get_token(scanner,
305                                                                        state,
306                                                                        TRUE)) < 0)
307                                         return TRUE;
309                                 switch (token) {
310                                 case KEYWORD_END:
311                                         sipe_rtf_parse_keyword(state, keyword);
312                                         break;
314                                 case KEYWORD_PARAMETER:
315                                         {
316                                                 guint parameter = lval->number;
318                                                 if ((token = sipe_rtf_parser_get_token(scanner,
319                                                                                        state,
320                                                                                        TRUE)) < 0) {
321                                                         return TRUE;
322                                                 } else if (token == KEYWORD_END) {
323                                                         sipe_rtf_parse_keyword_parameter(state,
324                                                                                          keyword,
325                                                                                          parameter);
326                                                 } else {
327                                                         sipe_rtf_parser_error("unexpected token");
328                                                         return TRUE;
329                                                 }
330                                         }
331                                         break;
333                                 default:
334                                         sipe_rtf_parser_error("broken keyword");
335                                         return TRUE;
336                                         break;
337                                 }
338                         }
339                         break;
341                 default:
342                         sipe_rtf_parser_error("unexpected token");
343                         return TRUE;
344                 }
345         }
348 gchar *sipe_rtf_to_html(const gchar *rtf)
350         // @TODO
351         (void)rtf;
352         (void)sipe_rtf_parser;
353         return g_strdup("");
357   Local Variables:
358   mode: c
359   c-file-style: "bsd"
360   indent-tabs-mode: t
361   tab-width: 8
362   End: