libvaladoc/markupreader.vala

   1 /* markupreader.vala
   2  *
   3  * Copyright (C) 2008-2009  Jürg Billeter
   4  * Copyright (C) 2011       Florian Brosch
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
  19  *
  20  * Author:
  21  *      Jürg Billeter <j@bitron.ch>
  22  */
  23
  24
  25 /**
  26  * Simple reader for a subset of XML.
  27  */
  28 public class Valadoc.MarkupReader : Object {
  29         public string filename {
  30                 private set;
  31                 get;
  32         }
  33
  34         public string name {
  35                 private set;
  36                 get;
  37         }
  38
  39         public string content {
  40                 private set;
  41                 get;
  42         }
  43
  44         private MappedFile mapped_file;
  45
  46         private string[] lines;
  47         private char* begin;
  48         private char* current;
  49         private char* end;
  50
  51         private int line;
  52         private int column;
  53
  54         private Vala.Map<string, string> attributes = new Vala.HashMap<string, string> (str_hash, str_equal);
  55         private bool empty_element;
  56
  57         private ErrorReporter reporter;
  58
  59         public MarkupReader.from_string (string filename, string content, ErrorReporter reporter) {
  60                 this.filename = filename;
  61                 this.reporter = reporter;
  62
  63                 lines = content.split ("\n");
  64                 begin = content;
  65                 end = begin + content.length;
  66                 current = begin;
  67
  68                 column = 1;
  69                 line = 1;
  70         }
  71
  72         public MarkupReader (string filename, ErrorReporter reporter) {
  73                 this.filename = filename;
  74                 this.reporter = reporter;
  75
  76                 try {
  77                         mapped_file = new MappedFile (filename, false);
  78                         begin = mapped_file.get_contents ();
  79                         lines = ((string) begin).split ("\n");
  80                         end = begin + mapped_file.get_length ();
  81
  82                         current = begin;
  83
  84                         line = 1;
  85                         column = 1;
  86                 } catch (FileError e) {
  87                         reporter.simple_error (null, "Unable to map file '%s': %s", filename, e.message);
  88                 }
  89         }
  90
  91         public string? get_line_content (int line_nr) {
  92                 if (this.lines.length > line_nr) {
  93                         return this.lines[line_nr];
  94                 }
  95
  96                 return null;
  97         }
  98
  99         public string? get_attribute (string attr) {
 100                 return attributes[attr];
 101         }
 102
 103         /*
 104          * Returns a copy of the current attributes.
 105          *
 106          * @return map of current attributes
 107          */
 108         public Vala.Map<string,string> get_attributes () {
 109                 var result = new Vala.HashMap<string, string> (str_hash, str_equal);
 110                 foreach (var key in attributes.get_keys ()) {
 111                         result.set (key, attributes.get (key));
 112                 }
 113                 return result;
 114         }
 115
 116         private string read_name () {
 117                 char* begin = current;
 118                 while (current < end) {
 119                         if (current[0] == ' ' || current[0] == '\t' || current[0] == '>'
 120                             || current[0] == '/' || current[0] == '=' || current[0] == '\n') {
 121                                 break;
 122                         }
 123                         unichar u = ((string) current).get_char_validated ((long) (end - current));
 124                         if (u != (unichar) (-1)) {
 125                                 current += u.to_utf8 (null);
 126                         } else {
 127                                 reporter.simple_error ("%s:%d".printf (filename, line),
 128                                                                            "invalid UTF-8 character");
 129                         }
 130                 }
 131                 if (current == begin) {
 132                         // syntax error: invalid name
 133                 }
 134                 return ((string) begin).substring (0, (int) (current - begin));
 135         }
 136
 137         public MarkupTokenType read_token (out MarkupSourceLocation token_begin, out MarkupSourceLocation token_end) {
 138                 attributes.clear ();
 139
 140                 if (empty_element) {
 141                         empty_element = false;
 142                         token_begin = MarkupSourceLocation (begin, line, column);
 143                         token_end = MarkupSourceLocation (begin, line, column);
 144                         return MarkupTokenType.END_ELEMENT;
 145                 }
 146
 147                 content = null;
 148                 name = null;
 149
 150                 space ();
 151
 152                 MarkupTokenType type = MarkupTokenType.NONE;
 153                 char* begin = current;
 154                 token_begin = MarkupSourceLocation (begin, line, column);
 155
 156                 if (current >= end) {
 157                         type = MarkupTokenType.EOF;
 158                 } else if (current[0] == '<') {
 159                         current++;
 160                         if (current >= end) {
 161                                 // error
 162                         } else if (current[0] == '?') {
 163                                 // processing instruction
 164                         } else if (current[0] == '!') {
 165                                 // comment or doctype
 166                                 current++;
 167                                 if (current < end - 1 && current[0] == '-' && current[1] == '-') {
 168                                         // comment
 169                                         current += 2;
 170                                         while (current < end - 2) {
 171                                                 if (current[0] == '-' && current[1] == '-' && current[2] == '>') {
 172                                                         // end of comment
 173                                                         current += 3;
 174                                                         break;
 175                                                 } else if (current[0] == '\n') {
 176                                                         line++;
 177                                                         column = 0;
 178                                                 }
 179                                                 current++;
 180                                         }
 181
 182                                         // ignore comment, read next token
 183                                         return read_token (out token_begin, out token_end);
 184                                 }
 185                         } else if (current[0] == '/') {
 186                                 type = MarkupTokenType.END_ELEMENT;
 187                                 current++;
 188                                 name = read_name ();
 189                                 if (current >= end || current[0] != '>') {
 190                                         // error
 191                                 }
 192                                 current++;
 193                         } else {
 194                                 type = MarkupTokenType.START_ELEMENT;
 195                                 name = read_name ();
 196                                 space ();
 197                                 while (current < end && current[0] != '>' && current[0] != '/') {
 198                                         string attr_name = read_name ();
 199                                         if (current >= end || current[0] != '=') {
 200                                                 // error
 201                                         }
 202                                         current++;
 203                                         // FIXME allow single quotes
 204                                         if (current >= end || current[0] != '"') {
 205                                                 // error
 206                                         }
 207                                         current++;
 208
 209                                         string attr_value = text ('"', false);
 210
 211                                         if (current >= end || current[0] != '"') {
 212                                                 // error
 213                                         }
 214                                         current++;
 215                                         attributes.set (attr_name, attr_value);
 216                                         space ();
 217                                 }
 218                                 if (current[0] == '/') {
 219                                         empty_element = true;
 220                                         current++;
 221                                         space ();
 222                                 } else {
 223                                         empty_element = false;
 224                                 }
 225                                 if (current >= end || current[0] != '>') {
 226                                         // error
 227                                 }
 228                                 current++;
 229                         }
 230                 } else {
 231                         space ();
 232
 233                         if (current[0] != '<') {
 234                                 content = text ('<', true);
 235                         } else {
 236                                 // no text
 237                                 // read next token
 238                                 return read_token (out token_begin, out token_end);
 239                         }
 240
 241                         type = MarkupTokenType.TEXT;
 242                 }
 243
 244                 token_end = MarkupSourceLocation (current, line, column - 1);
 245
 246                 return type;
 247         }
 248
 249         private string text (char end_char, bool rm_trailing_whitespace) {
 250                 StringBuilder content = new StringBuilder ();
 251                 char* text_begin = current;
 252                 char* last_linebreak = current;
 253
 254                 while (current < end && current[0] != end_char) {
 255                         unichar u = ((string) current).get_char_validated ((long) (end - current));
 256                         if (u == (unichar) (-1)) {
 257                                 reporter.simple_error ("%s:%d".printf (filename, line),
 258                                                                            "invalid UTF-8 character");
 259                         } else if (u == '&') {
 260                                 char* next_pos = current + u.to_utf8 (null);
 261                                 if (((string) next_pos).has_prefix ("amp;")) {
 262                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 263                                         content.append_c ('&');
 264                                         current += 5;
 265                                         text_begin = current;
 266                                 } else if (((string) next_pos).has_prefix ("quot;")) {
 267                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 268                                         content.append_c ('"');
 269                                         current += 6;
 270                                         text_begin = current;
 271                                 } else if (((string) next_pos).has_prefix ("apos;")) {
 272                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 273                                         content.append_c ('\'');
 274                                         current += 6;
 275                                         text_begin = current;
 276                                 } else if (((string) next_pos).has_prefix ("lt;")) {
 277                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 278                                         content.append_c ('<');
 279                                         current += 4;
 280                                         text_begin = current;
 281                                 } else if (((string) next_pos).has_prefix ("gt;")) {
 282                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 283                                         content.append_c ('>');
 284                                         current += 4;
 285                                         text_begin = current;
 286                                 } else if (((string) next_pos).has_prefix ("percnt;")) {
 287                                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 288                                         content.append_c ('%');
 289                                         current += 8;
 290                                         text_begin = current;
 291                                 } else {
 292                                         current += u.to_utf8 (null);
 293                                 }
 294                         } else {
 295                                 if (u == '\n') {
 296                                         line++;
 297                                         column = 0;
 298                                         last_linebreak = current;
 299                                 }
 300
 301                                 current += u.to_utf8 (null);
 302                                 column++;
 303                         }
 304                 }
 305
 306                 if (text_begin != current) {
 307                         content.append (((string) text_begin).substring (0, (int) (current - text_begin)));
 308                 }
 309
 310                 column += (int) (current - last_linebreak);
 311
 312                 // Removes trailing whitespace
 313                 if (rm_trailing_whitespace) {
 314                         char* str_pos = ((char*)content.str) + content.len;
 315                         for (str_pos--; str_pos > ((char*)content.str) && str_pos[0].isspace(); str_pos--);
 316                         content.erase ((ssize_t) (str_pos-((char*) content.str) + 1), -1);
 317                 }
 318
 319                 return content.str;
 320         }
 321
 322         private void space () {
 323                 while (current < end && current[0].isspace ()) {
 324                         if (current[0] == '\n') {
 325                                 line++;
 326                                 column = 0;
 327                         }
 328                         current++;
 329                         column++;
 330                 }
 331         }
 332 }
 333
 334