lexer/lexer.cpp

   1 #include <fridh/lexer.hpp>
   2 #include <ail/array.hpp>
   3 #include <ail/string.hpp>
   4 #include <boost/foreach.hpp>
   5
   6 namespace fridh
   7 {
   8         boost::mutex table_mutex; // NOTE(review): never locked in the code visible here; presumably guards the tables built by initialise_tables() - confirm
   9
  10         line_of_code::line_of_code():
  11                 indentation_level(0)
  12         {
  13         }
  14
  15         operator_lexeme::operator_lexeme(lexeme_type::type lexeme, std::string const & string):
  16                 lexeme(lexeme),
  17                 string(string)
  18         {
  19         }
  20
  21         bool operator_lexeme::operator<(operator_lexeme const & other) const
  22         {
  23                 return string.length() > other.string.length();
  24         }
  25
  26         lexer::lexer(std::string const & input, lines_of_code & lines):
  27                 input(input),
  28                 lines(lines)
  29         {
  30         }
  31
  32         bool lexer::parse_operator(line_of_code & output)
  33         {
  34                 std::size_t remaining_characters = end - i;
  35
  36                 BOOST_FOREACH(operator_lexeme & current_lexeme, operator_lexeme_data)
  37                 {
  38                         std::size_t operator_length = current_lexeme.string.size();
  39                         if(remaining_characters < operator_length)
  40                                 return false;
  41
  42                         std::string substring = input.substr(i, operator_length);
  43
  44                         if(substring == current_lexeme.string)
  45                         {
  46                                 output.lexemes.push_back(current_lexeme.lexeme);
  47                                 i += operator_length;
  48                                 return true;
  49                         }
  50                 }
  51                 return false;
  52         }
  53
  54         void lexer::lexer_error(std::string const & message, uword error_line)
  55         {
  56                 if(error_line == 0)
  57                         error_line = line;
  58                 throw ail::exception("Lexer error: Line " + ail::number_to_string<uword>(error_line) + ": " + message);
  59         }
  60
  61         void lexer::number_parsing_error(std::string const & message)
  62         {
  63                 lexer_error(message);
  64         }
  65
  66         bool lexer::is_name_char(char input)
  67         {
  68                 return ail::is_alpha(input) || ail::is_digit(input) || input == '_';
  69         }
  70
  71         void lexer::parse_name(line_of_code & output)
  72         {
  73                 std::size_t start = i;
  74                 for(i++; i < end && is_name_char(input[i]); i++);
  75                 std::string name = input.substr(start, i - start);
  76
  77                 lexeme current_lexeme;
  78                 if(name == "true")
  79                         current_lexeme = lexeme(true);
  80                 else if(name == "false")
  81                         current_lexeme = lexeme(false);
  82                 else if(name == "nil")
  83                         current_lexeme.type = lexeme_type::nil;
  84                 else
  85                 {
  86                         current_lexeme = lexeme(name);
  87                         current_lexeme.type = lexeme_type::name;
  88                 }
  89
  90                 output.lexemes.push_back(current_lexeme);
  91         }
  92
  93         bool lexer::string_match(std::string const & target)
  94         {
  95                 if(end - i < target.size())
  96                         return false;
  97
  98                 return input.substr(i, target.size()) == target;
  99         }
 100
 101         void lexer::process_newline(bool next_line)
 102         {
 103                 if(!current_line.lexemes.empty())
 104                 {
 105                         current_line.line = line;
 106                         lines.push_back(current_line);
 107                 }
 108                 std::string line_string = input.substr(line_offset, i - line_offset);
 109                 current_line = line_of_code();
 110
 111                 i++;
 112                 line_offset = i;
 113
 114                 if(next_line)
 115                         line++;
 116                 else
 117                 {
 118                         //skip initial spaces after ` and (
 119                         for(; i < end && input[i] == ' '; i++);
 120                 }
 121         }
 122
 123         void lexer::process_one_liner(word summand)
 124         {
 125                 uword indentation_level = current_line.indentation_level;
 126                 process_newline(false);
 127                 current_line.indentation_level = indentation_level + summand;
 128         }
 129
 130         void lexer::parse_lexemes()
 131         {
 132                 initialise_tables();
 133
 134                 line = 1;
 135
 136                 line_offset = 0;
 137
 138                 for(i = 0, end = input.size(); i < end;)
 139                 {
 140                         if(parse_operator(current_line))
 141                                 continue;
 142
 143                         char const tab = '\t';
 144
 145                         char byte = input[i];
 146
 147                         switch(byte)
 148                         {
 149                                 case tab:
 150                                         if(current_line.indentation_level > 0)
 151                                                 lexer_error("Tabs are only permitted in the beginning of a line (offset " + ail::number_to_string(i - line_offset + 1) + ")");
 152                                         for(i++, current_line.indentation_level = 1; i < end && input[i] == tab; i++, current_line.indentation_level++);
 153                                         continue;
 154
 155                                 case ' ':
 156                                 case '\r':
 157                                         i++;
 158                                         continue;
 159
 160                                 case '\n':
 161                                 {
 162                                         process_newline();
 163                                         continue;
 164                                 }
 165
 166                                 case '\'':
 167                                 case '"':
 168                                 {
 169                                         std::string string;
 170                                         parse_string(current_line);
 171                                         continue;
 172                                 }
 173
 174                                 case ';':
 175                                         parse_comment();
 176                                         continue;
 177
 178                                 case '`':
 179                                         process_newline(false);
 180                                         continue;
 181
 182                                 case '(':
 183                                         process_one_liner(1);
 184                                         continue;
 185
 186                                 case ')':
 187                                         process_one_liner(-1);
 188                                         continue;
 189                         }
 190
 191                         if(parse_number(current_line))
 192                                 continue;
 193
 194                         parse_name(current_line);
 195                 }
 196
 197                 if(!current_line.lexemes.empty())
 198                 {
 199                         current_line.line = line;
 200                         lines.push_back(current_line);
 201                 }
 202         }
 203
 204         std::string visualise_lexemes(lines_of_code & lines)
 205         {
 206                 std::string output;
 207
 208                 BOOST_FOREACH(line_of_code & current_line, lines)
 209                 {
 210                         std::string number_string = ail::number_to_string(current_line.line);
 211                         for(word i = 0, end = 5 - number_string.size(); i < end; i++)
 212                                 output += " ";
 213                         output += number_string;
 214                         output += ": ";
 215                         for(uword indentation = 0; indentation < current_line.indentation_level; indentation++)
 216                                 output += "    ";
 217                         bool first = true;
 218                         BOOST_FOREACH(lexeme & current_lexeme, current_line.lexemes)
 219                         {
 220                                 if(first)
 221                                         first = false;
 222                                 else
 223                                         output += " ";
 224                                 output += "[" + current_lexeme.to_string() + "]";
 225                         }
 226                         output += "\n";
 227                 }
 228
 229                 return output;
 230         }
 231
 232         bool lexer::parse(std::string & error)
 233         {
 234                 try
 235                 {
 236                         parse_lexemes();
 237                         return true;
 238                 }
 239                 catch(ail::exception & exception)
 240                 {
 241                         error = exception.get_message();
 242                         return false;
 243                 }
 244         }
 245 }