libcody/buffer.cc

   1 // CODYlib              -*- mode:c++ -*-
   2 // Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
   3 // License: Apache v2.0
   4
   5 // Cody
   6 #include "internal.hh"
   7 // C++
   8 #include <algorithm>
   9 // C
  10 #include <cstring>
  11 // OS
  12 #include <unistd.h>
  13 #include <cerrno>
  14
  15 // MessageBuffer code
  16
  17 // Lines consist of words and end with a NEWLINE (0xa) char
  18 // Whitespace characters are TAB (0x9) and SPACE (0x20)
  19 // Words consist of non-whitespace chars separated by whitespace.
  20 // Multiple lines in one transaction are indicated by ending non-final
  21 // lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE
   22 // (i.e. a continued line ends with SPACE SEMICOLON NEWLINE).
  23 // Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
  24 // Quoting with '...'
  25 // Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
   26 // Within quotes, control chars (< SPACE), DEL, \' and \\ need escaping.
  27 // Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
  28 // Spaces separate words, UTF8 encoding for non-ascii chars
  29
  30 namespace Cody {
  31 namespace Detail {
  32
  33 static const char CONTINUE = S2C(u8";");
  34
  35 void MessageBuffer::BeginLine ()
  36 {
  37   if (!buffer.empty ())
  38     {
  39       // Terminate the previous line with a continuation
  40       buffer.reserve (buffer.size () + 3);
  41       buffer.push_back (S2C(u8" "));
  42       buffer.push_back (CONTINUE);
  43       buffer.push_back (S2C(u8"\n"));
  44     }
  45   lastBol = buffer.size ();
  46 }
  47
  48 // QUOTE means 'maybe quote', we search it for quote-needing chars
  49
  50 void MessageBuffer::Append (char const *str, bool quote, size_t len)
  51 {
  52   if (len == ~size_t (0))
  53     len = strlen (str);
  54
  55   if (!len && !quote)
  56     return;
  57
  58   // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything
  59   // that could remotely be shell-active.  UTF8 encoding for non-ascii.
  60   if (quote && len)
  61     {
  62       quote = false;
  63       // Scan looking for quote-needing characters.  We could just
  64       // append until we find one, but that's probably confusing
  65       for (size_t ix = len; ix--;)
  66         {
  67           unsigned char c = (unsigned char)str[ix];
  68           if (!((c >= S2C(u8"a") && c <= S2C(u8"z"))
  69                 || (c >= S2C(u8"A") && c <= S2C(u8"Z"))
  70                 || (c >= S2C(u8"0") && c <= S2C(u8"9"))
  71                 || c == S2C(u8"-") || c == S2C(u8"+") || c == S2C(u8"_")
  72                 || c == S2C(u8"/") || c == S2C(u8"%") || c == S2C(u8".")))
  73             {
  74               quote = true;
  75               break;
  76             }
  77         }
  78     }
  79
  80   // Maximal length of appended string
  81   buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);
  82
  83   if (quote)
  84     buffer.push_back (S2C(u8"'"));
  85
  86   for (auto *end = str + len; str != end;)
  87     {
  88       auto *e = end;
  89
  90       if (quote)
  91         // Look for next escape-needing char.  More relaxed than
  92         // the earlier needs-quoting check.
  93         for (e = str; e != end; ++e)
  94           {
  95             unsigned char c = (unsigned char)*e;
  96             if (c < S2C(u8" ") || c == 0x7f
  97                 || c == S2C(u8"\\") || c == S2C(u8"'"))
  98               break;
  99           }
 100       buffer.insert (buffer.end (), str, e);
 101       str = e;
 102
 103       if (str == end)
 104         break;
 105
 106       buffer.push_back (S2C(u8"\\"));
 107       switch (unsigned char c = (unsigned char)*str++)
 108         {
 109         case S2C(u8"\t"):
 110           c = S2C(u8"t");
 111           goto append;
 112
 113         case S2C(u8"\n"):
 114           c = S2C(u8"n");
 115           goto append;
 116
 117         case S2C(u8"'"):
 118         case S2C(u8"\\"):
 119         append:
 120           buffer.push_back (c);
 121           break;
 122
 123         default:
 124           // Full-on escape.  Use 2 lower-case hex chars
 125           for (unsigned shift = 8; shift;)
 126             {
 127               shift -= 4;
 128
 129               char nibble = (c >> shift) & 0xf;
 130               nibble += S2C(u8"0");
 131               if (nibble > S2C(u8"9"))
 132                 nibble += S2C(u8"a") - (S2C(u8"9") + 1);
 133               buffer.push_back (nibble);
 134             }
 135         }
 136     }
 137
 138   if (quote)
 139     buffer.push_back (S2C(u8"'"));
 140 }
 141
 142 void MessageBuffer::Append (char c)
 143 {
 144   buffer.push_back (c);
 145 }
 146
 147 void MessageBuffer::AppendInteger (unsigned u)
 148 {
 149   // Sigh, even though std::to_string is C++11, we support building on
 150   // gcc 4.8, which is a C++11 compiler lacking std::to_string.  so
 151   // have something horrible.
 152   std::string v (20, 0);
 153   size_t len = snprintf (const_cast<char *> (v.data ()), v.size (), "%u", u);
 154   v.erase (len);
 155
 156   AppendWord (v);
 157 }
 158
 159 int MessageBuffer::Write (int fd) noexcept
 160 {
 161   size_t limit = buffer.size () - lastBol;
 162   ssize_t count = write (fd, &buffer.data ()[lastBol], limit);
 163
 164   int err = 0;
 165   if (count < 0)
 166     err = errno;
 167   else
 168     {
 169       lastBol += count;
 170       if (size_t (count) != limit)
 171         err = EAGAIN;
 172     }
 173
 174   if (err != EAGAIN && err != EINTR)
 175     {
 176       // Reset for next message
 177       buffer.clear ();
 178       lastBol = 0;
 179     }
 180
 181   return err;
 182 }
 183
 184 int MessageBuffer::Read (int fd) noexcept
 185 {
 186   constexpr size_t blockSize = 200;
 187
 188   size_t lwm = buffer.size ();
 189   size_t hwm = buffer.capacity ();
 190   if (hwm - lwm < blockSize / 2)
 191     hwm += blockSize;
 192   buffer.resize (hwm);
 193
 194   auto iter = buffer.begin () + lwm;
 195   ssize_t count = read (fd, &*iter, hwm - lwm);
 196   buffer.resize (lwm + (count >= 0 ? count : 0));
 197
 198   if (count < 0)
 199     return errno;
 200
 201   if (!count)
 202     // End of file
 203     return -1;
 204
 205   bool more = true;
 206   for (;;)
 207     {
 208       auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));
 209       if (newline == buffer.end ())
 210         break;
 211       more = newline != buffer.begin () && newline[-1] == CONTINUE;
 212       iter = newline + 1;
 213
 214       if (iter == buffer.end ())
 215         break;
 216
 217       if (!more)
 218         {
 219           // There is no continuation, but there are chars after the
 220           // newline.  Truncate the buffer and return an error
 221           buffer.resize (iter - buffer.begin ());
 222           return EINVAL;
 223         }
 224     }
 225
 226   return more ? EAGAIN : 0;
 227 }
 228
 229 int MessageBuffer::Lex (std::vector<std::string> &result)
 230 {
 231   result.clear ();
 232
 233   if (IsAtEnd ())
 234     return ENOENT;
 235
 236   Assert (buffer.back () == S2C(u8"\n"));
 237
 238   auto iter = buffer.begin () + lastBol;
 239
 240   for (std::string *word = nullptr;;)
 241     {
 242       char c = *iter;
 243
 244       ++iter;
 245       if (c == S2C(u8" ") || c == S2C(u8"\t"))
 246         {
 247           word = nullptr;
 248           continue;
 249         }
 250
 251       if (c == S2C(u8"\n"))
 252         break;
 253
 254       if (c == CONTINUE)
 255         {
 256           // Line continuation
 257           if (word || *iter != S2C(u8"\n"))
 258             goto malformed;
 259           ++iter;
 260           break;
 261         }
 262
 263       if (c <= S2C(u8" ") || c >= 0x7f)
 264         goto malformed;
 265
 266       if (!word)
 267         {
 268           result.emplace_back ();
 269           word = &result.back ();
 270         }
 271
 272       if (c == S2C(u8"'"))
 273         {
 274           // Quoted word
 275           for (;;)
 276             {
 277               c = *iter;
 278
 279               if (c == S2C(u8"\n"))
 280                 {
 281                 malformed:;
 282                   result.clear ();
 283                   iter = std::find (iter, buffer.end (), S2C(u8"\n"));
 284                   auto back = iter;
 285                   if (back[-1] == CONTINUE  && back[-2] == S2C(u8" "))
 286                     // Smells like a line continuation
 287                     back -= 2;
 288                   result.emplace_back (&buffer[lastBol],
 289                                        back - buffer.begin () - lastBol);
 290                   ++iter;
 291                   lastBol = iter - buffer.begin ();
 292                   return EINVAL;
 293                 }
 294
 295               if (c < S2C(u8" ") || c >= 0x7f)
 296                 goto malformed;
 297
 298               ++iter;
 299               if (c == S2C(u8"'"))
 300                 break;
 301
 302               if (c == S2C(u8"\\"))
 303                 // escape
 304                 switch (c = *iter)
 305                   {
 306                     case S2C(u8"\\"):
 307                     case S2C(u8"'"):
 308                       ++iter;
 309                       break;
 310
 311                     case S2C(u8"n"):
 312                       c = S2C(u8"\n");
 313                       ++iter;
 314                       break;
 315
 316                     case S2C(u8"_"):
 317                       // We used to escape SPACE as \_, so accept that
 318                       c = S2C(u8" ");
 319                       ++iter;
 320                       break;
 321
 322                     case S2C(u8"t"):
 323                       c = S2C(u8"\t");
 324                       ++iter;
 325                       break;
 326
 327                     default:
 328                       {
 329                         unsigned v = 0;
 330                         for (unsigned nibble = 0; nibble != 2; nibble++)
 331                           {
 332                             c = *iter;
 333                             if (c < S2C(u8"0"))
 334                               {
 335                                 if (!nibble)
 336                                   goto malformed;
 337                                 break;
 338                               }
 339                             else if (c <= S2C(u8"9"))
 340                               c -= S2C(u8"0");
 341                             else if (c < S2C(u8"a"))
 342                               {
 343                                 if (!nibble)
 344                                   goto malformed;
 345                                 break;
 346                               }
 347                             else if (c <= S2C(u8"f"))
 348                               c -= S2C(u8"a") - 10;
 349                             else
 350                               {
 351                                 if (!nibble)
 352                                   goto malformed;
 353                                 break;
 354                               }
 355                             ++iter;
 356                             v = (v << 4) | c;
 357                           }
 358                         c = v;
 359                       }
 360                   }
 361               word->push_back (c);
 362             }
 363         }
 364       else
 365         // Unquoted character
 366         word->push_back (c);
 367     }
 368   lastBol = iter - buffer.begin ();
 369   if (result.empty ())
 370     return ENOENT;
 371
 372   return 0;
 373 }
 374
 375 void MessageBuffer::LexedLine (std::string &str)
 376 {
 377   if (lastBol)
 378     {
 379       size_t pos = lastBol - 1;
 380       for (; pos; pos--)
 381         if (buffer[pos-1] == S2C(u8"\n"))
 382           break;
 383
 384       size_t end = lastBol - 1;
 385       if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" "))
 386         // Strip line continuation
 387         end -= 2;
 388       str.append (&buffer[pos], end - pos);
 389     }
 390 }
 391 } // Detail
 392 } // Cody