net/tools/flip_server/url_to_filename_encoder.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <stdlib.h>
   6
   7 #include "base/logging.h"
   8 #include "base/strings/string_util.h"
   9 #include "net/base/net_util.h"
  10 #include "net/tools/flip_server/url_to_filename_encoder.h"
  11
  12 using std::string;
  13
  14 namespace {
  15
  16 // Returns 1 if buf is prefixed by "num_digits" of hex digits
  17 // Teturns 0 otherwise.
  18 // The function checks for '\0' for string termination.
  19 int HexDigitsPrefix(const char* buf, int num_digits) {
  20   for (int i = 0; i < num_digits; i++) {
  21     if (!base::IsHexDigit(buf[i]))
  22       return 0;  // This also detects end of string as '\0' is not xdigit.
  23   }
  24   return 1;
  25 }
  26
// On WIN32 builds, route strtoull (used by ParseLeadingHex64Value below) to
// _strtoui64.
// NOTE(review): presumably the targeted Windows C runtime does not provide
// strtoull -- confirm before removing this shim.
#ifdef WIN32
#define strtoull _strtoui64
#endif
  30
  31 // A simple parser for long long values. Returns the parsed value if a
  32 // valid integer is found; else returns deflt
  33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s.
  34 uint64 ParseLeadingHex64Value(const char* str, uint64 deflt) {
  35   char* error = NULL;
  36   const uint64 value = strtoull(str, &error, 16);
  37   return (error == str) ? deflt : value;
  38 }
  39 }
  40
  41 namespace net {
  42
// The escape character choice is made here -- all code and tests in this
// directory are based off of this constant.  However, our testdata
// has tons of dependencies on this, so it cannot be changed without
// re-running those tests and fixing them.
//
// kEscapeChar introduces every escape sequence the encoder writes (",XX" hex
// escapes, ",." / ",..", and the ",-" truncation marker) and is also appended
// to the end of every encoded leaf name.
const char UrlToFilenameEncoder::kEscapeChar = ',';
// Written right after kEscapeChar when an over-long segment is split; it
// marks the directory separator that follows as artificial, i.e. not part of
// the URL.
const char UrlToFilenameEncoder::kTruncationChar = '-';
// Longest segment, in characters, that AppendSegment emits without splitting.
const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
  50
  51 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
  52   CHECK(!segment->empty());
  53   if ((*segment == ".") || (*segment == "..")) {
  54     dest->append(1, kEscapeChar);
  55     dest->append(*segment);
  56     segment->clear();
  57   } else {
  58     size_t segment_size = segment->size();
  59     if (segment_size > kMaximumSubdirectoryLength) {
  60       // We need to inject ",-" at the end of the segment to signify that
  61       // we are inserting an artificial '/'.  This means we have to chop
  62       // off at least two characters to make room.
  63       segment_size = kMaximumSubdirectoryLength - 2;
  64
  65       // But we don't want to break up an escape sequence that happens to lie at
  66       // the end.  Escape sequences are at most 2 characters.
  67       if ((*segment)[segment_size - 1] == kEscapeChar) {
  68         segment_size -= 1;
  69       } else if ((*segment)[segment_size - 2] == kEscapeChar) {
  70         segment_size -= 2;
  71       }
  72       dest->append(segment->data(), segment_size);
  73       dest->append(1, kEscapeChar);
  74       dest->append(1, kTruncationChar);
  75       segment->erase(0, segment_size);
  76
  77       // At this point, if we had segment_size=3, and segment="abcd",
  78       // then after this erase, we will have written "abc,-" and set segment="d"
  79     } else {
  80       dest->append(*segment);
  81       segment->clear();
  82     }
  83   }
  84 }
  85
  86 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
  87                                          const string& escaped_ending,
  88                                          char dir_separator,
  89                                          string* encoded_filename) {
  90   string filename_ending = UrlUtilities::Unescape(escaped_ending);
  91
  92   char encoded[3];
  93   int encoded_len;
  94   string segment;
  95
  96   // TODO(jmarantz): This code would be a bit simpler if we disallowed
  97   // Instaweb allowing filename_prefix to not end in "/".  We could
  98   // then change the is routine to just take one input string.
  99   size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
 100   if (start_of_segment == string::npos) {
 101     segment = filename_prefix;
 102   } else {
 103     segment = filename_prefix.substr(start_of_segment + 1);
 104     *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
 105   }
 106
 107   size_t index = 0;
 108   // Special case the first / to avoid adding a leading kEscapeChar.
 109   if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
 110     encoded_filename->append(segment);
 111     segment.clear();
 112     encoded_filename->append(1, dir_separator);
 113     ++index;
 114   }
 115
 116   for (; index < filename_ending.length(); ++index) {
 117     unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
 118
 119     // Note: instead of outputing an empty segment, we let the second slash
 120     // be escaped below.
 121     if ((ch == dir_separator) && !segment.empty()) {
 122       AppendSegment(&segment, encoded_filename);
 123       encoded_filename->append(1, dir_separator);
 124       segment.clear();
 125     } else {
 126       // After removing unsafe chars the only safe ones are _.=+- and alphanums.
 127       if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
 128           (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
 129           (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
 130         encoded[0] = ch;
 131         encoded_len = 1;
 132       } else {
 133         encoded[0] = kEscapeChar;
 134         encoded[1] = ch / 16;
 135         encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
 136         encoded[2] = ch % 16;
 137         encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
 138         encoded_len = 3;
 139       }
 140       segment.append(encoded, encoded_len);
 141
 142       // If segment is too big, we must chop it into chunks.
 143       if (segment.size() > kMaximumSubdirectoryLength) {
 144         AppendSegment(&segment, encoded_filename);
 145         encoded_filename->append(1, dir_separator);
 146       }
 147     }
 148   }
 149
 150   // Append "," to the leaf filename so the leaf can also be a branch., e.g.
 151   // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
 152   // /a/b/c/d".  So we will rename the "d" here to "d,".  If doing that pushed
 153   // us over the 128 char limit, then we will need to append "/" and the
 154   // remaining chars.
 155   segment += kEscapeChar;
 156   AppendSegment(&segment, encoded_filename);
 157   if (!segment.empty()) {
 158     // The last overflow segment is special, because we appended in
 159     // kEscapeChar above.  We won't need to check it again for size
 160     // or further escaping.
 161     encoded_filename->append(1, dir_separator);
 162     encoded_filename->append(segment);
 163   }
 164 }
 165
 166 // Note: this decoder is not the exact inverse of the EncodeSegment above,
 167 // because it does not take into account a prefix.
 168 bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
 169                                   char dir_separator,
 170                                   string* decoded_url) {
 171   enum State { kStart, kEscape, kFirstDigit, kTruncate, kEscapeDot };
 172   State state = kStart;
 173   char hex_buffer[3];
 174   hex_buffer[2] = '\0';
 175   for (size_t i = 0; i < encoded_filename.size(); ++i) {
 176     char ch = encoded_filename[i];
 177     switch (state) {
 178       case kStart:
 179         if (ch == kEscapeChar) {
 180           state = kEscape;
 181         } else if (ch == dir_separator) {
 182           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
 183         } else {
 184           decoded_url->append(1, ch);
 185         }
 186         break;
 187       case kEscape:
 188         if (HexDigitsPrefix(&ch, 1) == 1) {
 189           hex_buffer[0] = ch;
 190           state = kFirstDigit;
 191         } else if (ch == kTruncationChar) {
 192           state = kTruncate;
 193         } else if (ch == '.') {
 194           decoded_url->append(1, '.');
 195           state = kEscapeDot;  // Look for at most one more dot.
 196         } else if (ch == dir_separator) {
 197           // Consider url "//x".  This was once encoded to "/,/x,".
 198           // This code is what skips the first Escape.
 199           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
 200           state = kStart;
 201         } else {
 202           return false;
 203         }
 204         break;
 205       case kFirstDigit:
 206         if (HexDigitsPrefix(&ch, 1) == 1) {
 207           hex_buffer[1] = ch;
 208           uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
 209           decoded_url->append(1, static_cast<char>(hex_value));
 210           state = kStart;
 211         } else {
 212           return false;
 213         }
 214         break;
 215       case kTruncate:
 216         if (ch == dir_separator) {
 217           // Skip this separator, it was only put in to break up long
 218           // path segments, but is not part of the URL.
 219           state = kStart;
 220         } else {
 221           return false;
 222         }
 223         break;
 224       case kEscapeDot:
 225         decoded_url->append(1, ch);
 226         state = kStart;
 227         break;
 228     }
 229   }
 230
 231   // All legal encoded filenames end in kEscapeChar.
 232   return (state == kEscape);
 233 }
 234
 235 // Escape the given input |path| and chop any individual components
 236 // of the path which are greater than kMaximumSubdirectoryLength characters
 237 // into two chunks.
 238 //
 239 // This legacy version has several issues with aliasing of different URLs,
 240 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode
 241 // the filenames back into URLs.
 242 //
 243 // But there is a large body of slurped data which depends on this format,
 244 // so leave it as the default for spdy_in_mem_edsm_server.
 245 string UrlToFilenameEncoder::LegacyEscape(const string& path) {
 246   string output;
 247
 248   // Note:  We also chop paths into medium sized 'chunks'.
 249   //        This is due to the incompetence of the windows
 250   //        filesystem, which still hasn't figured out how
 251   //        to deal with long filenames.
 252   int last_slash = 0;
 253   for (size_t index = 0; index < path.length(); index++) {
 254     char ch = path[index];
 255     if (ch == 0x5C)
 256       last_slash = index;
 257     if ((ch == 0x2D) ||                    // hyphen
 258         (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
 259         ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
 260         ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
 261         ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
 262       output.append(&path[index], 1);
 263     } else {
 264       char encoded[3];
 265       encoded[0] = 'x';
 266       encoded[1] = ch / 16;
 267       encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
 268       encoded[2] = ch % 16;
 269       encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
 270       output.append(encoded, 3);
 271     }
 272     if (index - last_slash > kMaximumSubdirectoryLength) {
 273 #ifdef WIN32
 274       char slash = '\\';
 275 #else
 276       char slash = '/';
 277 #endif
 278       output.append(&slash, 1);
 279       last_slash = index;
 280     }
 281   }
 282   return output;
 283 }
 284
 285 }  // namespace net