net/http/http_content_disposition.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "net/http/http_content_disposition.h"
   6
   7 #include "base/base64.h"
   8 #include "base/logging.h"
   9 #include "base/strings/string_tokenizer.h"
  10 #include "base/strings/string_util.h"
  11 #include "base/strings/sys_string_conversions.h"
  12 #include "base/strings/utf_string_conversions.h"
  13 #include "net/base/net_string_util.h"
  14 #include "net/base/net_util.h"
  15 #include "net/http/http_util.h"
  16
  17 namespace net {
  18
  19 namespace {
  20
  21 enum RFC2047EncodingType {
  22   Q_ENCODING,
  23   B_ENCODING
  24 };
  25
  26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
  27 // decoding a quoted-printable string.  Returns true if the input was valid.
  28 bool DecodeQEncoding(const std::string& input, std::string* output) {
  29   std::string temp;
  30   temp.reserve(input.size());
  31   for (std::string::const_iterator it = input.begin(); it != input.end();
  32        ++it) {
  33     if (*it == '_') {
  34       temp.push_back(' ');
  35     } else if (*it == '=') {
  36       if ((input.end() - it < 3) ||
  37           !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
  38           !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
  39         return false;
  40       unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
  41                          HexDigitToInt(*(it + 2));
  42       temp.push_back(static_cast<char>(ch));
  43       ++it;
  44       ++it;
  45     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
  46       // In a Q-encoded word, only printable ASCII characters
  47       // represent themselves. Besides, space, '=', '_' and '?' are
  48       // not allowed, but they're already filtered out.
  49       DCHECK_NE('=', *it);
  50       DCHECK_NE('?', *it);
  51       DCHECK_NE('_', *it);
  52       temp.push_back(*it);
  53     } else {
  54       return false;
  55     }
  56   }
  57   output->swap(temp);
  58   return true;
  59 }
  60
  61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
  62 // type is specified in |enc_type|.
  63 bool DecodeBQEncoding(const std::string& part,
  64                       RFC2047EncodingType enc_type,
  65                       const std::string& charset,
  66                       std::string* output) {
  67   std::string decoded;
  68   if (!((enc_type == B_ENCODING) ?
  69         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
  70     return false;
  71   }
  72
  73   if (decoded.empty()) {
  74     output->clear();
  75     return true;
  76   }
  77
  78   return ConvertToUtf8(decoded, charset.c_str(), output);
  79 }
  80
  81 bool DecodeWord(const std::string& encoded_word,
  82                 const std::string& referrer_charset,
  83                 bool* is_rfc2047,
  84                 std::string* output,
  85                 int* parse_result_flags) {
  86   *is_rfc2047 = false;
  87   output->clear();
  88   if (encoded_word.empty())
  89     return true;
  90
  91   if (!base::IsStringASCII(encoded_word)) {
  92     // Try UTF-8, referrer_charset and the native OS default charset in turn.
  93     if (base::IsStringUTF8(encoded_word)) {
  94       *output = encoded_word;
  95     } else {
  96       base::string16 utf16_output;
  97       if (!referrer_charset.empty() &&
  98           ConvertToUTF16(encoded_word, referrer_charset.c_str(),
  99                          &utf16_output)) {
 100         *output = base::UTF16ToUTF8(utf16_output);
 101       } else {
 102         *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
 103       }
 104     }
 105
 106     *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
 107     return true;
 108   }
 109
 110   // RFC 2047 : one of encoding methods supported by Firefox and relatively
 111   // widely used by web servers.
 112   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
 113   // We don't care about the length restriction (72 bytes) because
 114   // many web servers generate encoded words longer than the limit.
 115   std::string decoded_word;
 116   *is_rfc2047 = true;
 117   int part_index = 0;
 118   std::string charset;
 119   base::StringTokenizer t(encoded_word, "?");
 120   RFC2047EncodingType enc_type = Q_ENCODING;
 121   while (*is_rfc2047 && t.GetNext()) {
 122     std::string part = t.token();
 123     switch (part_index) {
 124       case 0:
 125         if (part != "=") {
 126           *is_rfc2047 = false;
 127           break;
 128         }
 129         ++part_index;
 130         break;
 131       case 1:
 132         // Do we need charset validity check here?
 133         charset = part;
 134         ++part_index;
 135         break;
 136       case 2:
 137         if (part.size() > 1 ||
 138             part.find_first_of("bBqQ") == std::string::npos) {
 139           *is_rfc2047 = false;
 140           break;
 141         }
 142         if (part[0] == 'b' || part[0] == 'B') {
 143           enc_type = B_ENCODING;
 144         }
 145         ++part_index;
 146         break;
 147       case 3:
 148         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
 149         if (!*is_rfc2047) {
 150           // Last minute failure. Invalid B/Q encoding. Rather than
 151           // passing it through, return now.
 152           return false;
 153         }
 154         ++part_index;
 155         break;
 156       case 4:
 157         if (part != "=") {
 158           // Another last minute failure !
 159           // Likely to be a case of two encoded-words in a row or
 160           // an encoded word followed by a non-encoded word. We can be
 161           // generous, but it does not help much in terms of compatibility,
 162           // I believe. Return immediately.
 163           *is_rfc2047 = false;
 164           return false;
 165         }
 166         ++part_index;
 167         break;
 168       default:
 169         *is_rfc2047 = false;
 170         return false;
 171     }
 172   }
 173
 174   if (*is_rfc2047) {
 175     if (*(encoded_word.end() - 1) == '=') {
 176       output->swap(decoded_word);
 177       *parse_result_flags |=
 178           HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
 179       return true;
 180     }
 181     // encoded_word ending prematurelly with '?' or extra '?'
 182     *is_rfc2047 = false;
 183     return false;
 184   }
 185
 186   // We're not handling 'especial' characters quoted with '\', but
 187   // it should be Ok because we're not an email client but a
 188   // web browser.
 189
 190   // What IE6/7 does: %-escaped UTF-8.
 191   decoded_word = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
 192   if (decoded_word != encoded_word)
 193     *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
 194   if (base::IsStringUTF8(decoded_word)) {
 195     output->swap(decoded_word);
 196     return true;
 197     // We can try either the OS default charset or 'origin charset' here,
 198     // As far as I can tell, IE does not support it. However, I've seen
 199     // web servers emit %-escaped string in a legacy encoding (usually
 200     // origin charset).
 201     // TODO(jungshik) : Test IE further and consider adding a fallback here.
 202   }
 203   return false;
 204 }
 205
 206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
 207 // value is supposed to be of the form:
 208 //
 209 //   value                   = token | quoted-string
 210 //
 211 // However we currently also allow RFC 2047 encoding and non-ASCII
 212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
 213 bool DecodeFilenameValue(const std::string& input,
 214                          const std::string& referrer_charset,
 215                          std::string* output,
 216                          int* parse_result_flags) {
 217   int current_parse_result_flags = 0;
 218   std::string decoded_value;
 219   bool is_previous_token_rfc2047 = true;
 220
 221   // Tokenize with whitespace characters.
 222   base::StringTokenizer t(input, " \t\n\r");
 223   t.set_options(base::StringTokenizer::RETURN_DELIMS);
 224   while (t.GetNext()) {
 225     if (t.token_is_delim()) {
 226       // If the previous non-delimeter token is not RFC2047-encoded,
 227       // put in a space in its place. Otheriwse, skip over it.
 228       if (!is_previous_token_rfc2047)
 229         decoded_value.push_back(' ');
 230       continue;
 231     }
 232     // We don't support a single multibyte character split into
 233     // adjacent encoded words. Some broken mail clients emit headers
 234     // with that problem, but most web servers usually encode a filename
 235     // in a single encoded-word. Firefox/Thunderbird do not support
 236     // it, either.
 237     std::string decoded;
 238     if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
 239                     &decoded, &current_parse_result_flags))
 240       return false;
 241     decoded_value.append(decoded);
 242   }
 243   output->swap(decoded_value);
 244   if (parse_result_flags && !output->empty())
 245     *parse_result_flags |= current_parse_result_flags;
 246   return true;
 247 }
 248
 249 // Parses the charset and value-chars out of an ext-value string.
 250 //
 251 //  ext-value     = charset  "'" [ language ] "'" value-chars
 252 bool ParseExtValueComponents(const std::string& input,
 253                              std::string* charset,
 254                              std::string* value_chars) {
 255   base::StringTokenizer t(input, "'");
 256   t.set_options(base::StringTokenizer::RETURN_DELIMS);
 257   std::string temp_charset;
 258   std::string temp_value;
 259   int numDelimsSeen = 0;
 260   while (t.GetNext()) {
 261     if (t.token_is_delim()) {
 262       ++numDelimsSeen;
 263       continue;
 264     } else {
 265       switch (numDelimsSeen) {
 266         case 0:
 267           temp_charset = t.token();
 268           break;
 269         case 1:
 270           // Language is ignored.
 271           break;
 272         case 2:
 273           temp_value = t.token();
 274           break;
 275         default:
 276           return false;
 277       }
 278     }
 279   }
 280   if (numDelimsSeen != 2)
 281     return false;
 282   if (temp_charset.empty() || temp_value.empty())
 283     return false;
 284   charset->swap(temp_charset);
 285   value_chars->swap(temp_value);
 286   return true;
 287 }
 288
 289 // http://tools.ietf.org/html/rfc5987#section-3.2
 290 //
 291 //  ext-value     = charset  "'" [ language ] "'" value-chars
 292 //
 293 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
 294 //
 295 //  mime-charset  = 1*mime-charsetc
 296 //  mime-charsetc = ALPHA / DIGIT
 297 //                 / "!" / "#" / "$" / "%" / "&"
 298 //                 / "+" / "-" / "^" / "_" / "`"
 299 //                 / "{" / "}" / "~"
 300 //
 301 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
 302 //
 303 //  value-chars   = *( pct-encoded / attr-char )
 304 //
 305 //  pct-encoded   = "%" HEXDIG HEXDIG
 306 //
 307 //  attr-char     = ALPHA / DIGIT
 308 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
 309 //                 / "^" / "_" / "`" / "|" / "~"
 310 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
 311   if (param_value.find('"') != std::string::npos)
 312     return false;
 313
 314   std::string charset;
 315   std::string value;
 316   if (!ParseExtValueComponents(param_value, &charset, &value))
 317     return false;
 318
 319   // RFC 5987 value should be ASCII-only.
 320   if (!base::IsStringASCII(value)) {
 321     decoded->clear();
 322     return true;
 323   }
 324
 325   std::string unescaped = UnescapeURLComponent(
 326       value, UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
 327
 328   return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
 329 }
 330
 331 } // namespace
 332
 333 HttpContentDisposition::HttpContentDisposition(
 334     const std::string& header, const std::string& referrer_charset)
 335   : type_(INLINE),
 336     parse_result_flags_(INVALID) {
 337   Parse(header, referrer_charset);
 338 }
 339
 340 HttpContentDisposition::~HttpContentDisposition() {
 341 }
 342
 343 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
 344     std::string::const_iterator begin, std::string::const_iterator end) {
 345   DCHECK(type_ == INLINE);
 346   std::string::const_iterator delimiter = std::find(begin, end, ';');
 347
 348   std::string::const_iterator type_begin = begin;
 349   std::string::const_iterator type_end = delimiter;
 350   HttpUtil::TrimLWS(&type_begin, &type_end);
 351
 352   // If the disposition-type isn't a valid token the then the
 353   // Content-Disposition header is malformed, and we treat the first bytes as
 354   // a parameter rather than a disposition-type.
 355   if (!HttpUtil::IsToken(type_begin, type_end))
 356     return begin;
 357
 358   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
 359
 360   DCHECK(std::find(type_begin, type_end, '=') == type_end);
 361
 362   if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
 363     type_ = INLINE;
 364   } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
 365     type_ = ATTACHMENT;
 366   } else {
 367     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
 368     type_ = ATTACHMENT;
 369   }
 370   return delimiter;
 371 }
 372
 373 // http://tools.ietf.org/html/rfc6266
 374 //
 375 //  content-disposition = "Content-Disposition" ":"
 376 //                         disposition-type *( ";" disposition-parm )
 377 //
 378 //  disposition-type    = "inline" | "attachment" | disp-ext-type
 379 //                      ; case-insensitive
 380 //  disp-ext-type       = token
 381 //
 382 //  disposition-parm    = filename-parm | disp-ext-parm
 383 //
 384 //  filename-parm       = "filename" "=" value
 385 //                      | "filename*" "=" ext-value
 386 //
 387 //  disp-ext-parm       = token "=" value
 388 //                      | ext-token "=" ext-value
 389 //  ext-token           = <the characters in token, followed by "*">
 390 //
 391 void HttpContentDisposition::Parse(const std::string& header,
 392                                    const std::string& referrer_charset) {
 393   DCHECK(type_ == INLINE);
 394   DCHECK(filename_.empty());
 395
 396   std::string::const_iterator pos = header.begin();
 397   std::string::const_iterator end = header.end();
 398   pos = ConsumeDispositionType(pos, end);
 399
 400   std::string filename;
 401   std::string ext_filename;
 402
 403   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
 404   while (iter.GetNext()) {
 405     if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 406                                                  iter.name_end(),
 407                                                  "filename")) {
 408       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
 409                           &parse_result_flags_);
 410       if (!filename.empty())
 411         parse_result_flags_ |= HAS_FILENAME;
 412     } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 413                                                             iter.name_end(),
 414                                                             "filename*")) {
 415       DecodeExtValue(iter.raw_value(), &ext_filename);
 416       if (!ext_filename.empty())
 417         parse_result_flags_ |= HAS_EXT_FILENAME;
 418     }
 419   }
 420
 421   if (!ext_filename.empty())
 422     filename_ = ext_filename;
 423   else
 424     filename_ = filename;
 425 }
 426
 427 }  // namespace net