net/http/http_content_disposition.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "net/http/http_content_disposition.h"
   6
   7 #include "base/base64.h"
   8 #include "base/logging.h"
   9 #include "base/strings/string_tokenizer.h"
  10 #include "base/strings/string_util.h"
  11 #include "base/strings/sys_string_conversions.h"
  12 #include "base/strings/utf_string_conversions.h"
  13 #include "net/base/net_string_util.h"
  14 #include "net/base/net_util.h"
  15 #include "net/http/http_util.h"
  16
  17 namespace {
  18
// Selects which of the two RFC 2047 encoded-word encodings is applied when
// decoding the encoded-text part of a word (see DecodeBQEncoding()).
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q" encoding; decoded by DecodeQEncoding().
  B_ENCODING   // "B" encoding; decoded with base::Base64Decode().
};
  23
  24 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
  25 // decoding a quoted-printable string.  Returns true if the input was valid.
  26 bool DecodeQEncoding(const std::string& input, std::string* output) {
  27   std::string temp;
  28   temp.reserve(input.size());
  29   for (std::string::const_iterator it = input.begin(); it != input.end();
  30        ++it) {
  31     if (*it == '_') {
  32       temp.push_back(' ');
  33     } else if (*it == '=') {
  34       if ((input.end() - it < 3) ||
  35           !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
  36           !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
  37         return false;
  38       unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
  39                          HexDigitToInt(*(it + 2));
  40       temp.push_back(static_cast<char>(ch));
  41       ++it;
  42       ++it;
  43     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
  44       // In a Q-encoded word, only printable ASCII characters
  45       // represent themselves. Besides, space, '=', '_' and '?' are
  46       // not allowed, but they're already filtered out.
  47       DCHECK_NE('=', *it);
  48       DCHECK_NE('?', *it);
  49       DCHECK_NE('_', *it);
  50       temp.push_back(*it);
  51     } else {
  52       return false;
  53     }
  54   }
  55   output->swap(temp);
  56   return true;
  57 }
  58
  59 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
  60 // type is specified in |enc_type|.
  61 bool DecodeBQEncoding(const std::string& part,
  62                       RFC2047EncodingType enc_type,
  63                       const std::string& charset,
  64                       std::string* output) {
  65   std::string decoded;
  66   if (!((enc_type == B_ENCODING) ?
  67         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
  68     return false;
  69   }
  70
  71   if (decoded.empty()) {
  72     output->clear();
  73     return true;
  74   }
  75
  76   return net::ConvertToUtf8(decoded, charset.c_str(), output);
  77 }
  78
  79 bool DecodeWord(const std::string& encoded_word,
  80                 const std::string& referrer_charset,
  81                 bool* is_rfc2047,
  82                 std::string* output,
  83                 int* parse_result_flags) {
  84   *is_rfc2047 = false;
  85   output->clear();
  86   if (encoded_word.empty())
  87     return true;
  88
  89   if (!base::IsStringASCII(encoded_word)) {
  90     // Try UTF-8, referrer_charset and the native OS default charset in turn.
  91     if (base::IsStringUTF8(encoded_word)) {
  92       *output = encoded_word;
  93     } else {
  94       base::string16 utf16_output;
  95       if (!referrer_charset.empty() &&
  96           net::ConvertToUTF16(encoded_word, referrer_charset.c_str(),
  97                               &utf16_output)) {
  98         *output = base::UTF16ToUTF8(utf16_output);
  99       } else {
 100         *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
 101       }
 102     }
 103
 104     *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
 105     return true;
 106   }
 107
 108   // RFC 2047 : one of encoding methods supported by Firefox and relatively
 109   // widely used by web servers.
 110   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
 111   // We don't care about the length restriction (72 bytes) because
 112   // many web servers generate encoded words longer than the limit.
 113   std::string decoded_word;
 114   *is_rfc2047 = true;
 115   int part_index = 0;
 116   std::string charset;
 117   base::StringTokenizer t(encoded_word, "?");
 118   RFC2047EncodingType enc_type = Q_ENCODING;
 119   while (*is_rfc2047 && t.GetNext()) {
 120     std::string part = t.token();
 121     switch (part_index) {
 122       case 0:
 123         if (part != "=") {
 124           *is_rfc2047 = false;
 125           break;
 126         }
 127         ++part_index;
 128         break;
 129       case 1:
 130         // Do we need charset validity check here?
 131         charset = part;
 132         ++part_index;
 133         break;
 134       case 2:
 135         if (part.size() > 1 ||
 136             part.find_first_of("bBqQ") == std::string::npos) {
 137           *is_rfc2047 = false;
 138           break;
 139         }
 140         if (part[0] == 'b' || part[0] == 'B') {
 141           enc_type = B_ENCODING;
 142         }
 143         ++part_index;
 144         break;
 145       case 3:
 146         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
 147         if (!*is_rfc2047) {
 148           // Last minute failure. Invalid B/Q encoding. Rather than
 149           // passing it through, return now.
 150           return false;
 151         }
 152         ++part_index;
 153         break;
 154       case 4:
 155         if (part != "=") {
 156           // Another last minute failure !
 157           // Likely to be a case of two encoded-words in a row or
 158           // an encoded word followed by a non-encoded word. We can be
 159           // generous, but it does not help much in terms of compatibility,
 160           // I believe. Return immediately.
 161           *is_rfc2047 = false;
 162           return false;
 163         }
 164         ++part_index;
 165         break;
 166       default:
 167         *is_rfc2047 = false;
 168         return false;
 169     }
 170   }
 171
 172   if (*is_rfc2047) {
 173     if (*(encoded_word.end() - 1) == '=') {
 174       output->swap(decoded_word);
 175       *parse_result_flags |=
 176           net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
 177       return true;
 178     }
 179     // encoded_word ending prematurelly with '?' or extra '?'
 180     *is_rfc2047 = false;
 181     return false;
 182   }
 183
 184   // We're not handling 'especial' characters quoted with '\', but
 185   // it should be Ok because we're not an email client but a
 186   // web browser.
 187
 188   // What IE6/7 does: %-escaped UTF-8.
 189   decoded_word = net::UnescapeURLComponent(encoded_word,
 190                                            net::UnescapeRule::SPACES);
 191   if (decoded_word != encoded_word)
 192     *parse_result_flags |=
 193         net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
 194   if (base::IsStringUTF8(decoded_word)) {
 195     output->swap(decoded_word);
 196     return true;
 197     // We can try either the OS default charset or 'origin charset' here,
 198     // As far as I can tell, IE does not support it. However, I've seen
 199     // web servers emit %-escaped string in a legacy encoding (usually
 200     // origin charset).
 201     // TODO(jungshik) : Test IE further and consider adding a fallback here.
 202   }
 203   return false;
 204 }
 205
 206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
 207 // value is supposed to be of the form:
 208 //
 209 //   value                   = token | quoted-string
 210 //
 211 // However we currently also allow RFC 2047 encoding and non-ASCII
 212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
 213 bool DecodeFilenameValue(const std::string& input,
 214                          const std::string& referrer_charset,
 215                          std::string* output,
 216                          int* parse_result_flags) {
 217   int current_parse_result_flags = 0;
 218   std::string decoded_value;
 219   bool is_previous_token_rfc2047 = true;
 220
 221   // Tokenize with whitespace characters.
 222   base::StringTokenizer t(input, " \t\n\r");
 223   t.set_options(base::StringTokenizer::RETURN_DELIMS);
 224   while (t.GetNext()) {
 225     if (t.token_is_delim()) {
 226       // If the previous non-delimeter token is not RFC2047-encoded,
 227       // put in a space in its place. Otheriwse, skip over it.
 228       if (!is_previous_token_rfc2047)
 229         decoded_value.push_back(' ');
 230       continue;
 231     }
 232     // We don't support a single multibyte character split into
 233     // adjacent encoded words. Some broken mail clients emit headers
 234     // with that problem, but most web servers usually encode a filename
 235     // in a single encoded-word. Firefox/Thunderbird do not support
 236     // it, either.
 237     std::string decoded;
 238     if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
 239                     &decoded, &current_parse_result_flags))
 240       return false;
 241     decoded_value.append(decoded);
 242   }
 243   output->swap(decoded_value);
 244   if (parse_result_flags && !output->empty())
 245     *parse_result_flags |= current_parse_result_flags;
 246   return true;
 247 }
 248
 249 // Parses the charset and value-chars out of an ext-value string.
 250 //
 251 //  ext-value     = charset  "'" [ language ] "'" value-chars
 252 bool ParseExtValueComponents(const std::string& input,
 253                              std::string* charset,
 254                              std::string* value_chars) {
 255   base::StringTokenizer t(input, "'");
 256   t.set_options(base::StringTokenizer::RETURN_DELIMS);
 257   std::string temp_charset;
 258   std::string temp_value;
 259   int numDelimsSeen = 0;
 260   while (t.GetNext()) {
 261     if (t.token_is_delim()) {
 262       ++numDelimsSeen;
 263       continue;
 264     } else {
 265       switch (numDelimsSeen) {
 266         case 0:
 267           temp_charset = t.token();
 268           break;
 269         case 1:
 270           // Language is ignored.
 271           break;
 272         case 2:
 273           temp_value = t.token();
 274           break;
 275         default:
 276           return false;
 277       }
 278     }
 279   }
 280   if (numDelimsSeen != 2)
 281     return false;
 282   if (temp_charset.empty() || temp_value.empty())
 283     return false;
 284   charset->swap(temp_charset);
 285   value_chars->swap(temp_value);
 286   return true;
 287 }
 288
 289 // http://tools.ietf.org/html/rfc5987#section-3.2
 290 //
 291 //  ext-value     = charset  "'" [ language ] "'" value-chars
 292 //
 293 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
 294 //
 295 //  mime-charset  = 1*mime-charsetc
 296 //  mime-charsetc = ALPHA / DIGIT
 297 //                 / "!" / "#" / "$" / "%" / "&"
 298 //                 / "+" / "-" / "^" / "_" / "`"
 299 //                 / "{" / "}" / "~"
 300 //
 301 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
 302 //
 303 //  value-chars   = *( pct-encoded / attr-char )
 304 //
 305 //  pct-encoded   = "%" HEXDIG HEXDIG
 306 //
 307 //  attr-char     = ALPHA / DIGIT
 308 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
 309 //                 / "^" / "_" / "`" / "|" / "~"
 310 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
 311   if (param_value.find('"') != std::string::npos)
 312     return false;
 313
 314   std::string charset;
 315   std::string value;
 316   if (!ParseExtValueComponents(param_value, &charset, &value))
 317     return false;
 318
 319   // RFC 5987 value should be ASCII-only.
 320   if (!base::IsStringASCII(value)) {
 321     decoded->clear();
 322     return true;
 323   }
 324
 325   std::string unescaped = net::UnescapeURLComponent(
 326       value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
 327
 328   return net::ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
 329 }
 330
 331 } // namespace
 332
 333 namespace net {
 334
// Constructs the object from a Content-Disposition |header| value. The type
// starts out as INLINE and the parse result flags as INVALID; Parse() then
// updates the object from |header|, using |referrer_charset| when decoding
// filename values.
HttpContentDisposition::HttpContentDisposition(
    const std::string& header, const std::string& referrer_charset)
  : type_(INLINE),
    parse_result_flags_(INVALID) {
  Parse(header, referrer_charset);
}
 341
// Empty body: no explicit cleanup is performed here.
HttpContentDisposition::~HttpContentDisposition() {
}
 344
 345 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
 346     std::string::const_iterator begin, std::string::const_iterator end) {
 347   DCHECK(type_ == INLINE);
 348   std::string::const_iterator delimiter = std::find(begin, end, ';');
 349
 350   std::string::const_iterator type_begin = begin;
 351   std::string::const_iterator type_end = delimiter;
 352   HttpUtil::TrimLWS(&type_begin, &type_end);
 353
 354   // If the disposition-type isn't a valid token the then the
 355   // Content-Disposition header is malformed, and we treat the first bytes as
 356   // a parameter rather than a disposition-type.
 357   if (!HttpUtil::IsToken(type_begin, type_end))
 358     return begin;
 359
 360   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
 361
 362   DCHECK(std::find(type_begin, type_end, '=') == type_end);
 363
 364   if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
 365     type_ = INLINE;
 366   } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
 367     type_ = ATTACHMENT;
 368   } else {
 369     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
 370     type_ = ATTACHMENT;
 371   }
 372   return delimiter;
 373 }
 374
 375 // http://tools.ietf.org/html/rfc6266
 376 //
 377 //  content-disposition = "Content-Disposition" ":"
 378 //                         disposition-type *( ";" disposition-parm )
 379 //
 380 //  disposition-type    = "inline" | "attachment" | disp-ext-type
 381 //                      ; case-insensitive
 382 //  disp-ext-type       = token
 383 //
 384 //  disposition-parm    = filename-parm | disp-ext-parm
 385 //
 386 //  filename-parm       = "filename" "=" value
 387 //                      | "filename*" "=" ext-value
 388 //
 389 //  disp-ext-parm       = token "=" value
 390 //                      | ext-token "=" ext-value
 391 //  ext-token           = <the characters in token, followed by "*">
 392 //
 393 void HttpContentDisposition::Parse(const std::string& header,
 394                                    const std::string& referrer_charset) {
 395   DCHECK(type_ == INLINE);
 396   DCHECK(filename_.empty());
 397
 398   std::string::const_iterator pos = header.begin();
 399   std::string::const_iterator end = header.end();
 400   pos = ConsumeDispositionType(pos, end);
 401
 402   std::string name;
 403   std::string filename;
 404   std::string ext_filename;
 405
 406   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
 407   while (iter.GetNext()) {
 408     if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 409                                                  iter.name_end(),
 410                                                  "filename")) {
 411       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
 412                           &parse_result_flags_);
 413       if (!filename.empty())
 414         parse_result_flags_ |= HAS_FILENAME;
 415     } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 416                                                     iter.name_end(),
 417                                                     "name")) {
 418       DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
 419       if (!name.empty())
 420         parse_result_flags_ |= HAS_NAME;
 421     } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 422                                                             iter.name_end(),
 423                                                             "filename*")) {
 424       DecodeExtValue(iter.raw_value(), &ext_filename);
 425       if (!ext_filename.empty())
 426         parse_result_flags_ |= HAS_EXT_FILENAME;
 427     }
 428   }
 429
 430   if (!ext_filename.empty())
 431     filename_ = ext_filename;
 432   else if (!filename.empty())
 433     filename_ = filename;
 434   else
 435     filename_ = name;
 436 }
 437
 438 }  // namespace net