intl/uconv/nsTextToSubURI.cpp

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5 #include "nsString.h"
   6 #include "nsITextToSubURI.h"
   7 #include "nsEscape.h"
   8 #include "nsTextToSubURI.h"
   9 #include "nsCRT.h"
  10 #include "mozilla/ArrayUtils.h"
  11 #include "mozilla/Encoding.h"
  12 #include "mozilla/Preferences.h"
  13 #include "mozilla/TextUtils.h"
  14 #include "mozilla/Utf8.h"
  15
  16 using namespace mozilla;
  17
// Destructor: defined out of line here and explicitly defaulted, so it
// performs only the compiler-generated member and base cleanup.
  18 nsTextToSubURI::~nsTextToSubURI() = default;
  19
// XPCOM boilerplate macro naming nsITextToSubURI as the interface implemented
// by nsTextToSubURI. NOTE(review): presumably expands to the
// AddRef/Release/QueryInterface definitions -- confirm against nsISupportsImpl.h.
  20 NS_IMPL_ISUPPORTS(nsTextToSubURI, nsITextToSubURI)
  21
  22 NS_IMETHODIMP
  23 nsTextToSubURI::ConvertAndEscape(const nsACString& aCharset,
  24                                  const nsAString& aText, nsACString& aOut) {
  25   auto encoding = Encoding::ForLabelNoReplacement(aCharset);
  26   if (!encoding) {
  27     aOut.Truncate();
  28     return NS_ERROR_UCONV_NOCONV;
  29   }
  30   nsresult rv;
  31   nsAutoCString intermediate;
  32   std::tie(rv, std::ignore) = encoding->Encode(aText, intermediate);
  33   if (NS_FAILED(rv)) {
  34     aOut.Truncate();
  35     return rv;
  36   }
  37   bool ok = NS_Escape(intermediate, aOut, url_XPAlphas);
  38   if (!ok) {
  39     aOut.Truncate();
  40     return NS_ERROR_OUT_OF_MEMORY;
  41   }
  42   return NS_OK;
  43 }
  44
  45 NS_IMETHODIMP
  46 nsTextToSubURI::UnEscapeAndConvert(const nsACString& aCharset,
  47                                    const nsACString& aText, nsAString& aOut) {
  48   auto encoding = Encoding::ForLabelNoReplacement(aCharset);
  49   if (!encoding) {
  50     aOut.Truncate();
  51     return NS_ERROR_UCONV_NOCONV;
  52   }
  53   nsAutoCString unescaped(aText);
  54   NS_UnescapeURL(unescaped);
  55   auto rv = encoding->DecodeWithoutBOMHandling(unescaped, aOut);
  56   if (NS_SUCCEEDED(rv)) {
  57     return NS_OK;
  58   }
  59   return rv;
  60 }
  61
  62 static bool statefulCharset(const char* charset) {
  63   // HZ, UTF-7 and the CN and KR ISO-2022 variants are no longer in
  64   // mozilla-central but keeping them here just in case for the benefit of
  65   // comm-central.
  66   if (!nsCRT::strncasecmp(charset, "ISO-2022-", sizeof("ISO-2022-") - 1) ||
  67       !nsCRT::strcasecmp(charset, "UTF-7") ||
  68       !nsCRT::strcasecmp(charset, "HZ-GB-2312"))
  69     return true;
  70
  71   return false;
  72 }
  73
  74 nsresult nsTextToSubURI::convertURItoUnicode(const nsCString& aCharset,
  75                                              const nsCString& aURI,
  76                                              nsAString& aOut) {
  77   // check for 7bit encoding the data may not be ASCII after we decode
  78   bool isStatefulCharset = statefulCharset(aCharset.get());
  79
  80   if (!isStatefulCharset) {
  81     if (IsAscii(aURI)) {
  82       CopyASCIItoUTF16(aURI, aOut);
  83       return NS_OK;
  84     }
  85     if (IsUtf8(aURI)) {
  86       CopyUTF8toUTF16(aURI, aOut);
  87       return NS_OK;
  88     }
  89   }
  90
  91   // empty charset could indicate UTF-8, but aURI turns out not to be UTF-8.
  92   NS_ENSURE_FALSE(aCharset.IsEmpty(), NS_ERROR_INVALID_ARG);
  93
  94   auto encoding = Encoding::ForLabelNoReplacement(aCharset);
  95   if (!encoding) {
  96     aOut.Truncate();
  97     return NS_ERROR_UCONV_NOCONV;
  98   }
  99   return encoding->DecodeWithoutBOMHandlingAndWithoutReplacement(aURI, aOut);
 100 }
 101
 102 NS_IMETHODIMP nsTextToSubURI::UnEscapeURIForUI(const nsACString& aURIFragment,
 103                                                bool aDontEscape,
 104                                                nsAString& _retval) {
 105   nsAutoCString unescapedSpec;
 106   // skip control octets (0x00 - 0x1f and 0x7f) when unescaping
 107   NS_UnescapeURL(PromiseFlatCString(aURIFragment),
 108                  esc_SkipControl | esc_AlwaysCopy, unescapedSpec);
 109
 110   // in case of failure, return escaped URI
 111   // Test for != NS_OK rather than NS_FAILED, because incomplete multi-byte
 112   // sequences are also considered failure in this context
 113   if (convertURItoUnicode("UTF-8"_ns, unescapedSpec, _retval) != NS_OK) {
 114     // assume UTF-8 instead of ASCII  because hostname (IDN) may be in UTF-8
 115     CopyUTF8toUTF16(aURIFragment, _retval);
 116   }
 117
 118   if (aDontEscape) {
 119     return NS_OK;
 120   }
 121
 122   // If there are any characters that are unsafe for URIs, reescape those.
 123   if (mIDNBlocklist.IsEmpty()) {
 124     mozilla::net::InitializeBlocklist(mIDNBlocklist);
 125     // we allow SPACE and IDEOGRAPHIC SPACE in this method
 126     mozilla::net::RemoveCharFromBlocklist(u' ', mIDNBlocklist);
 127     mozilla::net::RemoveCharFromBlocklist(0x3000, mIDNBlocklist);
 128   }
 129
 130   MOZ_ASSERT(!mIDNBlocklist.IsEmpty());
 131   const nsPromiseFlatString& unescapedResult = PromiseFlatString(_retval);
 132   nsString reescapedSpec;
 133   _retval = NS_EscapeURL(
 134       unescapedResult,
 135       [&](char16_t aChar) -> bool {
 136         return mozilla::net::CharInBlocklist(aChar, mIDNBlocklist);
 137       },
 138       reescapedSpec);
 139
 140   return NS_OK;
 141 }
 142
 143 NS_IMETHODIMP
 144 nsTextToSubURI::UnEscapeNonAsciiURI(const nsACString& aCharset,
 145                                     const nsACString& aURIFragment,
 146                                     nsAString& _retval) {
 147   nsAutoCString unescapedSpec;
 148   NS_UnescapeURL(PromiseFlatCString(aURIFragment),
 149                  esc_AlwaysCopy | esc_OnlyNonASCII, unescapedSpec);
 150   // leave the URI as it is if it's not UTF-8 and aCharset is not a ASCII
 151   // superset since converting "http:" with such an encoding is always a bad
 152   // idea.
 153   if (!IsUtf8(unescapedSpec) &&
 154       (aCharset.LowerCaseEqualsLiteral("utf-16") ||
 155        aCharset.LowerCaseEqualsLiteral("utf-16be") ||
 156        aCharset.LowerCaseEqualsLiteral("utf-16le") ||
 157        aCharset.LowerCaseEqualsLiteral("utf-7") ||
 158        aCharset.LowerCaseEqualsLiteral("x-imap4-modified-utf7"))) {
 159     CopyASCIItoUTF16(aURIFragment, _retval);
 160     return NS_OK;
 161   }
 162
 163   nsresult rv =
 164       convertURItoUnicode(PromiseFlatCString(aCharset), unescapedSpec, _retval);
 165   // NS_OK_UDEC_MOREINPUT is a success code, so caller can't catch the error
 166   // if the string ends with a valid (but incomplete) sequence.
 167   return rv == NS_OK_UDEC_MOREINPUT ? NS_ERROR_UDEC_ILLEGALINPUT : rv;
 168 }
 169
 170 //----------------------------------------------------------------------