include/tools/inetmime.hxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19 #ifndef INCLUDED_TOOLS_INETMIME_HXX
  20 #define INCLUDED_TOOLS_INETMIME_HXX
  21
  22 #include <tools/toolsdllapi.h>
  23 #include <rtl/character.hxx>
  24 #include <rtl/string.hxx>
  25 #include <rtl/ustring.hxx>
  26 #include <tools/debug.hxx>
  27
  28 #include <unordered_map>
  29
  30 struct INetContentTypeParameter
  31 {
         // Describes one parameter of a Content-Type header field: the
         // (Unicode) value plus the optional RFC 2231 character set and
         // language information that accompanied it.

  32     /** The optional character set specification (see RFC 2231), in US-ASCII
  33         encoding and converted to lower case.
  34      */
  35     OString m_sCharset;
  36
  37     /** The optional language specification (see RFC 2231), in US-ASCII
  38         encoding and converted to lower case.
  39      */
  40     OString m_sLanguage;
  41
  42     /** The attribute value.  If the value is a quoted-string, it is
  43         'unpacked.'  If a character set is specified, and the value can be
  44         converted to Unicode, this is done.  Also, if no character set is
  45         specified, it is first tried to convert the value from UTF-8 encoding
  46         to Unicode, and if that doesn't work (because the value is not in
  47         UTF-8 encoding), it is converted from ISO-8859-1 encoding to Unicode
  48         (which will always work).  But if a character set is specified and the
  49         value cannot be converted from that character set to Unicode, special
  50         action is taken to produce a value that can possibly be transformed
  51         back into its original form:  Any 8-bit character from a non-encoded
  52         part of the original value is directly converted to Unicode
  53         (effectively handling it as if it was ISO-8859-1 encoded), and any
  54         8-bit character from an encoded part of the original value is mapped
  55         to the range U+F800..U+F8FF at the top of the Corporate Use Subarea
  56         within Unicode's Private Use Area (effectively adding 0xF800 to the
  57         character's numeric value).
  58      */
  59     OUString m_sValue;
  60
  61     /** This is true if the value is successfully converted to Unicode, and
  62         false if the value is a special mixture of ISO-LATIN-1 characters and
  63         characters from Unicode's Private Use Area.

             The member has no default member initializer, so a
             default-initialized instance leaves it indeterminate.
  64      */
  65     bool m_bConverted;
  66 };
  67
  68 /** The key is the name of the attribute, in US-ASCII encoding and converted
  69     to lower case.  If a parameter value is split as described in RFC 2231,
  70     there will only be one item for the complete parameter, with the attribute
  71     name lacking any section suffix.
  72  */
  73 typedef std::unordered_map<OString, INetContentTypeParameter>
  74     INetContentTypeParameterList;
  75
  76
  77 class SAL_WARN_UNUSED TOOLS_DLLPUBLIC INetMIME
  78 {
         // Every member declared in this class is static: character
         // classification helpers plus scanners for header field bodies.
  79 public:
  80     /** Check for US-ASCII visible character.
  81
  82         @param nChar  Some UCS-4 character.
  83
  84         @return  True if nChar is a US-ASCII visible character (US-ASCII
  85         0x21--0x7E).
  86      */
  87     static inline bool isVisible(sal_uInt32 nChar);
  88
  89     /** Check whether some character is valid within an RFC 822 <atom>.
  90
  91         @param nChar  Some UCS-4 character.
  92
  93         @return  True if nChar is valid within an RFC 822 <atom> (US-ASCII
  94         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '%', '&', ''', '*', '+',
  95         '-', '/', '=', '?', '^', '_', '`', '{', '|', '}', or '~').
  96      */
  97     static bool isAtomChar(sal_uInt32 nChar);
  98
  99     /** Check whether some character is valid within an RFC 2060 <atom>.
 100
 101         @param nChar  Some UCS-4 character.
 102
 103         @return  True if nChar is valid within an RFC 2060 <atom> (US-ASCII
 104         'A'--'Z', 'a'--'z', '0'--'9', '!', '#', '$', '&', ''', '+', ',', '-',
 105         '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`',
 106         '|', '}', or '~').
 107      */
 108     static bool isIMAPAtomChar(sal_uInt32 nChar);
 109
 110     /** Get the digit weight of a US-ASCII character.
 111
 112         @param nChar  Some UCS-4 character.
 113
 114         @return  If nChar is a US-ASCII (decimal) digit character (US-ASCII
 115         '0'--'9'), return the corresponding weight (0--9); otherwise,
 116         return -1.
 117      */
 118     static inline int getWeight(sal_uInt32 nChar);
 119
 120     /** Get the hexadecimal digit weight of a US-ASCII character.
 121
 122         @param nChar  Some UCS-4 character.
 123
 124         @return  If nChar is a US-ASCII hexadecimal digit character (US-ASCII
 125         '0'--'9', 'A'--'F', or 'a'--'f'), return the corresponding weight
 126         (0--15); otherwise, return -1.
 127      */
 128     static inline int getHexWeight(sal_uInt32 nChar);
 129
 130     /** Check two US-ASCII strings for equality, ignoring case.
 131
 132         @param pBegin1  Points to the start of the first string, must not be
 133         null.
 134
 135         @param pEnd1  Points past the end of the first string, must be >=
 136         pBegin1.
 137
 138         @param pString2  Points to the start of the null terminated second
 139         string, must not be null.
 140
 141         @return  True if the two strings are equal, ignoring the case of US-
 142         ASCII alphabetic characters (US-ASCII 'A'--'Z' and 'a'--'z').
 143      */
 144     static bool equalIgnoreCase(const sal_Unicode * pBegin1,
 145                                 const sal_Unicode * pEnd1,
 146                                 const char * pString2);
 147
         /** NOTE(review): undocumented in this header and implemented
             elsewhere.  Judging by the name and signature, this presumably
             scans an unsigned number from the range [rBegin, pEnd), stores it
             in rValue and moves rBegin (passed by reference) past what was
             consumed.  The exact meaning of bLeadingZeroes (presumably
             whether leading zeroes are accepted) and of the bool result
             (presumably success) must be confirmed against the
             implementation.
          */
 148     static bool scanUnsigned(const sal_Unicode *& rBegin,
 149                              const sal_Unicode * pEnd, bool bLeadingZeroes,
 150                              sal_uInt32 & rValue);
 151
 152     /** Parse the body of an RFC 2045 Content-Type header field.
 153
 154         @param rStr  The string that forms the body of the Content-Type
 155         header field, from its first character, inclusive, to its end,
 156         exclusive.  It must be of the form
 157
 158           token "/" token *(";" token "=" (token / quoted-string))
 159
 160         with intervening linear white space and comments (cf. RFCs 822, 2045).
 161         The RFC 2231 extensions are supported.  The encoding of rStr
 162         should be US-ASCII, but any Unicode values in the range U+0080..U+FFFF
 163         are interpreted 'as appropriate.'
 164
 165         @param pType  If not null, returns the type (the first of the above
 166         tokens), in US-ASCII encoding and converted to lower case.
 167
 168         @param pSubType  If not null, returns the sub-type (the second of the
 169         above tokens), in US-ASCII encoding and converted to lower case.
 170
 171         @param pParameters  If not null, returns the parameters as a list of
 172         INetContentTypeParameters (the attributes are in US-ASCII encoding and
 173         converted to lower case, the values are in Unicode encoding).  If
 174         null, only the syntax of the parameters is checked, but they are not
 175         returned.
 176
 177         @return  Null if the syntax of the field body is incorrect (i.e., does
 178         not start with type and sub-type tokens).  Otherwise, a pointer past the
 179         longest valid input prefix.  If null is returned, none of the output
 180         parameters will be modified.
 181      */
 182     static sal_Unicode const * scanContentType(
 183         OUString const & rStr,
 184         OUString * pType = nullptr, OUString * pSubType = nullptr,
 185         INetContentTypeParameterList * pParameters = nullptr);
 186
         /** NOTE(review): undocumented in this header and implemented
             elsewhere.  Judging by the name and signature, this presumably
             turns the raw bytes of a header field body (rBody) into a Unicode
             string; confirm the exact decoding rules in the implementation.
          */
 187     static OUString decodeHeaderFieldBody(const OString& rBody);
 188
 189     /** Get the UTF-32 character at the head of a UTF-16 encoded string.
 190
 191         @param rBegin  Points to the start of the UTF-16 encoded string, must
 192         not be null.  On exit, it points past the first UTF-32 character's
 193         encoding.
 194
 195         @param pEnd  Points past the end of the UTF-16 encoded string, must be
 196         strictly greater than rBegin.
 197
 198         @return  The UCS-4 character at the head of the UTF-16 encoded string.
 199         If the string does not start with a high surrogate followed by a low
 200         surrogate, the first UTF-16 value is returned unchanged.
 201      */
 202     static inline sal_uInt32 getUTF32Character(const sal_Unicode *& rBegin,
 203                                                const sal_Unicode * pEnd);
 204 };
 205
 206 // static
 207 inline bool INetMIME::isVisible(sal_uInt32 nChar)
 208 {
 209     return nChar >= '!' && nChar <= '~';
 210 }
 211
 212 // static
 213 inline int INetMIME::getWeight(sal_uInt32 nChar)
 214 {
 215     return rtl::isAsciiDigit(nChar) ? int(nChar - '0') : -1;
 216 }
 217
 218 // static
 219 inline int INetMIME::getHexWeight(sal_uInt32 nChar)
 220 {
 221     return rtl::isAsciiDigit(nChar) ? int(nChar - '0') :
 222            nChar >= 'A' && nChar <= 'F' ? int(nChar - 'A' + 10) :
 223            nChar >= 'a' && nChar <= 'f' ? int(nChar - 'a' + 10) : -1;
 224 }
 225
 226 // static
 227 inline sal_uInt32 INetMIME::getUTF32Character(const sal_Unicode *& rBegin,
 228                                               const sal_Unicode * pEnd)
 229 {
 230     DBG_ASSERT(rBegin && rBegin < pEnd,
 231                "INetMIME::getUTF32Character(): Bad sequence");
 232     if (rBegin + 1 < pEnd && rBegin[0] >= 0xD800 && rBegin[0] <= 0xDBFF
 233         && rBegin[1] >= 0xDC00 && rBegin[1] <= 0xDFFF)
 234     {
 235         sal_uInt32 nUTF32 = sal_uInt32(*rBegin++ & 0x3FF) << 10;
 236         return (nUTF32 | (*rBegin++ & 0x3FF)) + 0x10000;
 237     }
 238     else
 239         return *rBegin++;
 240 }
 241
 242
 243 #endif
 244
 245 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */