accessible/atk/DOMtoATK.h

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include <glib.h>
   8 #include <cstdint>
   9 #include "mozilla/a11y/HyperTextAccessibleBase.h"
  10 #include "nsCharTraits.h"
  11 #include "nsString.h"
  12
  13 /**
  14  * ATK offsets are counted in unicode codepoints, while DOM offsets are counted
  15  * in UTF-16 code units.  That makes a difference for non-BMP characters,
  16  * which need two UTF-16 code units to be represented (a pair of surrogates),
  17  * while they are just one unicode character.
  18  *
  19  * To keep synchronization between ATK offsets (unicode codepoints) and DOM
  20  * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a
  21  * BOM after each non-BMP character (which would otherwise use 2 UTF-16
  22  * code units for only 1 unicode codepoint).
  23  *
  24  * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but
  25  * that usage is deprecated) normally only appear at the beginning of unicode
  26  * files, but their occurrence within text (notably after cut&paste) is not
  27  * uncommon, and they are thus treated as non-text.
  28  *
  29  * Since the selection requested through ATK may not contain both surrogates
  30  * at the ends of the selection, we need to fetch one UTF-16 code unit more
  31  * on each side, and get rid of it before returning the string to ATK. The
  32  * ATKStringConverterHelper class takes care of this; NewATKString should be
  33  * used to call it properly.
  34  *
  35  * In the end,
  36  * - if the start is between the high and low surrogates, the UTF-8 result
  37  * includes a BOM from it but not the character
  38  * - if the end is between the high and low surrogates, the UTF-8 result
  39  * includes the character but *not* the BOM
  40  * - all non-BMP characters that are fully in the string are in the UTF-8 result
  41  * as character followed by BOM
  42  */
  43 namespace mozilla {
  44 namespace a11y {
  45
  46 namespace DOMtoATK {
  47
  48 /**
  49  * Converts a string of accessible text into ATK gchar* string (by adding
  50  * BOMs). This can be used when offsets do not need to be adjusted because
  51  * ends of the string can not fall between surrogates.
      *
      * aStr is the text to convert; it is taken by const reference.
      *
      * NOTE(review): the returned gchar* is presumably a fresh allocation that
      * the caller must release with g_free() -- confirm against the
      * implementation.
  52  */
  53 gchar* Convert(const nsAString& aStr);
  54
  55 /**
  56  * Add a BOM after each non-BMP character.
      *
      * aSource is only read (const reference); aDest is the only writable
      * parameter, so the result is delivered through it.
      *
      * NOTE(review): the file-level comment says BOMs are added after the
      * translation from UTF-16 to UTF-8, so both strings are presumably UTF-8
      * -- confirm against the implementation.
  57  */
  58 void AddBOMs(nsACString& aDest, const nsACString& aSource);
  59
  60 class ATKStringConverterHelper {
  61  public:
  62   ATKStringConverterHelper(void)
  63       :
  64 #ifdef DEBUG
  65         mAdjusted(false),
  66 #endif
  67         mStartShifted(false),
  68         mEndShifted(false) {
  69   }
  70
  71   /**
  72    * In order to properly get non-BMP values, offsets need to be changed
  73    * to get one character more on each end, so that ConvertUTF16toUTF8 can
  74    * convert surrogates even if the originally requested offsets fall between
  75    * them.
  76    */
  77   void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);
  78
  79   /**
  80    * Converts a string of accessible text with adjusted offsets into ATK
  81    * gchar* string (by adding BOMs).  Note, AdjustOffsets has to be called
  82    * before getting the text passed to this.
  83    */
  84   gchar* ConvertAdjusted(const nsAString& aStr);
  85
  86  private:
  87   /**
  88    * Remove the additional characters requested by PrepareUTF16toUTF8.
  89    */
  90   gchar* FinishUTF16toUTF8(nsCString& aStr);
  91
  92 #ifdef DEBUG
  93   bool mAdjusted;
  94 #endif
  95   bool mStartShifted;
  96   bool mEndShifted;
  97 };
  98
  99 /**
 100  * Get text from aAccessible, using ATKStringConverterHelper to properly
 101  * introduce appropriate BOMs.
 102  */
 103 inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible,
 104                            gint aStartOffset, gint aEndOffset) {
 105   gint startOffset = aStartOffset, endOffset = aEndOffset;
 106   ATKStringConverterHelper converter;
 107   converter.AdjustOffsets(&startOffset, &endOffset,
 108                           gint(aAccessible->CharacterCount()));
 109   nsAutoString str;
 110   aAccessible->TextSubstring(startOffset, endOffset, str);
 111
 112   if (str.Length() == 0) {
 113     // Bogus offsets, or empty string, either way we do not need conversion.
 114     return g_strdup("");
 115   }
 116
 117   return converter.ConvertAdjusted(str);
 118 }
 119
 120 /**
 121  * Get a character from aAccessible, fetching more data as appropriate to
 122  * properly get non-BMP characters or a BOM as appropriate.
 123  */
 124 inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible,
 125                              gint aOffset) {
 126   // char16_t is unsigned short in Mozilla, gnuichar is guint32 in glib.
 127   gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset));
 128
 129   if (NS_IS_LOW_SURROGATE(character)) {
 130     // Trailing surrogate, return BOM instead.
 131     return 0xFEFF;
 132   }
 133
 134   if (NS_IS_HIGH_SURROGATE(character)) {
 135     // Heading surrogate, get the trailing surrogate and combine them.
 136     gunichar characterLow =
 137         static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));
 138
 139     if (!NS_IS_LOW_SURROGATE(characterLow)) {
 140       // It should have been a trailing surrogate... Flag the error.
 141       return 0xFFFD;
 142     }
 143     return SURROGATE_TO_UCS4(character, characterLow);
 144   }
 145
 146   return character;
 147 }
 148
 149 }  // namespace DOMtoATK
 150
 151 }  // namespace a11y
 152 }  // namespace mozilla