intl/lwbrk/WordBreaker.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5
   6 #include "mozilla/intl/WordBreaker.h"
   7 #include "mozilla/Preferences.h"
   8
   9 using mozilla::intl::WordBreakClass;
  10 using mozilla::intl::WordBreaker;
  11 using mozilla::intl::WordRange;
  12
  13 /*static*/
  14 already_AddRefed<WordBreaker> WordBreaker::Create() {
  15   return RefPtr<WordBreaker>(new WordBreaker()).forget();
  16 }
  17
  18 bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
  19                                  const char16_t* aText2, uint32_t aTextLen2) {
  20   MOZ_ASSERT(nullptr != aText1, "null ptr");
  21   MOZ_ASSERT(nullptr != aText2, "null ptr");
  22
  23   if (!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) return false;
  24
  25   return GetClass(aText1[aTextLen1 - 1]) != GetClass(aText2[0]);
  26 }
  27
  28 #define IS_ASCII(c) (0 == (0xFF80 & (c)))
  29 #define ASCII_IS_ALPHA(c) \
  30   ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
  31 #define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9'))
  32 #define ASCII_IS_SPACE(c) \
  33   ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c)))
  34 #define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)
  35
  36 // we change the beginning of IS_HAN from 0x4e00 to 0x3400 to relfect
  37 // Unicode 3.0
  38 #define IS_HAN(c) \
  39   ((0x3400 <= (c)) && ((c) <= 0x9fff)) || ((0xf900 <= (c)) && ((c) <= 0xfaff))
  40 #define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
  41 #define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
  42 #define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
  43 #define IS_THAI(c) (0x0E00 == (0xFF80 & (c)))  // Look at the higest 9 bits
  44
  45 /* static */
  46 WordBreakClass WordBreaker::GetClass(char16_t c) {
  47   // The pref is cached on first call; changes will require a browser restart.
  48   static bool sStopAtUnderscore =
  49       Preferences::GetBool("layout.word_select.stop_at_underscore", false);
  50
  51   // begin of the hack
  52
  53   if (IS_ALPHABETICAL_SCRIPT(c)) {
  54     if (IS_ASCII(c)) {
  55       if (ASCII_IS_SPACE(c)) {
  56         return kWbClassSpace;
  57       } else if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
  58                  (c == '_' && !sStopAtUnderscore)) {
  59         return kWbClassAlphaLetter;
  60       } else {
  61         return kWbClassPunct;
  62       }
  63     } else if (IS_THAI(c)) {
  64       return kWbClassThaiLetter;
  65     } else if (c == 0x00A0 /*NBSP*/) {
  66       return kWbClassSpace;
  67     } else {
  68       return kWbClassAlphaLetter;
  69     }
  70   } else {
  71     if (IS_HAN(c)) {
  72       return kWbClassHanLetter;
  73     } else if (IS_KATAKANA(c)) {
  74       return kWbClassKatakanaLetter;
  75     } else if (IS_HIRAGANA(c)) {
  76       return kWbClassHiraganaLetter;
  77     } else if (IS_HALFWIDTHKATAKANA(c)) {
  78       return kWbClassHWKatakanaLetter;
  79     } else {
  80       return kWbClassAlphaLetter;
  81     }
  82   }
  83   return static_cast<WordBreakClass>(0);
  84 }
  85
  86 WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
  87                                 uint32_t aOffset) {
  88   WordRange range;
  89   MOZ_ASSERT(nullptr != aText, "null ptr");
  90   MOZ_ASSERT(0 != aTextLen, "len = 0");
  91   MOZ_ASSERT(aOffset <= aTextLen, "aOffset > aTextLen");
  92
  93   range.mBegin = aTextLen + 1;
  94   range.mEnd = aTextLen + 1;
  95
  96   if (!aText || aOffset > aTextLen) return range;
  97
  98   WordBreakClass c = GetClass(aText[aOffset]);
  99   uint32_t i;
 100   // Scan forward
 101   range.mEnd--;
 102   for (i = aOffset + 1; i <= aTextLen; i++) {
 103     if (c != GetClass(aText[i])) {
 104       range.mEnd = i;
 105       break;
 106     }
 107   }
 108
 109   // Scan backward
 110   range.mBegin = 0;
 111   for (i = aOffset; i > 0; i--) {
 112     if (c != GetClass(aText[i - 1])) {
 113       range.mBegin = i;
 114       break;
 115     }
 116   }
 117   if (kWbClassThaiLetter == c) {
 118     // need to call Thai word breaker from here
 119     // we should pass the whole Thai segment to the thai word breaker to find a
 120     // shorter answer
 121   }
 122   return range;
 123 }
 124
 125 int32_t WordBreaker::NextWord(const char16_t* aText, uint32_t aLen,
 126                               uint32_t aPos) {
 127   WordBreakClass c1, c2;
 128   uint32_t cur = aPos;
 129   if (cur == aLen) return NS_WORDBREAKER_NEED_MORE_TEXT;
 130   c1 = GetClass(aText[cur]);
 131
 132   for (cur++; cur < aLen; cur++) {
 133     c2 = GetClass(aText[cur]);
 134     if (c2 != c1) break;
 135   }
 136   if (kWbClassThaiLetter == c1) {
 137     // need to call Thai word breaker from here
 138     // we should pass the whole Thai segment to the thai word breaker to find a
 139     // shorter answer
 140   }
 141   if (cur == aLen) return NS_WORDBREAKER_NEED_MORE_TEXT;
 142   return cur;
 143 }