1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozilla/intl/WordBreaker.h"
7 #include "mozilla/Preferences.h"
9 using mozilla::intl::WordBreakClass
;
10 using mozilla::intl::WordBreaker
;
11 using mozilla::intl::WordRange
;
14 already_AddRefed
<WordBreaker
> WordBreaker::Create() {
15 return RefPtr
<WordBreaker
>(new WordBreaker()).forget();
18 bool WordBreaker::BreakInBetween(const char16_t
* aText1
, uint32_t aTextLen1
,
19 const char16_t
* aText2
, uint32_t aTextLen2
) {
20 MOZ_ASSERT(nullptr != aText1
, "null ptr");
21 MOZ_ASSERT(nullptr != aText2
, "null ptr");
23 if (!aText1
|| !aText2
|| (0 == aTextLen1
) || (0 == aTextLen2
)) return false;
25 return GetClass(aText1
[aTextLen1
- 1]) != GetClass(aText2
[0]);
// Code-unit range predicates (see GetClass()). Each one expands to a boolean
// expression over `c`. They are macros, so `c` may be evaluated more than
// once; pass only side-effect-free arguments.

// True when none of the bits selected by 0xFF80 are set, i.e. c <= 0x7F for a
// 16-bit value.
28 #define IS_ASCII(c) (0 == (0xFF80 & (c)))
// True for 'a'..'z' and 'A'..'Z'.
29 #define ASCII_IS_ALPHA(c) \
30 ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
// True for '0'..'9'.
31 #define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9'))
// True for space, tab, carriage return and line feed only.
32 #define ASCII_IS_SPACE(c) \
33 ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c)))
// True for every value below 0x2E80.
34 #define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80)
36 // we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect
// NOTE(review): the `#define IS_HAN(c)` line that should introduce the
// expression below is not present in this view. The expression itself accepts
// 0x3400..0x9fff and 0xf900..0xfaff.
39 ((0x3400 <= (c)) && ((c) <= 0x9fff)) || ((0xf900 <= (c)) && ((c) <= 0xfaff))
// True for 0x30A0..0x30FF.
40 #define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
// True for 0x3040..0x309F.
41 #define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
// True for 0xFF60..0xFF9F.
42 #define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
// True for 0x0E00..0x0E7F: masking with 0xFF80 keeps the top 9 of 16 bits.
43 #define IS_THAI(c) (0x0E00 == (0xFF80 & (c))) // Look at the highest 9 bits
// Classifies a single UTF-16 code unit `c` into a WordBreakClass.
//
// Mappings that can be read from the visible lines:
//   - ASCII letters and digits, plus '_' unless the pref
//     "layout.word_select.stop_at_underscore" is true -> kWbClassAlphaLetter
//   - IS_THAI(c)               -> kWbClassThaiLetter
//   - IS_KATAKANA(c)           -> kWbClassKatakanaLetter
//   - IS_HIRAGANA(c)           -> kWbClassHiraganaLetter
//   - IS_HALFWIDTHKATAKANA(c)  -> kWbClassHWKatakanaLetter
//
// NOTE(review): several lines of this function (some branch headers and
// return statements, e.g. for ASCII whitespace and for NBSP) are not present
// in this view, so the remaining mappings are deliberately not documented
// here. Confirm against the complete file.
46 WordBreakClass
WordBreaker::GetClass(char16_t c
) {
47 // The pref is cached on first call; changes will require a browser restart.
// Function-local static: the pref is read once, and `false` is passed as the
// second argument (presumably the fallback when the pref is unset; confirm
// against Preferences::GetBool).
48 static bool sStopAtUnderscore
=
49 Preferences::GetBool("layout.word_select.stop_at_underscore", false);
// Values below 0x2E80 are handled by the first group of branches.
53 if (IS_ALPHABETICAL_SCRIPT(c
)) {
55 if (ASCII_IS_SPACE(c
)) {
57 } else if (ASCII_IS_ALPHA(c
) || ASCII_IS_DIGIT(c
) ||
// '_' is treated as a letter unless the pref asks to stop at underscores.
58 (c
== '_' && !sStopAtUnderscore
)) {
59 return kWbClassAlphaLetter
;
63 } else if (IS_THAI(c
)) {
64 return kWbClassThaiLetter
;
// NOTE(review): the statement returned for NBSP is not visible in this view.
65 } else if (c
== 0x00A0 /*NBSP*/) {
68 return kWbClassAlphaLetter
;
72 return kWbClassHanLetter
;
73 } else if (IS_KATAKANA(c
)) {
74 return kWbClassKatakanaLetter
;
75 } else if (IS_HIRAGANA(c
)) {
76 return kWbClassHiraganaLetter
;
77 } else if (IS_HALFWIDTHKATAKANA(c
)) {
78 return kWbClassHWKatakanaLetter
;
80 return kWbClassAlphaLetter
;
// Trailing fallback: the value 0 converted to the enum type.
// NOTE(review): whether this statement is reachable cannot be judged from the
// visible lines.
83 return static_cast<WordBreakClass
>(0);
// Finds the word surrounding position aOffset in aText by comparing the
// GetClass() of aText[aOffset] with its neighbours.
//
// From the visible lines:
//   - range.mBegin and range.mEnd are both preset to aTextLen + 1, and that
//     range is returned unchanged when aText is null or aOffset > aTextLen.
//   - A forward scan starts at aOffset + 1 and a backward scan starts at
//     aOffset; each one tests for the first code unit whose class differs
//     from the class at aOffset.
//
// NOTE(review): the third parameter, the declarations of `range` and `i`, the
// bodies of both scan loops, and the final return are not present in this
// view. Confirm against the complete file.
86 WordRange
WordBreaker::FindWord(const char16_t
* aText
, uint32_t aTextLen
,
89 MOZ_ASSERT(nullptr != aText
, "null ptr");
90 MOZ_ASSERT(0 != aTextLen
, "len = 0");
91 MOZ_ASSERT(aOffset
<= aTextLen
, "aOffset > aTextLen");
// aTextLen + 1 in both fields is the value handed back by the early return
// below.
93 range
.mBegin
= aTextLen
+ 1;
94 range
.mEnd
= aTextLen
+ 1;
96 if (!aText
|| aOffset
> aTextLen
) return range
;
// NOTE(review): aOffset == aTextLen passes the guard above, so this reads
// aText[aTextLen], one element past the aTextLen code units. That is safe
// only if callers guarantee a readable element there (e.g. a terminator);
// confirm.
98 WordBreakClass c
= GetClass(aText
[aOffset
]);
// Forward scan.
// NOTE(review): the bound is `i <= aTextLen`, so the last iteration also
// indexes aText[aTextLen]; same caveat as above.
102 for (i
= aOffset
+ 1; i
<= aTextLen
; i
++) {
103 if (c
!= GetClass(aText
[i
])) {
// Backward scan: compares against the code unit just before index i, stopping
// at the start of the buffer.
111 for (i
= aOffset
; i
> 0; i
--) {
112 if (c
!= GetClass(aText
[i
- 1])) {
// Placeholder for Thai: the comments below describe intended future work; no
// code for it is visible here.
117 if (kWbClassThaiLetter
== c
) {
118 // need to call Thai word breaker from here
119 // we should pass the whole Thai segment to the thai word breaker to find a
// Scans forward through aText (aLen code units) from a starting index `cur`
// and reports where the current run ends.
//
// From the visible lines:
//   - Returns NS_WORDBREAKER_NEED_MORE_TEXT when `cur` equals aLen before the
//     scan, and again when `cur` equals aLen after the scan.
//   - c1 is the class of the code unit at the starting index; the loop
//     classifies each following code unit into c2 while cur < aLen.
//
// NOTE(review): the third parameter, the declaration/initialisation of `cur`,
// the comparison inside the loop, and the final return are not present in
// this view. Confirm against the complete file.
125 int32_t WordBreaker::NextWord(const char16_t
* aText
, uint32_t aLen
,
127 WordBreakClass c1
, c2
;
// NOTE(review): only equality with aLen is checked. A starting index greater
// than aLen would reach aText[cur] below; presumably callers guarantee
// cur <= aLen, confirm.
129 if (cur
== aLen
) return NS_WORDBREAKER_NEED_MORE_TEXT
;
130 c1
= GetClass(aText
[cur
]);
// Advance past the starting code unit, then classify each subsequent one.
132 for (cur
++; cur
< aLen
; cur
++) {
133 c2
= GetClass(aText
[cur
]);
// Placeholder for Thai: the comments below describe intended future work; no
// code for it is visible here.
136 if (kWbClassThaiLetter
== c1
) {
137 // need to call Thai word breaker from here
138 // we should pass the whole Thai segment to the thai word breaker to find a
// Reaching aLen means no end was found within the supplied text.
141 if (cur
== aLen
) return NS_WORDBREAKER_NEED_MORE_TEXT
;