1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // created: 2019may08 Markus W. Scherer
7 #include "unicode/utypes.h"
8 #include "unicode/bytestrie.h"
9 #include "unicode/localematcher.h"
10 #include "unicode/locid.h"
11 #include "unicode/uobject.h"
12 #include "unicode/ures.h"
14 #include "locdistance.h"
15 #include "loclikelysubtags.h"
26 * Bit flag used on the last character of a subtag in the trie.
27 * Must be set consistently by the builder and the lookup code.
29 constexpr int32_t END_OF_SUBTAG
= 0x80;
30 /** Distance value bit flag, set by the builder. */
31 constexpr int32_t DISTANCE_SKIP_SCRIPT
= 0x80;
32 /** Distance value bit flag, set by trieNext(). */
33 constexpr int32_t DISTANCE_IS_FINAL
= 0x100;
34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT
= DISTANCE_IS_FINAL
| DISTANCE_SKIP_SCRIPT
;
36 constexpr int32_t ABOVE_THRESHOLD
= 100;
38 // Indexes into array of distances.
41 IX_DEF_SCRIPT_DISTANCE
,
42 IX_DEF_REGION_DISTANCE
,
43 IX_MIN_REGION_DISTANCE
,
47 LocaleDistance
*gLocaleDistance
= nullptr;
48 UInitOnce gInitOnce
{};
50 UBool U_CALLCONV
cleanup() {
51 delete gLocaleDistance
;
52 gLocaleDistance
= nullptr;
59 void U_CALLCONV
LocaleDistance::initLocaleDistance(UErrorCode
&errorCode
) {
60 // This function is invoked only via umtx_initOnce().
61 U_ASSERT(gLocaleDistance
== nullptr);
62 const XLikelySubtags
&likely
= *XLikelySubtags::getSingleton(errorCode
);
63 if (U_FAILURE(errorCode
)) { return; }
64 const LocaleDistanceData
&data
= likely
.getDistanceData();
65 if (data
.distanceTrieBytes
== nullptr ||
66 data
.regionToPartitions
== nullptr || data
.partitions
== nullptr ||
68 data
.distances
== nullptr) {
69 errorCode
= U_MISSING_RESOURCE_ERROR
;
72 gLocaleDistance
= new LocaleDistance(data
, likely
);
73 if (gLocaleDistance
== nullptr) {
74 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
77 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE
, cleanup
);
80 const LocaleDistance
*LocaleDistance::getSingleton(UErrorCode
&errorCode
) {
81 if (U_FAILURE(errorCode
)) { return nullptr; }
82 umtx_initOnce(gInitOnce
, &LocaleDistance::initLocaleDistance
, errorCode
);
83 return gLocaleDistance
;
86 LocaleDistance::LocaleDistance(const LocaleDistanceData
&data
, const XLikelySubtags
&likely
) :
87 likelySubtags(likely
),
88 trie(data
.distanceTrieBytes
),
89 regionToPartitionsIndex(data
.regionToPartitions
), partitionArrays(data
.partitions
),
90 paradigmLSRs(data
.paradigms
), paradigmLSRsLength(data
.paradigmsLength
),
91 defaultLanguageDistance(data
.distances
[IX_DEF_LANG_DISTANCE
]),
92 defaultScriptDistance(data
.distances
[IX_DEF_SCRIPT_DISTANCE
]),
93 defaultRegionDistance(data
.distances
[IX_DEF_REGION_DISTANCE
]),
94 minRegionDistance(data
.distances
[IX_MIN_REGION_DISTANCE
]) {
95 // For the default demotion value, use the
96 // default region distance between unrelated Englishes.
97 // Thus, unless demotion is turned off,
98 // a mere region difference for one desired locale
99 // is as good as a perfect match for the next following desired locale.
100 // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
101 LSR
en("en", "Latn", "US", LSR::EXPLICIT_LSR
);
102 LSR
enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR
);
103 const LSR
*p_enGB
= &enGB
;
104 int32_t indexAndDistance
= getBestIndexAndDistance(en
, &p_enGB
, 1,
105 shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE
, ULOCMATCH_DIRECTION_WITH_ONE_WAY
);
106 defaultDemotionPerDesiredLocale
= getDistanceFloor(indexAndDistance
);
109 int32_t LocaleDistance::getBestIndexAndDistance(
111 const LSR
**supportedLSRs
, int32_t supportedLSRsLength
,
112 int32_t shiftedThreshold
,
113 ULocMatchFavorSubtag favorSubtag
, ULocMatchDirection direction
) const {
114 BytesTrie
iter(trie
);
115 // Look up the desired language only once for all supported LSRs.
116 // Its "distance" is either a match point value of 0, or a non-match negative value.
117 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
118 int32_t desLangDistance
= trieNext(iter
, desired
.language
, false);
119 uint64_t desLangState
= desLangDistance
>= 0 && supportedLSRsLength
> 1 ? iter
.getState64() : 0;
120 // Index of the supported LSR with the lowest distance.
121 int32_t bestIndex
= -1;
122 // Cached lookup info from XLikelySubtags.compareLikely().
123 int32_t bestLikelyInfo
= -1;
124 for (int32_t slIndex
= 0; slIndex
< supportedLSRsLength
; ++slIndex
) {
125 const LSR
&supported
= *supportedLSRs
[slIndex
];
127 int32_t distance
= desLangDistance
;
129 U_ASSERT((distance
& DISTANCE_IS_FINAL
) == 0);
131 iter
.resetToState64(desLangState
);
133 distance
= trieNext(iter
, supported
.language
, true);
135 // Note: The data builder verifies that there are no rules with "any" (*) language and
136 // real (non *) script or region subtags.
137 // This means that if the lookup for either language fails we can use
138 // the default distances without further lookups.
141 flags
= distance
& DISTANCE_IS_FINAL_OR_SKIP_SCRIPT
;
142 distance
&= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT
;
144 if (uprv_strcmp(desired
.language
, supported
.language
) == 0) {
147 distance
= defaultLanguageDistance
;
152 U_ASSERT(0 <= distance
&& distance
<= 100);
153 // Round up the shifted threshold (if fraction bits are not 0)
154 // for comparison with un-shifted distances until we need fraction bits.
155 // (If we simply shifted non-zero fraction bits away, then we might ignore a language
156 // when it's really still a micro distance below the threshold.)
157 int32_t roundedThreshold
= (shiftedThreshold
+ DISTANCE_FRACTION_MASK
) >> DISTANCE_SHIFT
;
158 // We implement "favor subtag" by reducing the language subtag distance
159 // (unscientifically reducing it to a quarter of the normal value),
160 // so that the script distance is relatively more important.
161 // For example, given a default language distance of 80, we reduce it to 20,
162 // which is below the default threshold of 50, which is the default script distance.
163 if (favorSubtag
== ULOCMATCH_FAVOR_SCRIPT
) {
166 // Let distance == roundedThreshold pass until the tie-breaker logic
167 // at the end of the loop.
168 if (distance
> roundedThreshold
) {
172 int32_t scriptDistance
;
173 if (star
|| flags
!= 0) {
174 if (uprv_strcmp(desired
.script
, supported
.script
) == 0) {
177 scriptDistance
= defaultScriptDistance
;
180 scriptDistance
= getDesSuppScriptDistance(iter
, iter
.getState64(),
181 desired
.script
, supported
.script
);
182 flags
= scriptDistance
& DISTANCE_IS_FINAL
;
183 scriptDistance
&= ~DISTANCE_IS_FINAL
;
185 distance
+= scriptDistance
;
186 if (distance
> roundedThreshold
) {
190 if (uprv_strcmp(desired
.region
, supported
.region
) == 0) {
191 // regionDistance = 0
192 } else if (star
|| (flags
& DISTANCE_IS_FINAL
) != 0) {
193 distance
+= defaultRegionDistance
;
195 int32_t remainingThreshold
= roundedThreshold
- distance
;
196 if (minRegionDistance
> remainingThreshold
) {
200 // From here on we know the regions are not equal.
201 // Map each region to zero or more partitions. (zero = one non-matching string)
202 // (Each array of single-character partition strings is encoded as one string.)
203 // If either side has more than one, then we find the maximum distance.
204 // This could be optimized by adding some more structure, but probably not worth it.
205 distance
+= getRegionPartitionsDistance(
206 iter
, iter
.getState64(),
207 partitionsForRegion(desired
),
208 partitionsForRegion(supported
),
211 int32_t shiftedDistance
= shiftDistance(distance
);
212 if (shiftedDistance
== 0) {
213 // Distinguish between equivalent but originally unequal locales via an
214 // additional micro distance.
215 shiftedDistance
|= (desired
.flags
^ supported
.flags
);
216 if (shiftedDistance
< shiftedThreshold
) {
217 if (direction
!= ULOCMATCH_DIRECTION_ONLY_TWO_WAY
||
218 // Is there also a match when we swap desired/supported?
219 isMatch(supported
, desired
, shiftedThreshold
, favorSubtag
)) {
220 if (shiftedDistance
== 0) {
221 return slIndex
<< INDEX_SHIFT
;
224 shiftedThreshold
= shiftedDistance
;
229 if (shiftedDistance
< shiftedThreshold
) {
230 if (direction
!= ULOCMATCH_DIRECTION_ONLY_TWO_WAY
||
231 // Is there also a match when we swap desired/supported?
232 isMatch(supported
, desired
, shiftedThreshold
, favorSubtag
)) {
234 shiftedThreshold
= shiftedDistance
;
237 } else if (shiftedDistance
== shiftedThreshold
&& bestIndex
>= 0) {
238 if (direction
!= ULOCMATCH_DIRECTION_ONLY_TWO_WAY
||
239 // Is there also a match when we swap desired/supported?
240 isMatch(supported
, desired
, shiftedThreshold
, favorSubtag
)) {
241 bestLikelyInfo
= likelySubtags
.compareLikely(
242 supported
, *supportedLSRs
[bestIndex
], bestLikelyInfo
);
243 if ((bestLikelyInfo
& 1) != 0) {
244 // This supported locale matches as well as the previous best match,
245 // and neither matches perfectly,
246 // but this one is "more likely" (has more-default subtags).
253 return bestIndex
>= 0 ?
254 (bestIndex
<< INDEX_SHIFT
) | shiftedThreshold
:
255 INDEX_NEG_1
| shiftDistance(ABOVE_THRESHOLD
);
258 int32_t LocaleDistance::getDesSuppScriptDistance(
259 BytesTrie
&iter
, uint64_t startState
, const char *desired
, const char *supported
) {
260 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
261 int32_t distance
= trieNext(iter
, desired
, false);
263 distance
= trieNext(iter
, supported
, true);
266 UStringTrieResult result
= iter
.resetToState64(startState
).next(u
'*'); // <*, *>
267 U_ASSERT(USTRINGTRIE_HAS_VALUE(result
));
268 if (uprv_strcmp(desired
, supported
) == 0) {
269 distance
= 0; // same script
271 distance
= iter
.getValue();
272 U_ASSERT(distance
>= 0);
274 if (result
== USTRINGTRIE_FINAL_VALUE
) {
275 distance
|= DISTANCE_IS_FINAL
;
281 int32_t LocaleDistance::getRegionPartitionsDistance(
282 BytesTrie
&iter
, uint64_t startState
,
283 const char *desiredPartitions
, const char *supportedPartitions
, int32_t threshold
) {
284 char desired
= *desiredPartitions
++;
285 char supported
= *supportedPartitions
++;
286 U_ASSERT(desired
!= 0 && supported
!= 0);
287 // See if we have single desired/supported partitions, from NUL-terminated
288 // partition strings without explicit length.
289 bool suppLengthGt1
= *supportedPartitions
!= 0; // gt1: more than 1 character
290 // equivalent to: if (desLength == 1 && suppLength == 1)
291 if (*desiredPartitions
== 0 && !suppLengthGt1
) {
292 // Fastpath for single desired/supported partitions.
293 UStringTrieResult result
= iter
.next(uprv_invCharToAscii(desired
) | END_OF_SUBTAG
);
294 if (USTRINGTRIE_HAS_NEXT(result
)) {
295 result
= iter
.next(uprv_invCharToAscii(supported
) | END_OF_SUBTAG
);
296 if (USTRINGTRIE_HAS_VALUE(result
)) {
297 return iter
.getValue();
300 return getFallbackRegionDistance(iter
, startState
);
303 const char *supportedStart
= supportedPartitions
- 1; // for restart of inner loop
304 int32_t regionDistance
= 0;
305 // Fall back to * only once, not for each pair of partition strings.
308 // Look up each desired-partition string only once,
309 // not for each (desired, supported) pair.
310 UStringTrieResult result
= iter
.next(uprv_invCharToAscii(desired
) | END_OF_SUBTAG
);
311 if (USTRINGTRIE_HAS_NEXT(result
)) {
312 uint64_t desState
= suppLengthGt1
? iter
.getState64() : 0;
314 result
= iter
.next(uprv_invCharToAscii(supported
) | END_OF_SUBTAG
);
316 if (USTRINGTRIE_HAS_VALUE(result
)) {
321 d
= getFallbackRegionDistance(iter
, startState
);
326 } else if (regionDistance
< d
) {
329 if ((supported
= *supportedPartitions
++) != 0) {
330 iter
.resetToState64(desState
);
336 int32_t d
= getFallbackRegionDistance(iter
, startState
);
339 } else if (regionDistance
< d
) {
344 if ((desired
= *desiredPartitions
++) != 0) {
345 iter
.resetToState64(startState
);
346 supportedPartitions
= supportedStart
;
347 supported
= *supportedPartitions
++;
352 return regionDistance
;
355 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie
&iter
, uint64_t startState
) {
357 UStringTrieResult result
=
359 iter
.resetToState64(startState
).next(u
'*'); // <*, *>
360 U_ASSERT(USTRINGTRIE_HAS_VALUE(result
));
361 int32_t distance
= iter
.getValue();
362 U_ASSERT(distance
>= 0);
366 int32_t LocaleDistance::trieNext(BytesTrie
&iter
, const char *s
, bool wantValue
) {
369 return -1; // no empty subtags in the distance data
372 c
= uprv_invCharToAscii(c
);
373 // EBCDIC: If *s is not an invariant character,
374 // then c is now 0 and will simply not match anything, which is harmless.
377 if (!USTRINGTRIE_HAS_NEXT(iter
.next(c
))) {
381 // last character of this subtag
382 UStringTrieResult result
= iter
.next(c
| END_OF_SUBTAG
);
384 if (USTRINGTRIE_HAS_VALUE(result
)) {
385 int32_t value
= iter
.getValue();
386 if (result
== USTRINGTRIE_FINAL_VALUE
) {
387 value
|= DISTANCE_IS_FINAL
;
392 if (USTRINGTRIE_HAS_NEXT(result
)) {
402 UBool
LocaleDistance::isParadigmLSR(const LSR
&lsr
) const {
403 // Linear search for a very short list (length 6 as of 2019),
404 // because we look for equivalence not equality, and
405 // because it's easy.
406 // If there are many paradigm LSRs we should use a hash set
407 // with custom comparator and hasher.
408 U_ASSERT(paradigmLSRsLength
<= 15);
409 for (int32_t i
= 0; i
< paradigmLSRsLength
; ++i
) {
410 if (lsr
.isEquivalentTo(paradigmLSRs
[i
])) { return true; }