1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nsHyphenator.h"
8 #include "mozilla/dom/ContentChild.h"
9 #include "mozilla/Omnijar.h"
10 #include "nsContentUtils.h"
11 #include "nsIChannel.h"
13 #include "nsIFileURL.h"
14 #include "nsIInputStream.h"
15 #include "nsIJARURI.h"
17 #include "nsNetUtil.h"
18 #include "nsUnicodeProperties.h"
19 #include "nsUTF8Utils.h"
20 #include "nsZipArchive.h"
22 #include "mapped_hyph.h"
24 using namespace mozilla
;
26 void DefaultDelete
<const HyphDic
>::operator()(const HyphDic
* aHyph
) const {
27 mapped_hyph_free_dictionary(const_cast<HyphDic
*>(aHyph
));
30 void DefaultDelete
<const CompiledData
>::operator()(
31 const CompiledData
* aData
) const {
32 mapped_hyph_free_compiled_data(const_cast<CompiledData
*>(aData
));
35 static const void* GetItemPtrFromJarURI(nsIJARURI
* aJAR
, uint32_t* aLength
) {
36 // Try to get the jarfile's nsZipArchive, find the relevant item, and return
37 // a pointer to its data provided it is stored uncompressed.
38 nsCOMPtr
<nsIURI
> jarFile
;
39 if (NS_FAILED(aJAR
->GetJARFile(getter_AddRefs(jarFile
)))) {
42 nsCOMPtr
<nsIFileURL
> fileUrl
= do_QueryInterface(jarFile
);
46 nsCOMPtr
<nsIFile
> file
;
47 fileUrl
->GetFile(getter_AddRefs(file
));
51 RefPtr
<nsZipArchive
> archive
= Omnijar::GetReader(file
);
54 aJAR
->GetJAREntry(path
);
55 nsZipItem
* item
= archive
->GetItem(path
.get());
56 if (item
&& item
->Compression() == 0 && item
->Size() > 0) {
57 // We do NOT own this data, but it won't go away until the omnijar
58 // file is closed during shutdown.
59 const uint8_t* data
= archive
->GetData(item
);
61 *aLength
= item
->Size();
69 static UniquePtr
<base::SharedMemory
> GetHyphDictFromParent(nsIURI
* aURI
,
71 MOZ_ASSERT(!XRE_IsParentProcess());
72 base::SharedMemoryHandle handle
= base::SharedMemory::NULLHandle();
75 if (!dom::ContentChild::GetSingleton()->SendGetHyphDict(aURI
, &handle
,
79 UniquePtr
<base::SharedMemory
> shm
= MakeUnique
<base::SharedMemory
>();
80 if (!shm
->IsHandleValid(handle
)) {
83 if (!shm
->SetHandle(std::move(handle
), true)) {
86 if (!shm
->Map(size
)) {
89 char* addr
= static_cast<char*>(shm
->memory());
97 static UniquePtr
<base::SharedMemory
> CopyToShmem(const CompiledData
* aData
) {
98 MOZ_ASSERT(XRE_IsParentProcess());
100 // The shm-related calls here are not expected to fail, but if they do,
101 // we'll just return null (as if the resource was unavailable) and proceed
102 // without hyphenation.
103 uint32_t size
= mapped_hyph_compiled_data_size(aData
);
104 UniquePtr
<base::SharedMemory
> shm
= MakeUnique
<base::SharedMemory
>();
105 if (!shm
->CreateFreezeable(size
)) {
108 if (!shm
->Map(size
)) {
111 char* buffer
= static_cast<char*>(shm
->memory());
116 memcpy(buffer
, mapped_hyph_compiled_data_ptr(aData
), size
);
117 if (!shm
->Freeze()) {
124 static UniquePtr
<base::SharedMemory
> LoadFromURI(nsIURI
* aURI
,
127 MOZ_ASSERT(XRE_IsParentProcess());
128 nsCOMPtr
<nsIChannel
> channel
;
129 if (NS_FAILED(NS_NewChannel(
130 getter_AddRefs(channel
), aURI
, nsContentUtils::GetSystemPrincipal(),
131 nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_SEC_CONTEXT_IS_NULL
,
132 nsIContentPolicy::TYPE_OTHER
))) {
135 nsCOMPtr
<nsIInputStream
> instream
;
136 if (NS_FAILED(channel
->Open(getter_AddRefs(instream
)))) {
139 // Check size, bail out if it is excessively large (the largest of the
140 // hyphenation files currently shipped with Firefox is around 1MB
143 if (NS_FAILED(instream
->Available(&available
)) || !available
||
144 available
> 16 * 1024 * 1024) {
149 UniquePtr
<base::SharedMemory
> shm
= MakeUnique
<base::SharedMemory
>();
150 if (!shm
->CreateFreezeable(available
)) {
153 if (!shm
->Map(available
)) {
156 char* buffer
= static_cast<char*>(shm
->memory());
161 uint32_t bytesRead
= 0;
162 if (NS_FAILED(instream
->Read(buffer
, available
, &bytesRead
)) ||
163 bytesRead
!= available
) {
167 if (!mapped_hyph_is_valid_hyphenator(
168 reinterpret_cast<const uint8_t*>(buffer
), bytesRead
)) {
172 if (!shm
->Freeze()) {
176 *aLength
= bytesRead
;
180 // Read from the URI into a temporary buffer, compile it, then copy the
181 // compiled resource to a shared memory region.
182 auto buffer
= MakeUnique
<char[]>(available
);
183 uint32_t bytesRead
= 0;
184 if (NS_FAILED(instream
->Read(buffer
.get(), available
, &bytesRead
)) ||
185 bytesRead
!= available
) {
189 UniquePtr
<const CompiledData
> data(mapped_hyph_compile_buffer(
190 reinterpret_cast<const uint8_t*>(buffer
.get()), bytesRead
, false));
192 *aLength
= mapped_hyph_compiled_data_size(data
.get());
193 return CopyToShmem(data
.get());
199 nsHyphenator::nsHyphenator(nsIURI
* aURI
, bool aHyphenateCapitalized
)
200 : mDict(static_cast<const void*>(nullptr)),
202 mHyphenateCapitalized(aHyphenateCapitalized
) {
203 // Files with extension ".hyf" are expected to be precompiled mapped_hyph
204 // tables; we also support uncompiled ".dic" files, but they are more
205 // expensive to process on first load.
207 aURI
->GetFilePath(path
);
208 bool precompiled
= StringEndsWith(path
, ".hyf"_ns
);
210 // Content processes don't do compilation; they depend on the parent giving
211 // them a compiled version of the resource, so that we only pay the cost of
212 // compilation once per language per session.
213 if (!precompiled
&& !XRE_IsParentProcess()) {
215 UniquePtr
<base::SharedMemory
> shm
= GetHyphDictFromParent(aURI
, &length
);
217 // We don't need to validate mDict because the parent process
218 // will have done so.
220 mDict
= AsVariant(std::move(shm
));
225 nsCOMPtr
<nsIJARURI
> jar
= do_QueryInterface(aURI
);
227 // This gives us a raw pointer into the omnijar's data (if uncompressed);
228 // we do not own it and must not attempt to free it!
230 const void* ptr
= GetItemPtrFromJarURI(jar
, &length
);
233 // The data should be directly usable by mapped_hyph; validate that it
234 // looks correct, and save the pointer.
235 if (mapped_hyph_is_valid_hyphenator(static_cast<const uint8_t*>(ptr
),
238 mDict
= AsVariant(ptr
);
242 // The data is an uncompiled pattern file, so we need to compile it.
243 // We then move it to shared memory so we can expose it to content
245 MOZ_ASSERT(XRE_IsParentProcess());
246 UniquePtr
<const CompiledData
> data(mapped_hyph_compile_buffer(
247 static_cast<const uint8_t*>(ptr
), length
, false));
249 UniquePtr
<base::SharedMemory
> shm
= CopyToShmem(data
.get());
251 mDictSize
= mapped_hyph_compiled_data_size(data
.get());
252 mDict
= AsVariant(std::move(shm
));
258 // Omnijar must be compressed (currently this is the case on Android).
259 // If we're the parent process, decompress the resource into a shmem
260 // buffer; if we're a child, send a request to the parent for the
261 // shared-memory copy (which it will load if not already available).
262 if (XRE_IsParentProcess()) {
263 UniquePtr
<base::SharedMemory
> shm
=
264 LoadFromURI(aURI
, &length
, precompiled
);
267 mDict
= AsVariant(std::move(shm
));
271 UniquePtr
<base::SharedMemory
> shm
=
272 GetHyphDictFromParent(aURI
, &length
);
274 // We don't need to validate mDict because the parent process
275 // will have done so.
277 mDict
= AsVariant(std::move(shm
));
284 // We get file:// URIs when running an unpackaged build; they could also
285 // occur if we support adding hyphenation dictionaries by putting files in
286 // a directory of the profile, for example.
287 if (net::SchemeIsFile(aURI
)) {
288 // Ask the Rust lib to mmap the file. In this case our mDictSize field
289 // remains zero; mDict is not a pointer to the raw data but an opaque
290 // reference to a Rust object, and can only be freed by passing it to
291 // mapped_hyph_free_dictionary().
292 // (This case occurs in unpackaged developer builds.)
294 // GetFilePath returns the path with an unexpected leading slash (like
295 // "/c:/path/to/firefox/...") that may prevent it being found if it's an
296 // absolute Windows path starting with a drive letter.
297 // So check for this case and strip the slash.
298 if (path
.Length() > 2 && path
[0] == '/' && path
[2] == ':') {
303 // If the file is compiled, we can just map it directly.
304 UniquePtr
<const HyphDic
> dic(mapped_hyph_load_dictionary(path
.get()));
306 mDict
= AsVariant(std::move(dic
));
310 // For an uncompiled .dic file, the parent process is responsible for
311 // compiling it and storing the result in a shmem block that can be
312 // shared to content processes.
313 MOZ_ASSERT(XRE_IsParentProcess());
314 MOZ_ASSERT(StringEndsWith(path
, ".dic"_ns
));
315 UniquePtr
<const CompiledData
> data(
316 mapped_hyph_compile_file(path
.get(), false));
318 UniquePtr
<base::SharedMemory
> shm
= CopyToShmem(data
.get());
320 mDictSize
= mapped_hyph_compiled_data_size(data
.get());
321 mDict
= AsVariant(std::move(shm
));
328 // Each loading branch above will return if successful. So if we get here,
329 // whichever load type we attempted must have failed because something about
330 // the resource is broken.
333 msg
.Insert("Invalid hyphenation resource: ", 0);
334 NS_ASSERTION(false, msg
.get());
337 bool nsHyphenator::IsValid() {
339 [](const void*& ptr
) { return ptr
!= nullptr; },
340 [](UniquePtr
<base::SharedMemory
>& shm
) { return shm
!= nullptr; },
341 [](UniquePtr
<const HyphDic
>& hyph
) { return hyph
!= nullptr; });
344 nsresult
nsHyphenator::Hyphenate(const nsAString
& aString
,
345 nsTArray
<bool>& aHyphens
) {
346 if (!aHyphens
.SetLength(aString
.Length(), fallible
)) {
347 return NS_ERROR_OUT_OF_MEMORY
;
349 memset(aHyphens
.Elements(), false, aHyphens
.Length() * sizeof(bool));
352 uint32_t wordStart
= 0, wordLimit
= 0;
354 for (uint32_t i
= 0; i
< aString
.Length(); i
+= chLen
) {
355 uint32_t ch
= aString
[i
];
358 if (NS_IS_HIGH_SURROGATE(ch
)) {
359 if (i
+ 1 < aString
.Length() && NS_IS_LOW_SURROGATE(aString
[i
+ 1])) {
360 ch
= SURROGATE_TO_UCS4(ch
, aString
[i
+ 1]);
363 NS_WARNING("unpaired surrogate found during hyphenation");
367 nsUGenCategory cat
= unicode::GetGenCategory(ch
);
368 if (cat
== nsUGenCategory::kLetter
|| cat
== nsUGenCategory::kMark
) {
373 wordLimit
= i
+ chLen
;
374 if (i
+ chLen
< aString
.Length()) {
380 HyphenateWord(aString
, wordStart
, wordLimit
, aHyphens
);
388 void nsHyphenator::HyphenateWord(const nsAString
& aString
, uint32_t aStart
,
389 uint32_t aLimit
, nsTArray
<bool>& aHyphens
) {
390 // Convert word from aStart and aLimit in aString to utf-8 for mapped_hyph,
391 // lowercasing it as we go so that it will match the (lowercased) patterns
394 const char16_t
* cur
= aString
.BeginReading() + aStart
;
395 const char16_t
* end
= aString
.BeginReading() + aLimit
;
396 bool firstLetter
= true;
398 uint32_t ch
= *cur
++;
400 if (NS_IS_HIGH_SURROGATE(ch
)) {
401 if (cur
< end
&& NS_IS_LOW_SURROGATE(*cur
)) {
402 ch
= SURROGATE_TO_UCS4(ch
, *cur
++);
404 return; // unpaired surrogate: bail out, don't hyphenate broken text
406 } else if (NS_IS_LOW_SURROGATE(ch
)) {
407 return; // unpaired surrogate
410 // XXX What about language-specific casing? Consider Turkish I/i...
411 // In practice, it looks like the current patterns will not be
412 // affected by this, as they treat dotted and undotted i similarly.
413 uint32_t origCh
= ch
;
414 ch
= ToLowerCase(ch
);
417 // Avoid hyphenating capitalized words (bug 1550532) unless explicitly
418 // allowed by prefs for the language in use.
419 // Also never auto-hyphenate a word that has internal caps, as it may
420 // well be an all-caps acronym or a quirky name like iTunes.
421 if (!mHyphenateCapitalized
|| !firstLetter
) {
427 if (ch
< 0x80) { // U+0000 - U+007F
429 } else if (ch
< 0x0800) { // U+0100 - U+07FF
430 utf8
.Append(0xC0 | (ch
>> 6));
431 utf8
.Append(0x80 | (0x003F & ch
));
432 } else if (ch
< 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
433 utf8
.Append(0xE0 | (ch
>> 12));
434 utf8
.Append(0x80 | (0x003F & (ch
>> 6)));
435 utf8
.Append(0x80 | (0x003F & ch
));
437 utf8
.Append(0xF0 | (ch
>> 18));
438 utf8
.Append(0x80 | (0x003F & (ch
>> 12)));
439 utf8
.Append(0x80 | (0x003F & (ch
>> 6)));
440 utf8
.Append(0x80 | (0x003F & ch
));
444 AutoTArray
<uint8_t, 200> hyphenValues
;
445 hyphenValues
.SetLength(utf8
.Length());
446 int32_t result
= mDict
.match(
447 [&](const void*& ptr
) {
448 return mapped_hyph_find_hyphen_values_raw(
449 static_cast<const uint8_t*>(ptr
), mDictSize
, utf8
.BeginReading(),
450 utf8
.Length(), hyphenValues
.Elements(), hyphenValues
.Length());
452 [&](UniquePtr
<base::SharedMemory
>& shm
) {
453 return mapped_hyph_find_hyphen_values_raw(
454 static_cast<const uint8_t*>(shm
->memory()), mDictSize
,
455 utf8
.BeginReading(), utf8
.Length(), hyphenValues
.Elements(),
456 hyphenValues
.Length());
458 [&](UniquePtr
<const HyphDic
>& hyph
) {
459 return mapped_hyph_find_hyphen_values_dic(
460 hyph
.get(), utf8
.BeginReading(), utf8
.Length(),
461 hyphenValues
.Elements(), hyphenValues
.Length());
464 // We need to convert UTF-8 indexing as used by the hyphenation lib into
465 // UTF-16 indexing of the aHyphens[] array for Gecko.
466 uint32_t utf16index
= 0;
467 for (uint32_t utf8index
= 0; utf8index
< utf8
.Length();) {
468 // We know utf8 is valid, so we only need to look at the first byte of
469 // each character to determine its length and the corresponding UTF-16
470 // length to add to utf16index.
471 const uint8_t leadByte
= utf8
[utf8index
];
472 if (leadByte
< 0x80) {
474 } else if (leadByte
< 0xE0) {
476 } else if (leadByte
< 0xF0) {
481 // The hyphenation value of interest is the one for the last code unit
482 // of the utf-8 character, and is recorded on the last code unit of the
483 // utf-16 character (in the case of a surrogate pair).
484 utf16index
+= leadByte
>= 0xF0 ? 2 : 1;
485 if (utf16index
> 0 && (hyphenValues
[utf8index
- 1] & 0x01)) {
486 aHyphens
[aStart
+ utf16index
- 1] = true;
492 void nsHyphenator::CloneHandle(base::SharedMemoryHandle
* aOutHandle
,
493 uint32_t* aOutSize
) {
494 // If the resource is invalid, or if we fail to share it to the child
495 // process, we'll just bail out and continue without hyphenation; no need
496 // for this to be a fatal error.
497 if (!mDict
.is
<UniquePtr
<base::SharedMemory
>>()) {
500 *aOutHandle
= mDict
.as
<UniquePtr
<base::SharedMemory
>>()->CloneHandle();
501 *aOutSize
= mDictSize
;