Backed out 2 changesets (bug 903746) for causing non-unified build bustages on nsIPri...
[gecko.git] / intl / hyphenation / glue / nsHyphenator.cpp
blob129f30f9d550d03ea34fe46517c57d50de2a39b7
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nsHyphenator.h"
8 #include "mozilla/dom/ContentChild.h"
9 #include "mozilla/Omnijar.h"
10 #include "nsContentUtils.h"
11 #include "nsIChannel.h"
12 #include "nsIFile.h"
13 #include "nsIFileURL.h"
14 #include "nsIInputStream.h"
15 #include "nsIJARURI.h"
16 #include "nsIURI.h"
17 #include "nsNetUtil.h"
18 #include "nsUnicodeProperties.h"
19 #include "nsUTF8Utils.h"
20 #include "nsZipArchive.h"
22 #include "mapped_hyph.h"
24 using namespace mozilla;
26 void DefaultDelete<const HyphDic>::operator()(const HyphDic* aHyph) const {
27 mapped_hyph_free_dictionary(const_cast<HyphDic*>(aHyph));
30 void DefaultDelete<const CompiledData>::operator()(
31 const CompiledData* aData) const {
32 mapped_hyph_free_compiled_data(const_cast<CompiledData*>(aData));
35 static const void* GetItemPtrFromJarURI(nsIJARURI* aJAR, uint32_t* aLength) {
36 // Try to get the jarfile's nsZipArchive, find the relevant item, and return
37 // a pointer to its data provided it is stored uncompressed.
38 nsCOMPtr<nsIURI> jarFile;
39 if (NS_FAILED(aJAR->GetJARFile(getter_AddRefs(jarFile)))) {
40 return nullptr;
42 nsCOMPtr<nsIFileURL> fileUrl = do_QueryInterface(jarFile);
43 if (!fileUrl) {
44 return nullptr;
46 nsCOMPtr<nsIFile> file;
47 fileUrl->GetFile(getter_AddRefs(file));
48 if (!file) {
49 return nullptr;
51 RefPtr<nsZipArchive> archive = Omnijar::GetReader(file);
52 if (archive) {
53 nsCString path;
54 aJAR->GetJAREntry(path);
55 nsZipItem* item = archive->GetItem(path.get());
56 if (item && item->Compression() == 0 && item->Size() > 0) {
57 // We do NOT own this data, but it won't go away until the omnijar
58 // file is closed during shutdown.
59 const uint8_t* data = archive->GetData(item);
60 if (data) {
61 *aLength = item->Size();
62 return data;
66 return nullptr;
69 static UniquePtr<base::SharedMemory> GetHyphDictFromParent(nsIURI* aURI,
70 uint32_t* aLength) {
71 MOZ_ASSERT(!XRE_IsParentProcess());
72 base::SharedMemoryHandle handle = base::SharedMemory::NULLHandle();
73 uint32_t size;
74 MOZ_ASSERT(aURI);
75 if (!dom::ContentChild::GetSingleton()->SendGetHyphDict(aURI, &handle,
76 &size)) {
77 return nullptr;
79 UniquePtr<base::SharedMemory> shm = MakeUnique<base::SharedMemory>();
80 if (!shm->IsHandleValid(handle)) {
81 return nullptr;
83 if (!shm->SetHandle(std::move(handle), true)) {
84 return nullptr;
86 if (!shm->Map(size)) {
87 return nullptr;
89 char* addr = static_cast<char*>(shm->memory());
90 if (!addr) {
91 return nullptr;
93 *aLength = size;
94 return shm;
97 static UniquePtr<base::SharedMemory> CopyToShmem(const CompiledData* aData) {
98 MOZ_ASSERT(XRE_IsParentProcess());
100 // The shm-related calls here are not expected to fail, but if they do,
101 // we'll just return null (as if the resource was unavailable) and proceed
102 // without hyphenation.
103 uint32_t size = mapped_hyph_compiled_data_size(aData);
104 UniquePtr<base::SharedMemory> shm = MakeUnique<base::SharedMemory>();
105 if (!shm->CreateFreezeable(size)) {
106 return nullptr;
108 if (!shm->Map(size)) {
109 return nullptr;
111 char* buffer = static_cast<char*>(shm->memory());
112 if (!buffer) {
113 return nullptr;
116 memcpy(buffer, mapped_hyph_compiled_data_ptr(aData), size);
117 if (!shm->Freeze()) {
118 return nullptr;
121 return shm;
124 static UniquePtr<base::SharedMemory> LoadFromURI(nsIURI* aURI,
125 uint32_t* aLength,
126 bool aPrecompiled) {
127 MOZ_ASSERT(XRE_IsParentProcess());
128 nsCOMPtr<nsIChannel> channel;
129 if (NS_FAILED(NS_NewChannel(
130 getter_AddRefs(channel), aURI, nsContentUtils::GetSystemPrincipal(),
131 nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_SEC_CONTEXT_IS_NULL,
132 nsIContentPolicy::TYPE_OTHER))) {
133 return nullptr;
135 nsCOMPtr<nsIInputStream> instream;
136 if (NS_FAILED(channel->Open(getter_AddRefs(instream)))) {
137 return nullptr;
139 // Check size, bail out if it is excessively large (the largest of the
140 // hyphenation files currently shipped with Firefox is around 1MB
141 // uncompressed).
142 uint64_t available;
143 if (NS_FAILED(instream->Available(&available)) || !available ||
144 available > 16 * 1024 * 1024) {
145 return nullptr;
148 if (aPrecompiled) {
149 UniquePtr<base::SharedMemory> shm = MakeUnique<base::SharedMemory>();
150 if (!shm->CreateFreezeable(available)) {
151 return nullptr;
153 if (!shm->Map(available)) {
154 return nullptr;
156 char* buffer = static_cast<char*>(shm->memory());
157 if (!buffer) {
158 return nullptr;
161 uint32_t bytesRead = 0;
162 if (NS_FAILED(instream->Read(buffer, available, &bytesRead)) ||
163 bytesRead != available) {
164 return nullptr;
167 if (!mapped_hyph_is_valid_hyphenator(
168 reinterpret_cast<const uint8_t*>(buffer), bytesRead)) {
169 return nullptr;
172 if (!shm->Freeze()) {
173 return nullptr;
176 *aLength = bytesRead;
177 return shm;
180 // Read from the URI into a temporary buffer, compile it, then copy the
181 // compiled resource to a shared memory region.
182 auto buffer = MakeUnique<char[]>(available);
183 uint32_t bytesRead = 0;
184 if (NS_FAILED(instream->Read(buffer.get(), available, &bytesRead)) ||
185 bytesRead != available) {
186 return nullptr;
189 UniquePtr<const CompiledData> data(mapped_hyph_compile_buffer(
190 reinterpret_cast<const uint8_t*>(buffer.get()), bytesRead, false));
191 if (data) {
192 *aLength = mapped_hyph_compiled_data_size(data.get());
193 return CopyToShmem(data.get());
196 return nullptr;
199 nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized)
200 : mDict(static_cast<const void*>(nullptr)),
201 mDictSize(0),
202 mHyphenateCapitalized(aHyphenateCapitalized) {
203 // Files with extension ".hyf" are expected to be precompiled mapped_hyph
204 // tables; we also support uncompiled ".dic" files, but they are more
205 // expensive to process on first load.
206 nsAutoCString path;
207 aURI->GetFilePath(path);
208 bool precompiled = StringEndsWith(path, ".hyf"_ns);
210 // Content processes don't do compilation; they depend on the parent giving
211 // them a compiled version of the resource, so that we only pay the cost of
212 // compilation once per language per session.
213 if (!precompiled && !XRE_IsParentProcess()) {
214 uint32_t length;
215 UniquePtr<base::SharedMemory> shm = GetHyphDictFromParent(aURI, &length);
216 if (shm) {
217 // We don't need to validate mDict because the parent process
218 // will have done so.
219 mDictSize = length;
220 mDict = AsVariant(std::move(shm));
222 return;
225 nsCOMPtr<nsIJARURI> jar = do_QueryInterface(aURI);
226 if (jar) {
227 // This gives us a raw pointer into the omnijar's data (if uncompressed);
228 // we do not own it and must not attempt to free it!
229 uint32_t length;
230 const void* ptr = GetItemPtrFromJarURI(jar, &length);
231 if (ptr) {
232 if (precompiled) {
233 // The data should be directly usable by mapped_hyph; validate that it
234 // looks correct, and save the pointer.
235 if (mapped_hyph_is_valid_hyphenator(static_cast<const uint8_t*>(ptr),
236 length)) {
237 mDictSize = length;
238 mDict = AsVariant(ptr);
239 return;
241 } else {
242 // The data is an uncompiled pattern file, so we need to compile it.
243 // We then move it to shared memory so we can expose it to content
244 // processes.
245 MOZ_ASSERT(XRE_IsParentProcess());
246 UniquePtr<const CompiledData> data(mapped_hyph_compile_buffer(
247 static_cast<const uint8_t*>(ptr), length, false));
248 if (data) {
249 UniquePtr<base::SharedMemory> shm = CopyToShmem(data.get());
250 if (shm) {
251 mDictSize = mapped_hyph_compiled_data_size(data.get());
252 mDict = AsVariant(std::move(shm));
253 return;
257 } else {
258 // Omnijar must be compressed (currently this is the case on Android).
259 // If we're the parent process, decompress the resource into a shmem
260 // buffer; if we're a child, send a request to the parent for the
261 // shared-memory copy (which it will load if not already available).
262 if (XRE_IsParentProcess()) {
263 UniquePtr<base::SharedMemory> shm =
264 LoadFromURI(aURI, &length, precompiled);
265 if (shm) {
266 mDictSize = length;
267 mDict = AsVariant(std::move(shm));
268 return;
270 } else {
271 UniquePtr<base::SharedMemory> shm =
272 GetHyphDictFromParent(aURI, &length);
273 if (shm) {
274 // We don't need to validate mDict because the parent process
275 // will have done so.
276 mDictSize = length;
277 mDict = AsVariant(std::move(shm));
278 return;
284 // We get file:// URIs when running an unpackaged build; they could also
285 // occur if we support adding hyphenation dictionaries by putting files in
286 // a directory of the profile, for example.
287 if (net::SchemeIsFile(aURI)) {
288 // Ask the Rust lib to mmap the file. In this case our mDictSize field
289 // remains zero; mDict is not a pointer to the raw data but an opaque
290 // reference to a Rust object, and can only be freed by passing it to
291 // mapped_hyph_free_dictionary().
292 // (This case occurs in unpackaged developer builds.)
293 #if XP_WIN
294 // GetFilePath returns the path with an unexpected leading slash (like
295 // "/c:/path/to/firefox/...") that may prevent it being found if it's an
296 // absolute Windows path starting with a drive letter.
297 // So check for this case and strip the slash.
298 if (path.Length() > 2 && path[0] == '/' && path[2] == ':') {
299 path.Cut(0, 1);
301 #endif
302 if (precompiled) {
303 // If the file is compiled, we can just map it directly.
304 UniquePtr<const HyphDic> dic(mapped_hyph_load_dictionary(path.get()));
305 if (dic) {
306 mDict = AsVariant(std::move(dic));
307 return;
309 } else {
310 // For an uncompiled .dic file, the parent process is responsible for
311 // compiling it and storing the result in a shmem block that can be
312 // shared to content processes.
313 MOZ_ASSERT(XRE_IsParentProcess());
314 MOZ_ASSERT(StringEndsWith(path, ".dic"_ns));
315 UniquePtr<const CompiledData> data(
316 mapped_hyph_compile_file(path.get(), false));
317 if (data) {
318 UniquePtr<base::SharedMemory> shm = CopyToShmem(data.get());
319 if (shm) {
320 mDictSize = mapped_hyph_compiled_data_size(data.get());
321 mDict = AsVariant(std::move(shm));
322 return;
328 // Each loading branch above will return if successful. So if we get here,
329 // whichever load type we attempted must have failed because something about
330 // the resource is broken.
331 nsAutoCString msg;
332 aURI->GetSpec(msg);
333 msg.Insert("Invalid hyphenation resource: ", 0);
334 NS_ASSERTION(false, msg.get());
337 bool nsHyphenator::IsValid() {
338 return mDict.match(
339 [](const void*& ptr) { return ptr != nullptr; },
340 [](UniquePtr<base::SharedMemory>& shm) { return shm != nullptr; },
341 [](UniquePtr<const HyphDic>& hyph) { return hyph != nullptr; });
344 nsresult nsHyphenator::Hyphenate(const nsAString& aString,
345 nsTArray<bool>& aHyphens) {
346 if (!aHyphens.SetLength(aString.Length(), fallible)) {
347 return NS_ERROR_OUT_OF_MEMORY;
349 memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));
351 bool inWord = false;
352 uint32_t wordStart = 0, wordLimit = 0;
353 uint32_t chLen;
354 for (uint32_t i = 0; i < aString.Length(); i += chLen) {
355 uint32_t ch = aString[i];
356 chLen = 1;
358 if (NS_IS_HIGH_SURROGATE(ch)) {
359 if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i + 1])) {
360 ch = SURROGATE_TO_UCS4(ch, aString[i + 1]);
361 chLen = 2;
362 } else {
363 NS_WARNING("unpaired surrogate found during hyphenation");
367 nsUGenCategory cat = unicode::GetGenCategory(ch);
368 if (cat == nsUGenCategory::kLetter || cat == nsUGenCategory::kMark) {
369 if (!inWord) {
370 inWord = true;
371 wordStart = i;
373 wordLimit = i + chLen;
374 if (i + chLen < aString.Length()) {
375 continue;
379 if (inWord) {
380 HyphenateWord(aString, wordStart, wordLimit, aHyphens);
381 inWord = false;
385 return NS_OK;
388 void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
389 uint32_t aLimit, nsTArray<bool>& aHyphens) {
390 // Convert word from aStart and aLimit in aString to utf-8 for mapped_hyph,
391 // lowercasing it as we go so that it will match the (lowercased) patterns
392 // (bug 1105644).
393 nsAutoCString utf8;
394 const char16_t* cur = aString.BeginReading() + aStart;
395 const char16_t* end = aString.BeginReading() + aLimit;
396 bool firstLetter = true;
397 while (cur < end) {
398 uint32_t ch = *cur++;
400 if (NS_IS_HIGH_SURROGATE(ch)) {
401 if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
402 ch = SURROGATE_TO_UCS4(ch, *cur++);
403 } else {
404 return; // unpaired surrogate: bail out, don't hyphenate broken text
406 } else if (NS_IS_LOW_SURROGATE(ch)) {
407 return; // unpaired surrogate
410 // XXX What about language-specific casing? Consider Turkish I/i...
411 // In practice, it looks like the current patterns will not be
412 // affected by this, as they treat dotted and undotted i similarly.
413 uint32_t origCh = ch;
414 ch = ToLowerCase(ch);
416 if (ch != origCh) {
417 // Avoid hyphenating capitalized words (bug 1550532) unless explicitly
418 // allowed by prefs for the language in use.
419 // Also never auto-hyphenate a word that has internal caps, as it may
420 // well be an all-caps acronym or a quirky name like iTunes.
421 if (!mHyphenateCapitalized || !firstLetter) {
422 return;
425 firstLetter = false;
427 if (ch < 0x80) { // U+0000 - U+007F
428 utf8.Append(ch);
429 } else if (ch < 0x0800) { // U+0100 - U+07FF
430 utf8.Append(0xC0 | (ch >> 6));
431 utf8.Append(0x80 | (0x003F & ch));
432 } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
433 utf8.Append(0xE0 | (ch >> 12));
434 utf8.Append(0x80 | (0x003F & (ch >> 6)));
435 utf8.Append(0x80 | (0x003F & ch));
436 } else {
437 utf8.Append(0xF0 | (ch >> 18));
438 utf8.Append(0x80 | (0x003F & (ch >> 12)));
439 utf8.Append(0x80 | (0x003F & (ch >> 6)));
440 utf8.Append(0x80 | (0x003F & ch));
444 AutoTArray<uint8_t, 200> hyphenValues;
445 hyphenValues.SetLength(utf8.Length());
446 int32_t result = mDict.match(
447 [&](const void*& ptr) {
448 return mapped_hyph_find_hyphen_values_raw(
449 static_cast<const uint8_t*>(ptr), mDictSize, utf8.BeginReading(),
450 utf8.Length(), hyphenValues.Elements(), hyphenValues.Length());
452 [&](UniquePtr<base::SharedMemory>& shm) {
453 return mapped_hyph_find_hyphen_values_raw(
454 static_cast<const uint8_t*>(shm->memory()), mDictSize,
455 utf8.BeginReading(), utf8.Length(), hyphenValues.Elements(),
456 hyphenValues.Length());
458 [&](UniquePtr<const HyphDic>& hyph) {
459 return mapped_hyph_find_hyphen_values_dic(
460 hyph.get(), utf8.BeginReading(), utf8.Length(),
461 hyphenValues.Elements(), hyphenValues.Length());
463 if (result > 0) {
464 // We need to convert UTF-8 indexing as used by the hyphenation lib into
465 // UTF-16 indexing of the aHyphens[] array for Gecko.
466 uint32_t utf16index = 0;
467 for (uint32_t utf8index = 0; utf8index < utf8.Length();) {
468 // We know utf8 is valid, so we only need to look at the first byte of
469 // each character to determine its length and the corresponding UTF-16
470 // length to add to utf16index.
471 const uint8_t leadByte = utf8[utf8index];
472 if (leadByte < 0x80) {
473 utf8index += 1;
474 } else if (leadByte < 0xE0) {
475 utf8index += 2;
476 } else if (leadByte < 0xF0) {
477 utf8index += 3;
478 } else {
479 utf8index += 4;
481 // The hyphenation value of interest is the one for the last code unit
482 // of the utf-8 character, and is recorded on the last code unit of the
483 // utf-16 character (in the case of a surrogate pair).
484 utf16index += leadByte >= 0xF0 ? 2 : 1;
485 if (utf16index > 0 && (hyphenValues[utf8index - 1] & 0x01)) {
486 aHyphens[aStart + utf16index - 1] = true;
492 void nsHyphenator::CloneHandle(base::SharedMemoryHandle* aOutHandle,
493 uint32_t* aOutSize) {
494 // If the resource is invalid, or if we fail to share it to the child
495 // process, we'll just bail out and continue without hyphenation; no need
496 // for this to be a fatal error.
497 if (!mDict.is<UniquePtr<base::SharedMemory>>()) {
498 return;
500 *aOutHandle = mDict.as<UniquePtr<base::SharedMemory>>()->CloneHandle();
501 *aOutSize = mDictSize;