1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
3 * You can obtain one at http://mozilla.org/MPL/2.0/. */
5 // workerManager is exported for tests.
6 import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
8 const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";
11 * The length of the substring to pull from the document's text for language
14 * This value should ideally be one that is large enough to yield a confident
15 * identification result without being too large or expensive to extract.
17 * At this time, this value is not driven by statistical data or analysis.
19 * For the moment, while we investigate which language identification library
20 * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
22 const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
24 export var workerManager = {
25 // Since Emscripten can handle heap growth, but not heap shrinkage, we
26 // need to refresh the worker after we've processed a particularly large
27 // string in order to prevent unnecessary resident memory growth.
29 // These values define the cut-off string length and the idle timeout
30 // (in milliseconds) before destroying a worker. Once a string of the
31 // maximum size has been processed, the worker is marked for
32 // destruction, and is terminated as soon as it has been idle for the
35 // 1.5MB. This is the approximate string length that forces heap growth
37 LARGE_STRING: 1.5 * 1024 * 1024,
38 IDLE_TIMEOUT: 10 * 1000,
42 detectLanguage(aParams) {
43 return this.workerReady
45 return new Promise(resolve => {
46 this.detectionQueue.push({ resolve });
47 worker.postMessage(aParams);
51 // We have our asynchronous result from the worker.
53 // Determine if our input was large enough to trigger heap growth,
54 // or if we're already waiting to destroy the worker when it's
55 // idle. If so, schedule termination after the idle timeout.
57 aParams.text.length >= this.LARGE_STRING ||
58 this._idleTimeout != null
68 _workerReadyPromise: null,
71 if (!this._workerReadyPromise) {
72 this._workerReadyPromise = new Promise(resolve => {
73 let worker = new Worker(WORKER_URL);
74 worker.onmessage = aMsg => {
75 if (aMsg.data == "ready") {
78 this.detectionQueue.shift().resolve(aMsg.data);
81 this._worker = worker;
85 return this._workerReadyPromise;
88 // Holds the ID of the current pending idle cleanup setTimeout.
91 // Schedule the current worker to be terminated after the idle timeout.
93 if (this._idleTimeout != null) {
94 clearTimeout(this._idleTimeout);
97 this._idleTimeout = setTimeout(
98 this._flushWorker.bind(this),
103 // Immediately terminate the worker, as long as there are no pending
104 // results. Otherwise, reschedule termination until after the next
107 if (this.detectionQueue.length) {
111 this._worker.terminate();
115 this._workerReadyPromise = null;
116 this._idleTimeout = null;
121 export var LanguageDetector = {
123 * Detect the language of a given string.
125 * The argument may be either a string containing the text to analyze,
126 * or an object with the following properties:
128 * - 'text' The text to analyze.
130 * - 'isHTML' (optional) A boolean, indicating whether the text
131 * should be analyzed as HTML rather than plain text.
133 * - 'language' (optional) A string indicating the expected language.
134 * For text extracted from HTTP documents, this is expected to
135 * come from the Content-Language header.
137 * - 'tld' (optional) A string indicating the top-level domain of the
138 * document the text was extracted from.
140 * - 'encoding' (optional) A string describing the encoding of the
141 * document the string was extracted from. Note that, regardless
142 * of the value of this property, the 'text' property must be a
143 * UTF-16 JavaScript string.
145 * @returns {Promise<Object>}
146 * @resolves When detection is finished, with an object containing
148 * - 'language' (string with a language code)
149 * - 'confident' (boolean) Whether the detector is confident of the
151 * - 'languages' (array) An array of up to three elements, containing
152 * the most prevalent languages detected. It contains a
153 * 'languageCode' property, containing the ISO language code of
154 * the language, and a 'percent' property, describing the
155 * approximate percentage of the input which is in that language.
156 * For text of an unknown language, the result may contain an
157 * entry with the language code 'un', indicating the percent of
158 * the text which is unknown.
160 detectLanguage(aParams) {
161 if (typeof aParams == "string") {
162 aParams = { text: aParams };
165 return workerManager.detectLanguage(aParams);
169 * Attempts to determine the language in which the document's content is written.
171 * For the moment, while we investigate which language identification library
172 * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
173 * @returns {string | null}
175 async detectLanguageFromDocument(aDocument) {
176 // Grab a selection of text.
177 let encoder = Cu.createDocumentEncoder("text/plain");
178 encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);
180 .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
181 .replaceAll("\r", "")
182 .replaceAll("\n", " ");
184 const { language, confident } = await workerManager.detectLanguage({
188 workerManager.flushWorker();
190 return confident ? language : null;