toolkit/components/translation/LanguageDetector.sys.mjs

   1 /* This Source Code Form is subject to the terms of the Mozilla Public
   2  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
   3  * You can obtain one at http://mozilla.org/MPL/2.0/. */
   4
   5 // workerManager is exported for tests.
   6 import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
   7
   8 const WORKER_URL = "resource://gre/modules/translation/cld-worker.js";
   9
  10 /**
  11  * The length of the substring to pull from the document's text for language
  12  * identification.
  13  *
  14  * This value should ideally be one that is large enough to yield a confident
  15  * identification result without being too large or expensive to extract.
  16  *
  17  * At this time, this value is not driven by statistical data or analysis.
  18  *
  19  * For the moment, while we investigate which language identification library
  20  * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
  21  */
  22 const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
  23
  24 export var workerManager = {
  25   // Since Emscripten can handle heap growth, but not heap shrinkage, we
  26   // need to refresh the worker after we've processed a particularly large
  27   // string in order to prevent unnecessary resident memory growth.
  28   //
  29   // These values define the cut-off string length and the idle timeout
  30   // (in milliseconds) before destroying a worker. Once a string of the
  31   // maximum size has been processed, the worker is marked for
  32   // destruction, and is terminated as soon as it has been idle for the
  33   // given timeout.
  34   //
  35   // 1.5MB. This is the approximate string length that forces heap growth
  36   // for a 2MB heap.
  37   LARGE_STRING: 1.5 * 1024 * 1024,
  38   IDLE_TIMEOUT: 10 * 1000,
  39
  40   detectionQueue: [],
  41
  42   detectLanguage(aParams) {
  43     return this.workerReady
  44       .then(worker => {
  45         return new Promise(resolve => {
  46           this.detectionQueue.push({ resolve });
  47           worker.postMessage(aParams);
  48         });
  49       })
  50       .then(result => {
  51         // We have our asynchronous result from the worker.
  52         //
  53         // Determine if our input was large enough to trigger heap growth,
  54         // or if we're already waiting to destroy the worker when it's
  55         // idle. If so, schedule termination after the idle timeout.
  56         if (
  57           aParams.text.length >= this.LARGE_STRING ||
  58           this._idleTimeout != null
  59         ) {
  60           this.flushWorker();
  61         }
  62
  63         return result;
  64       });
  65   },
  66
  67   _worker: null,
  68   _workerReadyPromise: null,
  69
  70   get workerReady() {
  71     if (!this._workerReadyPromise) {
  72       this._workerReadyPromise = new Promise(resolve => {
  73         let worker = new Worker(WORKER_URL);
  74         worker.onmessage = aMsg => {
  75           if (aMsg.data == "ready") {
  76             resolve(worker);
  77           } else {
  78             this.detectionQueue.shift().resolve(aMsg.data);
  79           }
  80         };
  81         this._worker = worker;
  82       });
  83     }
  84
  85     return this._workerReadyPromise;
  86   },
  87
  88   // Holds the ID of the current pending idle cleanup setTimeout.
  89   _idleTimeout: null,
  90
  91   // Schedule the current worker to be terminated after the idle timeout.
  92   flushWorker() {
  93     if (this._idleTimeout != null) {
  94       clearTimeout(this._idleTimeout);
  95     }
  96
  97     this._idleTimeout = setTimeout(
  98       this._flushWorker.bind(this),
  99       this.IDLE_TIMEOUT
 100     );
 101   },
 102
 103   // Immediately terminate the worker, as long as there no pending
 104   // results. Otherwise, reschedule termination until after the next
 105   // idle timeout.
 106   _flushWorker() {
 107     if (this.detectionQueue.length) {
 108       this.flushWorker();
 109     } else {
 110       if (this._worker) {
 111         this._worker.terminate();
 112       }
 113
 114       this._worker = null;
 115       this._workerReadyPromise = null;
 116       this._idleTimeout = null;
 117     }
 118   },
 119 };
 120
 121 export var LanguageDetector = {
 122   /**
 123    * Detect the language of a given string.
 124    *
 125    * The argument may be either a string containing the text to analyze,
 126    * or an object with the following properties:
 127    *
 128    *  - 'text' The text to analyze.
 129    *
 130    *  - 'isHTML' (optional) A boolean, indicating whether the text
 131    *      should be analyzed as HTML rather than plain text.
 132    *
 133    *  - 'language' (optional) A string indicating the expected language.
 134    *      For text extracted from HTTP documents, this is expected to
 135    *      come from the Content-Language header.
 136    *
 137    *  - 'tld' (optional) A string indicating the top-level domain of the
 138    *      document the text was extracted from.
 139    *
 140    *  - 'encoding' (optional) A string describing the encoding of the
 141    *      document the string was extracted from. Note that, regardless
 142    *      of the value of this property, the 'text' property must be a
 143    *      UTF-16 JavaScript string.
 144    *
 145    * @returns {Promise<Object>}
 146    * @resolves When detection is finished, with a object containing
 147    * these fields:
 148    *  - 'language' (string with a language code)
 149    *  - 'confident' (boolean) Whether the detector is confident of the
 150    *      result.
 151    *  - 'languages' (array) An array of up to three elements, containing
 152    *      the most prevalent languages detected. It contains a
 153    *      'languageCode' property, containing the ISO language code of
 154    *      the language, and a 'percent' property, describing the
 155    *      approximate percentage of the input which is in that language.
 156    *      For text of an unknown language, the result may contain an
 157    *      entry with the languge code 'un', indicating the percent of
 158    *      the text which is unknown.
 159    */
 160   detectLanguage(aParams) {
 161     if (typeof aParams == "string") {
 162       aParams = { text: aParams };
 163     }
 164
 165     return workerManager.detectLanguage(aParams);
 166   },
 167
 168   /**
 169    * Attempts to determine the language in which the document's content is written.
 170    *
 171    * For the moment, while we investigate which language identification library
 172    * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
 173    * @returns {string | null}
 174    */
 175   async detectLanguageFromDocument(aDocument) {
 176     // Grab a selection of text.
 177     let encoder = Cu.createDocumentEncoder("text/plain");
 178     encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);
 179     let text = encoder
 180       .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
 181       .replaceAll("\r", "")
 182       .replaceAll("\n", " ");
 183
 184     const { language, confident } = await workerManager.detectLanguage({
 185       text,
 186     });
 187
 188     workerManager.flushWorker();
 189
 190     return confident ? language : null;
 191   },
 192 };