browser/extensions/formautofill/FormAutofillNameUtils.jsm

   1 /* This Source Code Form is subject to the terms of the Mozilla Public
   2  * License, v. 2.0. If a copy of the MPL was not distributed with this
   3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   4
   5 "use strict";
   6
   7 var EXPORTED_SYMBOLS = ["FormAutofillNameUtils"];
   8
   9 // FormAutofillNameUtils is initially translated from
  10 // https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
  11 var FormAutofillNameUtils = {
  12   NAME_PREFIXES: [
  13     "1lt",
  14     "1st",
  15     "2lt",
  16     "2nd",
  17     "3rd",
  18     "admiral",
  19     "capt",
  20     "captain",
  21     "col",
  22     "cpt",
  23     "dr",
  24     "gen",
  25     "general",
  26     "lcdr",
  27     "lt",
  28     "ltc",
  29     "ltg",
  30     "ltjg",
  31     "maj",
  32     "major",
  33     "mg",
  34     "mr",
  35     "mrs",
  36     "ms",
  37     "pastor",
  38     "prof",
  39     "rep",
  40     "reverend",
  41     "rev",
  42     "sen",
  43     "st",
  44   ],
  45
  46   NAME_SUFFIXES: [
  47     "b.a",
  48     "ba",
  49     "d.d.s",
  50     "dds",
  51     "i",
  52     "ii",
  53     "iii",
  54     "iv",
  55     "ix",
  56     "jr",
  57     "m.a",
  58     "m.d",
  59     "ma",
  60     "md",
  61     "ms",
  62     "ph.d",
  63     "phd",
  64     "sr",
  65     "v",
  66     "vi",
  67     "vii",
  68     "viii",
  69     "x",
  70   ],
  71
  72   FAMILY_NAME_PREFIXES: [
  73     "d'",
  74     "de",
  75     "del",
  76     "der",
  77     "di",
  78     "la",
  79     "le",
  80     "mc",
  81     "san",
  82     "st",
  83     "ter",
  84     "van",
  85     "von",
  86   ],
  87
  88   // The common and non-ambiguous CJK surnames (last names) that have more than
  89   // one character.
  90   COMMON_CJK_MULTI_CHAR_SURNAMES: [
  91     // Korean, taken from the list of surnames:
  92     // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
  93     "남궁",
  94     "사공",
  95     "서문",
  96     "선우",
  97     "제갈",
  98     "황보",
  99     "독고",
 100     "망절",
 101
 102     // Chinese, taken from the top 10 Chinese 2-character surnames:
 103     // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
 104     // Simplified Chinese (mostly mainland China)
 105     "欧阳",
 106     "令狐",
 107     "皇甫",
 108     "上官",
 109     "司徒",
 110     "诸葛",
 111     "司马",
 112     "宇文",
 113     "呼延",
 114     "端木",
 115     // Traditional Chinese (mostly Taiwan)
 116     "張簡",
 117     "歐陽",
 118     "諸葛",
 119     "申屠",
 120     "尉遲",
 121     "司馬",
 122     "軒轅",
 123     "夏侯",
 124   ],
 125
 126   // All Korean surnames that have more than one character, even the
 127   // rare/ambiguous ones.
 128   KOREAN_MULTI_CHAR_SURNAMES: [
 129     "강전",
 130     "남궁",
 131     "독고",
 132     "동방",
 133     "망절",
 134     "사공",
 135     "서문",
 136     "선우",
 137     "소봉",
 138     "어금",
 139     "장곡",
 140     "제갈",
 141     "황목",
 142     "황보",
 143   ],
 144
 145   // The whitespace definition based on
 146   // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
 147   WHITESPACE: [
 148     "\u0009", // CHARACTER TABULATION
 149     "\u000A", // LINE FEED (LF)
 150     "\u000B", // LINE TABULATION
 151     "\u000C", // FORM FEED (FF)
 152     "\u000D", // CARRIAGE RETURN (CR)
 153     "\u0020", // SPACE
 154     "\u0085", // NEXT LINE (NEL)
 155     "\u00A0", // NO-BREAK SPACE
 156     "\u1680", // OGHAM SPACE MARK
 157     "\u2000", // EN QUAD
 158     "\u2001", // EM QUAD
 159     "\u2002", // EN SPACE
 160     "\u2003", // EM SPACE
 161     "\u2004", // THREE-PER-EM SPACE
 162     "\u2005", // FOUR-PER-EM SPACE
 163     "\u2006", // SIX-PER-EM SPACE
 164     "\u2007", // FIGURE SPACE
 165     "\u2008", // PUNCTUATION SPACE
 166     "\u2009", // THIN SPACE
 167     "\u200A", // HAIR SPACE
 168     "\u2028", // LINE SEPARATOR
 169     "\u2029", // PARAGRAPH SEPARATOR
 170     "\u202F", // NARROW NO-BREAK SPACE
 171     "\u205F", // MEDIUM MATHEMATICAL SPACE
 172     "\u3000", // IDEOGRAPHIC SPACE
 173   ],
 174
 175   // The middle dot is used as a separator for foreign names in Japanese.
 176   MIDDLE_DOT: [
 177     "\u30FB", // KATAKANA MIDDLE DOT
 178     "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
 179   ],
 180
 181   // The Unicode range is based on Wiki:
 182   // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
 183   // https://en.wikipedia.org/wiki/Hangul
 184   // https://en.wikipedia.org/wiki/Japanese_writing_system
 185   CJK_RANGE: [
 186     "\u1100-\u11FF", // Hangul Jamo
 187     "\u3040-\u309F", // Hiragana
 188     "\u30A0-\u30FF", // Katakana
 189     "\u3105-\u312C", // Bopomofo
 190     "\u3130-\u318F", // Hangul Compatibility Jamo
 191     "\u31F0-\u31FF", // Katakana Phonetic Extensions
 192     "\u3200-\u32FF", // Enclosed CJK Letters and Months
 193     "\u3400-\u4DBF", // CJK unified ideographs Extension A
 194     "\u4E00-\u9FFF", // CJK Unified Ideographs
 195     "\uA960-\uA97F", // Hangul Jamo Extended-A
 196     "\uAC00-\uD7AF", // Hangul Syllables
 197     "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
 198     "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
 199   ],
 200
 201   HANGUL_RANGE: [
 202     "\u1100-\u11FF", // Hangul Jamo
 203     "\u3130-\u318F", // Hangul Compatibility Jamo
 204     "\uA960-\uA97F", // Hangul Jamo Extended-A
 205     "\uAC00-\uD7AF", // Hangul Syllables
 206     "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
 207   ],
 208
 209   _dataLoaded: false,
 210
 211   // Returns true if |set| contains |token|, modulo a final period.
 212   _containsString(set, token) {
 213     let target = token.replace(/\.$/, "").toLowerCase();
 214     return set.includes(target);
 215   },
 216
 217   // Removes common name prefixes from |name_tokens|.
 218   _stripPrefixes(nameTokens) {
 219     for (let i in nameTokens) {
 220       if (!this._containsString(this.NAME_PREFIXES, nameTokens[i])) {
 221         return nameTokens.slice(i);
 222       }
 223     }
 224     return [];
 225   },
 226
 227   // Removes common name suffixes from |name_tokens|.
 228   _stripSuffixes(nameTokens) {
 229     for (let i = nameTokens.length - 1; i >= 0; i--) {
 230       if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
 231         return nameTokens.slice(0, i + 1);
 232       }
 233     }
 234     return [];
 235   },
 236
 237   _isCJKName(name) {
 238     // The name is considered to be a CJK name if it is only CJK characters,
 239     // spaces, and "middle dot" separators, with at least one CJK character, and
 240     // no more than 2 words.
 241     //
 242     // Chinese and Japanese names are usually spelled out using the Han
 243     // characters (logographs), which constitute the "CJK Unified Ideographs"
 244     // block in Unicode, also referred to as Unihan. Korean names are usually
 245     // spelled out in the Korean alphabet (Hangul), although they do have a Han
 246     // equivalent as well.
 247
 248     if (!name) {
 249       return false;
 250     }
 251
 252     let previousWasCJK = false;
 253     let wordCount = 0;
 254
 255     for (let c of name) {
 256       let isMiddleDot = this.MIDDLE_DOT.includes(c);
 257       let isCJK = !isMiddleDot && this.reCJK.test(c);
 258       if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
 259         return false;
 260       }
 261       if (isCJK && !previousWasCJK) {
 262         wordCount++;
 263       }
 264       previousWasCJK = isCJK;
 265     }
 266
 267     return wordCount > 0 && wordCount < 3;
 268   },
 269
 270   // Tries to split a Chinese, Japanese, or Korean name into its given name &
 271   // surname parts. If splitting did not work for whatever reason, returns null.
 272   _splitCJKName(nameTokens) {
 273     // The convention for CJK languages is to put the surname (last name) first,
 274     // and the given name (first name) second. In a continuous text, there is
 275     // normally no space between the two parts of the name. When entering their
 276     // name into a field, though, some people add a space to disambiguate. CJK
 277     // names (almost) never have a middle name.
 278
 279     let reHangulName = new RegExp(
 280       "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
 281       "u"
 282     );
 283     let nameParts = {
 284       given: "",
 285       middle: "",
 286       family: "",
 287     };
 288
 289     if (nameTokens.length == 1) {
 290       // There is no space between the surname and given name. Try to infer
 291       // where to separate between the two. Most Chinese and Korean surnames
 292       // have only one character, but there are a few that have 2. If the name
 293       // does not start with a surname from a known list, default to one
 294       // character.
 295       let name = nameTokens[0];
 296       let isKorean = reHangulName.test(name);
 297       let surnameLength = 0;
 298
 299       // 4-character Korean names are more likely to be 2/2 than 1/3, so use
 300       // the full list of Korean 2-char surnames. (instead of only the common
 301       // ones)
 302       let multiCharSurnames =
 303         isKorean && name.length > 3
 304           ? this.KOREAN_MULTI_CHAR_SURNAMES
 305           : this.COMMON_CJK_MULTI_CHAR_SURNAMES;
 306
 307       // Default to 1 character if the surname is not in the list.
 308       surnameLength = multiCharSurnames.some(surname =>
 309         name.startsWith(surname)
 310       )
 311         ? 2
 312         : 1;
 313
 314       nameParts.family = name.substr(0, surnameLength);
 315       nameParts.given = name.substr(surnameLength);
 316     } else if (nameTokens.length == 2) {
 317       // The user entered a space between the two name parts. This makes our job
 318       // easier. Family name first, given name second.
 319       nameParts.family = nameTokens[0];
 320       nameParts.given = nameTokens[1];
 321     } else {
 322       return null;
 323     }
 324
 325     return nameParts;
 326   },
 327
 328   init() {
 329     if (this._dataLoaded) {
 330       return;
 331     }
 332     this._dataLoaded = true;
 333
 334     this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
 335   },
 336
 337   splitName(name) {
 338     let nameParts = {
 339       given: "",
 340       middle: "",
 341       family: "",
 342     };
 343
 344     if (!name) {
 345       return nameParts;
 346     }
 347
 348     let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/);
 349     nameTokens = this._stripPrefixes(nameTokens);
 350
 351     if (this._isCJKName(name)) {
 352       let parts = this._splitCJKName(nameTokens);
 353       if (parts) {
 354         return parts;
 355       }
 356     }
 357
 358     // Don't assume "Ma" is a suffix in John Ma.
 359     if (nameTokens.length > 2) {
 360       nameTokens = this._stripSuffixes(nameTokens);
 361     }
 362
 363     if (!nameTokens.length) {
 364       // Bad things have happened; just assume the whole thing is a given name.
 365       nameParts.given = name;
 366       return nameParts;
 367     }
 368
 369     // Only one token, assume given name.
 370     if (nameTokens.length == 1) {
 371       nameParts.given = nameTokens[0];
 372       return nameParts;
 373     }
 374
 375     // 2 or more tokens. Grab the family, which is the last word plus any
 376     // recognizable family prefixes.
 377     let familyTokens = [nameTokens.pop()];
 378     while (nameTokens.length) {
 379       let lastToken = nameTokens[nameTokens.length - 1];
 380       if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
 381         break;
 382       }
 383       familyTokens.unshift(lastToken);
 384       nameTokens.pop();
 385     }
 386     nameParts.family = familyTokens.join(" ");
 387
 388     // Take the last remaining token as the middle name (if there are at least 2
 389     // tokens).
 390     if (nameTokens.length >= 2) {
 391       nameParts.middle = nameTokens.pop();
 392     }
 393
 394     // Remainder is given name.
 395     nameParts.given = nameTokens.join(" ");
 396
 397     return nameParts;
 398   },
 399
 400   joinNameParts({ given, middle, family }) {
 401     if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
 402       return family + given;
 403     }
 404     return [given, middle, family]
 405       .filter(part => part && part.length)
 406       .join(" ");
 407   },
 408 };
 409
 410 FormAutofillNameUtils.init();