1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Lists "FormAutofillNameUtils" as the single exported symbol.
// NOTE(review): presumably consumed by the module loader that imports this
// file — confirm against the importing side.
7 var EXPORTED_SYMBOLS = ["FormAutofillNameUtils"];
9 // FormAutofillNameUtils was initially translated from
10 // https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
11 var FormAutofillNameUtils = {
// Tokens that are folded into the family name when they sit directly before
// the last word of a full name. Lookups go through _containsString, which
// lowercases the token and ignores one trailing period, so entries only match
// if they are stored lowercase and without that period.
72 FAMILY_NAME_PREFIXES: [
88 // The common and non-ambiguous CJK surnames (last names) that have more than
// one character (continuation inferred from the constant's name — TODO confirm).
// _splitCJKName uses this list to decide where the surname ends in an unspaced
// name, except for all-Hangul names longer than three UTF-16 units, which use
// KOREAN_MULTI_CHAR_SURNAMES instead.
90 COMMON_CJK_MULTI_CHAR_SURNAMES: [
91 // Korean, taken from the list of surnames:
92 // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
102 // Chinese, taken from the top 10 Chinese 2-character surnames:
103 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
104 // Simplified Chinese (mostly mainland China)
115 // Traditional Chinese (mostly Taiwan)
126 // All Korean surnames that have more than one character, even the
127 // rare/ambiguous ones.
// _splitCJKName consults this list (instead of the common one) only when the
// unspaced name consists solely of Hangul/whitespace characters and is longer
// than three UTF-16 units.
128 KOREAN_MULTI_CHAR_SURNAMES: [
// Whitespace characters, one single-character string per entry.
// NOTE(review): the property's declaration line is not visible in this
// excerpt; this is presumably the list read as this.WHITESPACE, which is used
// for per-character membership checks and joined into a regex character class
// in _splitCJKName — confirm.
145 // The whitespace definition based on
146 // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
148 "\u0009", // CHARACTER TABULATION
149 "\u000A", // LINE FEED (LF)
150 "\u000B", // LINE TABULATION
151 "\u000C", // FORM FEED (FF)
152 "\u000D", // CARRIAGE RETURN (CR)
154 "\u0085", // NEXT LINE (NEL)
155 "\u00A0", // NO-BREAK SPACE
156 "\u1680", // OGHAM SPACE MARK
159 "\u2002", // EN SPACE
160 "\u2003", // EM SPACE
161 "\u2004", // THREE-PER-EM SPACE
162 "\u2005", // FOUR-PER-EM SPACE
163 "\u2006", // SIX-PER-EM SPACE
164 "\u2007", // FIGURE SPACE
165 "\u2008", // PUNCTUATION SPACE
166 "\u2009", // THIN SPACE
167 "\u200A", // HAIR SPACE
168 "\u2028", // LINE SEPARATOR
169 "\u2029", // PARAGRAPH SEPARATOR
170 "\u202F", // NARROW NO-BREAK SPACE
171 "\u205F", // MEDIUM MATHEMATICAL SPACE
172 "\u3000", // IDEOGRAPHIC SPACE
175 // The middle dot is used as a separator for foreign names in Japanese.
// Both characters below are also accepted as token separators by the
// name-splitting regex (/[ ,\u3000\u30FB\u00B7]+/).
// NOTE(review): the declaration line is not visible here; presumably this is
// the list read as this.MIDDLE_DOT — confirm.
177 "\u30FB", // KATAKANA MIDDLE DOT
178 "\u00B7", // MIDDLE DOT; a (common?) typo for "KATAKANA MIDDLE DOT"
// Character-class fragments ("first-last" pairs). They are concatenated and
// wrapped in [...] to build this.reCJK (compiled with the "u" flag), which is
// tested against one character at a time.
// NOTE(review): the declaration line is not visible here; presumably this is
// the list read as this.CJK_RANGE — confirm.
// The Katakana range below contains U+30FB (KATAKANA MIDDLE DOT); the CJK-name
// check excludes middle dots explicitly before testing reCJK.
181 // The Unicode range is based on Wiki:
182 // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
183 // https://en.wikipedia.org/wiki/Hangul
184 // https://en.wikipedia.org/wiki/Japanese_writing_system
186 "\u1100-\u11FF", // Hangul Jamo
187 "\u3040-\u309F", // Hiragana
188 "\u30A0-\u30FF", // Katakana
189 "\u3105-\u312C", // Bopomofo
190 "\u3130-\u318F", // Hangul Compatibility Jamo
191 "\u31F0-\u31FF", // Katakana Phonetic Extensions
192 "\u3200-\u32FF", // Enclosed CJK Letters and Months
193 "\u3400-\u4DBF", // CJK Unified Ideographs Extension A
194 "\u4E00-\u9FFF", // CJK Unified Ideographs
195 "\uA960-\uA97F", // Hangul Jamo Extended-A
196 "\uAC00-\uD7AF", // Hangul Syllables
197 "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
198 "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
// Hangul-only character-class fragments; each one also appears in the CJK
// range list. _splitCJKName joins them with the whitespace list into an
// anchored regex to decide whether a name is written entirely in Hangul.
// NOTE(review): the declaration line is not visible here; presumably this is
// the list read as this.HANGUL_RANGE — confirm.
202 "\u1100-\u11FF", // Hangul Jamo
203 "\u3130-\u318F", // Hangul Compatibility Jamo
204 "\uA960-\uA97F", // Hangul Jamo Extended-A
205 "\uAC00-\uD7AF", // Hangul Syllables
206 "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
211 // Returns true if |set| contains |token|, modulo a final period.
//
// |set|   - collection searched with .includes(); callers in this file pass
//           the NAME_PREFIXES / NAME_SUFFIXES / FAMILY_NAME_PREFIXES lists.
// |token| - string to look up; it is lowercased before the lookup, so an
//           entry of |set| can only match if it is itself lowercase.
212 _containsString(set, token) {
// Strip at most one trailing "." and lowercase, e.g. "Jr." -> "jr".
213 let target = token.replace(/\.$/, "").toLowerCase();
214 return set.includes(target);
217 // Removes common name prefixes from |nameTokens|.
//
// Scans from the front and returns the tokens starting at the first one that
// is not in NAME_PREFIXES; |nameTokens| itself is not modified (slice copies).
// What is returned when every token is a prefix is not visible in this excerpt.
218 _stripPrefixes(nameTokens) {
// NOTE(review): for...in yields the indices as strings; slice() coerces the
// string back to a number, so this works, but an index loop (as in
// _stripSuffixes) would be the more conventional form.
219 for (let i in nameTokens) {
220 if (!this._containsString(this.NAME_PREFIXES, nameTokens[i])) {
221 return nameTokens.slice(i);
227 // Removes common name suffixes from |nameTokens|.
//
// Scans from the back and returns the tokens up to and including the last one
// that is not in NAME_SUFFIXES; |nameTokens| itself is not modified.
// What is returned when every token is a suffix is not visible in this excerpt.
228 _stripSuffixes(nameTokens) {
229 for (let i = nameTokens.length - 1; i >= 0; i--) {
230 if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
// i is the last non-suffix token, so keep indices 0..i inclusive.
231 return nameTokens.slice(0, i + 1);
// NOTE(review): the method header is not visible in this excerpt; this is
// presumably the body of the predicate called elsewhere as
// this._isCJKName(name) — confirm.
238 // The name is considered to be a CJK name if it is only CJK characters,
239 // spaces, and "middle dot" separators, with at least one CJK character, and
240 // no more than 2 words.
242 // Chinese and Japanese names are usually spelled out using the Han
243 // characters (logographs), which constitute the "CJK Unified Ideographs"
244 // block in Unicode, also referred to as Unihan. Korean names are usually
245 // spelled out in the Korean alphabet (Hangul), although they do have a Han
246 // equivalent as well.
// Tracks whether the previous character was CJK, so that the start of each run
// of CJK characters can be detected.
252 let previousWasCJK = false;
// for...of walks the string by code point rather than by UTF-16 unit.
255 for (let c of name) {
256 let isMiddleDot = this.MIDDLE_DOT.includes(c);
// Middle dots are checked first and never count as CJK, even though U+30FB
// lies inside the Katakana range that reCJK matches.
257 let isCJK = !isMiddleDot && this.reCJK.test(c);
// Any character that is neither CJK, a middle dot, nor whitespace lands here;
// the handling itself is not visible in this excerpt.
258 if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
// A CJK character following a non-CJK one marks the start of a new word
// (the body of this branch is not visible in this excerpt).
261 if (isCJK && !previousWasCJK) {
264 previousWasCJK = isCJK;
// Accept exactly one or two words.
267 return wordCount > 0 && wordCount < 3;
270 // Tries to split a Chinese, Japanese, or Korean name into its given name &
271 // surname parts. If splitting did not work for whatever reason, returns null.
//
// |nameTokens| - the already-tokenized name; only the one-token and two-token
//                cases are handled in the code visible here.
// NOTE(review): the declaration of nameParts and the return statements are not
// visible in this excerpt.
272 _splitCJKName(nameTokens) {
273 // The convention for CJK languages is to put the surname (last name) first,
274 // and the given name (first name) second. In a continuous text, there is
275 // normally no space between the two parts of the name. When entering their
276 // name into a field, though, some people add a space to disambiguate. CJK
277 // names (almost) never have a middle name.
// Anchored regex: matches only strings made up entirely of Hangul-range and
// whitespace characters (the flags argument, if any, is not visible here).
279 let reHangulName = new RegExp(
280 "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
// NOTE(review): loose equality on .length here and below; === would be the
// conventional choice and behaves the same for numbers.
289 if (nameTokens.length == 1) {
290 // There is no space between the surname and given name. Try to infer
291 // where to separate between the two. Most Chinese and Korean surnames
292 // have only one character, but there are a few that have 2. If the name
293 // does not start with a surname from a known list, default to one
// character (continuation inferred from the "Default to 1 character" comment
// below).
295 let name = nameTokens[0];
296 let isKorean = reHangulName.test(name);
297 let surnameLength = 0;
299 // 4-character Korean names are more likely to be 2/2 than 1/3, so use
300 // the full list of Korean 2-char surnames. (instead of only the common
// name.length counts UTF-16 code units.
302 let multiCharSurnames =
303 isKorean && name.length > 3
304 ? this.KOREAN_MULTI_CHAR_SURNAMES
305 : this.COMMON_CJK_MULTI_CHAR_SURNAMES;
307 // Default to 1 character if the surname is not in the list.
// The lines that turn the some() result into a length are not visible here.
308 surnameLength = multiCharSurnames.some(surname =>
309 name.startsWith(surname)
// NOTE(review): String.prototype.substr is a legacy method; with one or two
// non-negative arguments starting at 0 / to the end it matches slice().
314 nameParts.family = name.substr(0, surnameLength);
315 nameParts.given = name.substr(surnameLength);
316 } else if (nameTokens.length == 2) {
317 // The user entered a space between the two name parts. This makes our job
318 // easier. Family name first, given name second.
319 nameParts.family = nameTokens[0];
320 nameParts.given = nameTokens[1];
// NOTE(review): the method header is not visible in this excerpt; presumably
// this is the body of init(), which is invoked once at the bottom of the
// file — confirm.
// _dataLoaded acts as a run-once flag (the body of this guard is not visible
// here).
329 if (this._dataLoaded) {
332 this._dataLoaded = true;
// Build the single-character CJK matcher from the range fragments; the "u"
// flag makes the pattern operate on code points.
334 this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
// NOTE(review): the header of this method and the declaration of nameParts are
// not visible in this excerpt. The visible code splits the full-name string
// |name| into nameParts.given / .middle / .family.
// Tokenize on runs of ASCII space, comma, ideographic space (U+3000),
// katakana middle dot (U+30FB) and middle dot (U+00B7).
348 let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/);
349 nameTokens = this._stripPrefixes(nameTokens);
// The CJK test runs on the raw |name|, not on the tokens; what happens with
// |parts| afterwards is not visible here.
351 if (this._isCJKName(name)) {
352 let parts = this._splitCJKName(nameTokens);
358 // Don't assume "Ma" is a suffix in John Ma.
359 if (nameTokens.length > 2) {
360 nameTokens = this._stripSuffixes(nameTokens);
363 if (!nameTokens.length) {
364 // Bad things have happened; just assume the whole thing is a given name.
365 nameParts.given = name;
369 // Only one token, assume given name.
370 if (nameTokens.length == 1) {
371 nameParts.given = nameTokens[0];
375 // 2 or more tokens. Grab the family, which is the last word plus any
376 // recognizable family prefixes.
// pop() removes the last word from nameTokens, so the loop below inspects the
// tokens that preceded it, working backwards.
377 let familyTokens = [nameTokens.pop()];
378 while (nameTokens.length) {
379 let lastToken = nameTokens[nameTokens.length - 1];
// The body of this check (the loop exit) and the removal of lastToken from
// nameTokens are not visible in this excerpt.
380 if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
383 familyTokens.unshift(lastToken);
386 nameParts.family = familyTokens.join(" ");
388 // Take the last remaining token as the middle name (if there are at least 2
// tokens left — see the length check below).
390 if (nameTokens.length >= 2) {
391 nameParts.middle = nameTokens.pop();
394 // Remainder is given name.
395 nameParts.given = nameTokens.join(" ");
// Builds a full-name string from its parts.
//
// Takes a single object argument and reads its |given|, |middle| and |family|
// properties.
400 joinNameParts({ given, middle, family }) {
// When both given and family pass the CJK-name check and there is no middle
// name, the result is family followed by given with no separator.
401 if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
402 return family + given;
// Otherwise keep the parts in given, middle, family order and drop any that
// are missing or empty; how the remaining parts are combined is not visible
// in this excerpt.
404 return [given, middle, family]
405 .filter(part => part && part.length)
// Runs init() as soon as the module is evaluated.
// NOTE(review): presumably so that this.reCJK exists before the first
// CJK-name check — confirm against the init() body.
410 FormAutofillNameUtils.init();