2 YUI 3.17.2 (build 9c3c78e)
3 Copyright 2014 Yahoo! Inc. All rights reserved.
4 Licensed under the BSD License.
5 http://yuilibrary.com/license/
8 YUI.add('text-wordbreak', function (Y, NAME) {
11 * Provides utility methods for splitting strings on word breaks and determining
12 * whether a character index represents a word boundary.
15 * @submodule text-wordbreak
20 * Provides utility methods for splitting strings on word breaks and determining
21 * whether a character index represents a word boundary, using the generic word
22 * breaking algorithm defined in the Unicode Text Segmentation guidelines
23 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard Annex #29</a>).
28 * This algorithm provides a reasonable default for many languages. However, it
29 * does not cover language or context specific requirements, and it does not
30 * provide meaningful results at all for languages that don't use spaces between
31 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
32 * word breaking services usually provide significantly better results.
36 * @class Text.WordBreak
41 WBData = Text.Data.WordBreak,
43 // Constants representing code point classifications.
58 // RegExp objects generated from code point data. Each regex matches a single
59 // character against a set of Unicode code points. The index of each item in
60 // this array must match its corresponding code point constant value defined above.
63 new RegExp(WBData.aletter),
64 new RegExp(WBData.midnumlet),
65 new RegExp(WBData.midletter),
66 new RegExp(WBData.midnum),
67 new RegExp(WBData.numeric),
68 new RegExp(WBData.cr),
69 new RegExp(WBData.lf),
70 new RegExp(WBData.newline),
71 new RegExp(WBData.extend),
72 new RegExp(WBData.format),
73 new RegExp(WBData.katakana),
74 new RegExp(WBData.extendnumlet)
78 PUNCTUATION = new RegExp('^' + WBData.punctuation + '$'),
82 // -- Public Static Methods ------------------------------------------------
85 * Splits the specified string into an array of individual words.
88 * @param {String} string String to split.
89 * @param {Object} options (optional) Options object containing zero or more
90 * of the following properties:
93 * <dt>ignoreCase (Boolean)</dt>
95 * If <code>true</code>, the string will be converted to lowercase
96 * before being split. Default is <code>false</code>.
99 * <dt>includePunctuation (Boolean)</dt>
101 * If <code>true</code>, the returned array will include punctuation
102 * characters. Default is <code>false</code>.
105 * <dt>includeWhitespace (Boolean)</dt>
107 * If <code>true</code>, the returned array will include whitespace
108 * characters. Default is <code>false</code>.
111 * @return {Array} Array of words.
114 getWords: function (string, options) {
116 map = WordBreak._classify(string),
128 if (options.ignoreCase) {
129 string = string.toLowerCase();
132 includePunctuation = options.includePunctuation;
133 includeWhitespace = options.includeWhitespace;
135 // Loop through each character in the classification map and determine
136 // whether it precedes a word boundary, building an array of distinct words as we go.
138 for (; i < len; ++i) {
139 chr = string.charAt(i);
141 // Append this character to the current word.
144 // If there's a word boundary between the current character and the
145 // next character, append the current word to the words array and
146 // start building a new word.
147 if (WordBreak._isWordBoundary(map, i)) {
148 word = word.join(EMPTY_STRING);
151 (includeWhitespace || !WHITESPACE.test(word)) &&
152 (includePunctuation || !PUNCTUATION.test(word))) {
164 * Returns an array containing only unique words from the specified string.
165 * For example, the string <code>'foo bar baz foo'</code> would result in
166 * the array <code>['foo', 'bar', 'baz']</code>.
168 * @method getUniqueWords
169 * @param {String} string String to split.
170 * @param {Object} options (optional) Options (see <code>getWords()</code> for details).
172 * @return {Array} Array of unique words.
175 getUniqueWords: function (string, options) {
176 return Y.Array.unique(WordBreak.getWords(string, options));
181 * Returns <code>true</code> if there is a word boundary between the
182 * specified character index and the next character index (or the end of the string).
187 * Note that there are always word breaks at the beginning and end of a
188 * string, so <code>isWordBoundary('', 0)</code> and
189 * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
192 * @method isWordBoundary
193 * @param {String} string String to test.
194 * @param {Number} index Character index to test within the string.
195 * @return {Boolean} <code>true</code> for a word boundary,
196 * <code>false</code> otherwise.
199 isWordBoundary: function (string, index) {
200 return WordBreak._isWordBoundary(WordBreak._classify(string), index);
203 // -- Protected Static Methods ---------------------------------------------
206 * Returns a character classification map for the specified string.
209 * @param {String} string String to classify.
210 * @return {Array} Classification map.
214 _classify: function (string) {
220 stringLength = string.length,
221 setsLength = SETS.length,
224 for (; i < stringLength; ++i) {
225 chr = string.charAt(i);
228 for (j = 0; j < setsLength; ++j) {
231 if (set && set.test(chr)) {
245 * Returns <code>true</code> if there is a word boundary between the
246 * specified character index and the next character index (or the end of the string).
251 * Note that there are always word breaks at the beginning and end of a
252 * string, so <code>_isWordBoundary('', 0)</code> and
253 * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
256 * @method _isWordBoundary
257 * @param {Array} map Character classification map generated by
258 * <code>_classify</code>.
259 * @param {Number} index Character index to test.
264 _isWordBoundary: function (map, index) {
267 nextType = map[index + 1],
270 if (index < 0 || (index > map.length - 1 && index !== 0)) {
271 Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
275 // WB5. Don't break between most letters.
276 if (type === ALETTER && nextType === ALETTER) {
280 nextNextType = map[index + 2];
282 // WB6. Don't break letters across certain punctuation.
283 if (type === ALETTER &&
284 (nextType === MIDLETTER || nextType === MIDNUMLET) &&
285 nextNextType === ALETTER) {
289 prevType = map[index - 1];
291 // WB7. Don't break letters across certain punctuation.
292 if ((type === MIDLETTER || type === MIDNUMLET) &&
293 nextType === ALETTER &&
294 prevType === ALETTER) {
298 // WB8/WB9/WB10. Don't break inside sequences of digits or digits
299 // adjacent to letters.
300 if ((type === NUMERIC || type === ALETTER) &&
301 (nextType === NUMERIC || nextType === ALETTER)) {
305 // WB11. Don't break inside numeric sequences like "3.2" or "3,456.789" (no break after the separator when digits surround it).
307 if ((type === MIDNUM || type === MIDNUMLET) &&
308 nextType === NUMERIC &&
309 prevType === NUMERIC) {
313 // WB12. Don't break inside numeric sequences like "3.2" or "3,456.789" (no break before the separator when digits surround it).
315 if (type === NUMERIC &&
316 (nextType === MIDNUM || nextType === MIDNUMLET) &&
317 nextNextType === NUMERIC) {
321 // WB4. Ignore format and extend characters.
322 if (type === EXTEND || type === FORMAT ||
323 prevType === EXTEND || prevType === FORMAT ||
324 nextType === EXTEND || nextType === FORMAT) {
328 // WB3. Don't break inside CRLF.
329 if (type === CR && nextType === LF) {
333 // WB3a. Break after newlines (including CR and LF): the current character is a newline.
334 if (type === NEWLINE || type === CR || type === LF) {
338 // WB3b. Break before newlines (including CR and LF): the next character is a newline.
339 if (nextType === NEWLINE || nextType === CR || nextType === LF) {
343 // WB13. Don't break between Katakana characters.
344 if (type === KATAKANA && nextType === KATAKANA) {
348 // WB13a. Don't break between a letter, digit, Katakana character, or extender and a following extender.
349 if (nextType === EXTENDNUMLET &&
350 (type === ALETTER || type === NUMERIC || type === KATAKANA ||
351 type === EXTENDNUMLET)) {
355 // WB13b. Don't break between an extender and a following letter, digit, or Katakana character.
356 if (type === EXTENDNUMLET &&
357 (nextType === ALETTER || nextType === NUMERIC ||
358 nextType === KATAKANA)) {
362 // Break after any character not covered by the rules above.
367 Text.WordBreak = WordBreak;
370 }, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});