lib/yuilib/3.17.2/text-wordbreak/text-wordbreak-debug.js

   1 /*
   2 YUI 3.17.2 (build 9c3c78e)
   3 Copyright 2014 Yahoo! Inc. All rights reserved.
   4 Licensed under the BSD License.
   5 http://yuilibrary.com/license/
   6 */
   7
   8 YUI.add('text-wordbreak', function (Y, NAME) {
   9
  10 /**
  11  * Provides utility methods for splitting strings on word breaks and determining
  12  * whether a character index represents a word boundary.
  13  *
  14  * @module text
  15  * @submodule text-wordbreak
  16  */
  17
  18 /**
  19  * <p>
  20  * Provides utility methods for splitting strings on word breaks and determining
  21  * whether a character index represents a word boundary, using the generic word
  22  * breaking algorithm defined in the Unicode Text Segmentation guidelines
  23  * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
  24  * Annex #29</a>).
  25  * </p>
  26  *
  27  * <p>
  28  * This algorithm provides a reasonable default for many languages. However, it
  29  * does not cover language or context specific requirements, and it does not
  30  * provide meaningful results at all for languages that don't use spaces between
  31  * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
  32  * word breaking services usually provide significantly better results with
  33  * better performance.
  34  * </p>
  35  *
  36  * @class Text.WordBreak
  37  * @static
  38  */
  39
  40 var Text   = Y.Text,
  41     WBData = Text.Data.WordBreak,
  42
  43 // Constants representing code point classifications.
  44 ALETTER      = 0,
  45 MIDNUMLET    = 1,
  46 MIDLETTER    = 2,
  47 MIDNUM       = 3,
  48 NUMERIC      = 4,
  49 CR           = 5,
  50 LF           = 6,
  51 NEWLINE      = 7,
  52 EXTEND       = 8,
  53 FORMAT       = 9,
  54 KATAKANA     = 10,
  55 EXTENDNUMLET = 11,
  56 OTHER        = 12,
  57
  58 // RegExp objects generated from code point data. Each regex matches a single
  59 // character against a set of Unicode code points. The index of each item in
  60 // this array must match its corresponding code point constant value defined
  61 // above.
  62 SETS = [
  63     new RegExp(WBData.aletter),
  64     new RegExp(WBData.midnumlet),
  65     new RegExp(WBData.midletter),
  66     new RegExp(WBData.midnum),
  67     new RegExp(WBData.numeric),
  68     new RegExp(WBData.cr),
  69     new RegExp(WBData.lf),
  70     new RegExp(WBData.newline),
  71     new RegExp(WBData.extend),
  72     new RegExp(WBData.format),
  73     new RegExp(WBData.katakana),
  74     new RegExp(WBData.extendnumlet)
  75 ],
  76
  77 EMPTY_STRING = '',
  78 PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
  79 WHITESPACE   = /\s/,
  80
  81 WordBreak = {
  82     // -- Public Static Methods ------------------------------------------------
  83
  84     /**
  85      * Splits the specified string into an array of individual words.
  86      *
  87      * @method getWords
  88      * @param {String} string String to split.
  89      * @param {Object} options (optional) Options object containing zero or more
  90      *   of the following properties:
  91      *
  92      * <dl>
  93      *   <dt>ignoreCase (Boolean)</dt>
  94      *   <dd>
  95      *     If <code>true</code>, the string will be converted to lowercase
  96      *     before being split. Default is <code>false</code>.
  97      *   </dd>
  98      *
  99      *   <dt>includePunctuation (Boolean)</dt>
 100      *   <dd>
 101      *     If <code>true</code>, the returned array will include punctuation
 102      *     characters. Default is <code>false</code>.
 103      *   </dd>
 104      *
 105      *   <dt>includeWhitespace (Boolean)</dt>
 106      *   <dd>
 107      *     If <code>true</code>, the returned array will include whitespace
 108      *     characters. Default is <code>false</code>.
 109      *   </dd>
 110      * </dl>
 111      * @return {Array} Array of words.
 112      * @static
 113      */
 114     getWords: function (string, options) {
 115         var i     = 0,
 116             map   = WordBreak._classify(string),
 117             len   = map.length,
 118             word  = [],
 119             words = [],
 120             chr,
 121             includePunctuation,
 122             includeWhitespace;
 123
 124         if (!options) {
 125             options = {};
 126         }
 127
 128         if (options.ignoreCase) {
 129             string = string.toLowerCase();
 130         }
 131
 132         includePunctuation = options.includePunctuation;
 133         includeWhitespace  = options.includeWhitespace;
 134
 135         // Loop through each character in the classification map and determine
 136         // whether it precedes a word boundary, building an array of distinct
 137         // words as we go.
 138         for (; i < len; ++i) {
 139             chr = string.charAt(i);
 140
 141             // Append this character to the current word.
 142             word.push(chr);
 143
 144             // If there's a word boundary between the current character and the
 145             // next character, append the current word to the words array and
 146             // start building a new word.
 147             if (WordBreak._isWordBoundary(map, i)) {
 148                 word = word.join(EMPTY_STRING);
 149
 150                 if (word &&
 151                         (includeWhitespace  || !WHITESPACE.test(word)) &&
 152                         (includePunctuation || !PUNCTUATION.test(word))) {
 153                     words.push(word);
 154                 }
 155
 156                 word = [];
 157             }
 158         }
 159
 160         return words;
 161     },
 162
 163     /**
 164      * Returns an array containing only unique words from the specified string.
 165      * For example, the string <code>'foo bar baz foo'</code> would result in
 166      * the array <code>['foo', 'bar', 'baz']</code>.
 167      *
 168      * @method getUniqueWords
 169      * @param {String} string String to split.
 170      * @param {Object} options (optional) Options (see <code>getWords()</code>
 171      *   for details).
 172      * @return {Array} Array of unique words.
 173      * @static
 174      */
 175     getUniqueWords: function (string, options) {
 176         return Y.Array.unique(WordBreak.getWords(string, options));
 177     },
 178
 179     /**
 180      * <p>
 181      * Returns <code>true</code> if there is a word boundary between the
 182      * specified character index and the next character index (or the end of the
 183      * string).
 184      * </p>
 185      *
 186      * <p>
 187      * Note that there are always word breaks at the beginning and end of a
 188      * string, so <code>isWordBoundary('', 0)</code> and
 189      * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
 190      * </p>
 191      *
 192      * @method isWordBoundary
 193      * @param {String} string String to test.
 194      * @param {Number} index Character index to test within the string.
 195      * @return {Boolean} <code>true</code> for a word boundary,
 196      *   <code>false</code> otherwise.
 197      * @static
 198      */
 199     isWordBoundary: function (string, index) {
 200         return WordBreak._isWordBoundary(WordBreak._classify(string), index);
 201     },
 202
 203     // -- Protected Static Methods ---------------------------------------------
 204
 205     /**
 206      * Returns a character classification map for the specified string.
 207      *
 208      * @method _classify
 209      * @param {String} string String to classify.
 210      * @return {Array} Classification map.
 211      * @protected
 212      * @static
 213      */
 214     _classify: function (string) {
 215         var chr,
 216             map          = [],
 217             i            = 0,
 218             j,
 219             set,
 220             stringLength = string.length,
 221             setsLength   = SETS.length,
 222             type;
 223
 224         for (; i < stringLength; ++i) {
 225             chr  = string.charAt(i);
 226             type = OTHER;
 227
 228             for (j = 0; j < setsLength; ++j) {
 229                 set = SETS[j];
 230
 231                 if (set && set.test(chr)) {
 232                     type = j;
 233                     break;
 234                 }
 235             }
 236
 237             map.push(type);
 238         }
 239
 240         return map;
 241     },
 242
 243     /**
 244      * <p>
 245      * Returns <code>true</code> if there is a word boundary between the
 246      * specified character index and the next character index (or the end of the
 247      * string).
 248      * </p>
 249      *
 250      * <p>
 251      * Note that there are always word breaks at the beginning and end of a
 252      * string, so <code>_isWordBoundary('', 0)</code> and
 253      * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
 254      * </p>
 255      *
 256      * @method _isWordBoundary
 257      * @param {Array} map Character classification map generated by
 258      *   <code>_classify</code>.
 259      * @param {Number} index Character index to test.
 260      * @return {Boolean}
 261      * @protected
 262      * @static
 263      */
 264     _isWordBoundary: function (map, index) {
 265         var prevType,
 266             type     = map[index],
 267             nextType = map[index + 1],
 268             nextNextType;
 269
 270         if (index < 0 || (index > map.length - 1 && index !== 0)) {
 271             Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
 272             return false;
 273         }
 274
 275         // WB5. Don't break between most letters.
 276         if (type === ALETTER && nextType === ALETTER) {
 277             return false;
 278         }
 279
 280         nextNextType = map[index + 2];
 281
 282         // WB6. Don't break letters across certain punctuation.
 283         if (type === ALETTER &&
 284                 (nextType === MIDLETTER || nextType === MIDNUMLET) &&
 285                 nextNextType === ALETTER) {
 286             return false;
 287         }
 288
 289         prevType = map[index - 1];
 290
 291         // WB7. Don't break letters across certain punctuation.
 292         if ((type === MIDLETTER || type === MIDNUMLET) &&
 293                 nextType === ALETTER &&
 294                 prevType === ALETTER) {
 295             return false;
 296         }
 297
 298         // WB8/WB9/WB10. Don't break inside sequences of digits or digits
 299         // adjacent to letters.
 300         if ((type === NUMERIC || type === ALETTER) &&
 301                 (nextType === NUMERIC || nextType === ALETTER)) {
 302             return false;
 303         }
 304
 305         // WB11. Don't break inside numeric sequences like "3.2" or
 306         // "3,456.789".
 307         if ((type === MIDNUM || type === MIDNUMLET) &&
 308                 nextType === NUMERIC &&
 309                 prevType === NUMERIC) {
 310             return false;
 311         }
 312
 313         // WB12. Don't break inside numeric sequences like "3.2" or
 314         // "3,456.789".
 315         if (type === NUMERIC &&
 316                 (nextType === MIDNUM || nextType === MIDNUMLET) &&
 317                 nextNextType === NUMERIC) {
 318             return false;
 319         }
 320
 321         // WB4. Ignore format and extend characters.
 322         if (type === EXTEND || type === FORMAT ||
 323                 prevType === EXTEND || prevType === FORMAT ||
 324                 nextType === EXTEND || nextType === FORMAT) {
 325             return false;
 326         }
 327
 328         // WB3. Don't break inside CRLF.
 329         if (type === CR && nextType === LF) {
 330             return false;
 331         }
 332
 333         // WB3a. Break before newlines (including CR and LF).
 334         if (type === NEWLINE || type === CR || type === LF) {
 335             return true;
 336         }
 337
 338         // WB3b. Break after newlines (including CR and LF).
 339         if (nextType === NEWLINE || nextType === CR || nextType === LF) {
 340             return true;
 341         }
 342
 343         // WB13. Don't break between Katakana characters.
 344         if (type === KATAKANA && nextType === KATAKANA) {
 345             return false;
 346         }
 347
 348         // WB13a. Don't break from extenders.
 349         if (nextType === EXTENDNUMLET &&
 350                 (type === ALETTER || type === NUMERIC || type === KATAKANA ||
 351                 type === EXTENDNUMLET)) {
 352             return false;
 353         }
 354
 355         // WB13b. Don't break from extenders.
 356         if (type === EXTENDNUMLET &&
 357                 (nextType === ALETTER || nextType === NUMERIC ||
 358                 nextType === KATAKANA)) {
 359             return false;
 360         }
 361
 362         // Break after any character not covered by the rules above.
 363         return true;
 364     }
 365 };
 366
 367 Text.WordBreak = WordBreak;
 368
 369
 370 }, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});