base/string_util.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "base/string_util.h"
   6
   7 #include "build/build_config.h"
   8
   9 #include <ctype.h>
  10 #include <errno.h>
  11 #include <math.h>
  12 #include <stdarg.h>
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <string.h>
  16 #include <time.h>
  17 #include <wchar.h>
  18 #include <wctype.h>
  19
  20 #include <algorithm>
  21 #include <vector>
  22
  23 #include "base/basictypes.h"
  24 #include "base/logging.h"
  25 #include "base/memory/singleton.h"
  26 #include "base/strings/utf_string_conversion_utils.h"
  27 #include "base/utf_string_conversions.h"
  28 #include "base/third_party/icu/icu_utf.h"
  29
  30 namespace {
  31
  32 // Force the singleton used by Empty[W]String[16] to be a unique type. This
  33 // prevents other code that might accidentally use Singleton<string> from
  34 // getting our internal one.
  35 struct EmptyStrings {
  36   EmptyStrings() {}
  37   const std::string s;
  38   const std::wstring ws;
  39   const string16 s16;
  40
  41   static EmptyStrings* GetInstance() {
  42     return Singleton<EmptyStrings>::get();
  43   }
  44 };
  45
  46 // Used by ReplaceStringPlaceholders to track the position in the string of
  47 // replaced parameters.
  48 struct ReplacementOffset {
  49   ReplacementOffset(uintptr_t parameter, size_t offset)
  50       : parameter(parameter),
  51         offset(offset) {}
  52
  53   // Index of the parameter.
  54   uintptr_t parameter;
  55
  56   // Starting position in the string.
  57   size_t offset;
  58 };
  59
  60 static bool CompareParameter(const ReplacementOffset& elem1,
  61                              const ReplacementOffset& elem2) {
  62   return elem1.parameter < elem2.parameter;
  63 }
  64
  65 }  // namespace
  66
  67 namespace base {
  68
  69 bool IsWprintfFormatPortable(const wchar_t* format) {
  70   for (const wchar_t* position = format; *position != '\0'; ++position) {
  71     if (*position == '%') {
  72       bool in_specification = true;
  73       bool modifier_l = false;
  74       while (in_specification) {
  75         // Eat up characters until reaching a known specifier.
  76         if (*++position == '\0') {
  77           // The format string ended in the middle of a specification.  Call
  78           // it portable because no unportable specifications were found.  The
  79           // string is equally broken on all platforms.
  80           return true;
  81         }
  82
  83         if (*position == 'l') {
  84           // 'l' is the only thing that can save the 's' and 'c' specifiers.
  85           modifier_l = true;
  86         } else if (((*position == 's' || *position == 'c') && !modifier_l) ||
  87                    *position == 'S' || *position == 'C' || *position == 'F' ||
  88                    *position == 'D' || *position == 'O' || *position == 'U') {
  89           // Not portable.
  90           return false;
  91         }
  92
  93         if (wcschr(L"diouxXeEfgGaAcspn%", *position)) {
  94           // Portable, keep scanning the rest of the format string.
  95           in_specification = false;
  96         }
  97       }
  98     }
  99   }
 100
 101   return true;
 102 }
 103
 104 }  // namespace base
 105
 106
 107 const std::string& EmptyString() {
 108   return EmptyStrings::GetInstance()->s;
 109 }
 110
 111 const std::wstring& EmptyWString() {
 112   return EmptyStrings::GetInstance()->ws;
 113 }
 114
 115 const string16& EmptyString16() {
 116   return EmptyStrings::GetInstance()->s16;
 117 }
 118
 119 #define WHITESPACE_UNICODE \
 120   0x0009, /* <control-0009> to <control-000D> */ \
 121   0x000A,                                        \
 122   0x000B,                                        \
 123   0x000C,                                        \
 124   0x000D,                                        \
 125   0x0020, /* Space */                            \
 126   0x0085, /* <control-0085> */                   \
 127   0x00A0, /* No-Break Space */                   \
 128   0x1680, /* Ogham Space Mark */                 \
 129   0x180E, /* Mongolian Vowel Separator */        \
 130   0x2000, /* En Quad to Hair Space */            \
 131   0x2001,                                        \
 132   0x2002,                                        \
 133   0x2003,                                        \
 134   0x2004,                                        \
 135   0x2005,                                        \
 136   0x2006,                                        \
 137   0x2007,                                        \
 138   0x2008,                                        \
 139   0x2009,                                        \
 140   0x200A,                                        \
 141   0x200C, /* Zero Width Non-Joiner */            \
 142   0x2028, /* Line Separator */                   \
 143   0x2029, /* Paragraph Separator */              \
 144   0x202F, /* Narrow No-Break Space */            \
 145   0x205F, /* Medium Mathematical Space */        \
 146   0x3000, /* Ideographic Space */                \
 147   0
 148
 149 const wchar_t kWhitespaceWide[] = {
 150   WHITESPACE_UNICODE
 151 };
 152 const char16 kWhitespaceUTF16[] = {
 153   WHITESPACE_UNICODE
 154 };
 155 const char kWhitespaceASCII[] = {
 156   0x09,    // <control-0009> to <control-000D>
 157   0x0A,
 158   0x0B,
 159   0x0C,
 160   0x0D,
 161   0x20,    // Space
 162   0
 163 };
 164
 165 const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF";
 166
 167 template<typename STR>
 168 bool ReplaceCharsT(const STR& input,
 169                    const typename STR::value_type replace_chars[],
 170                    const STR& replace_with,
 171                    STR* output) {
 172   bool removed = false;
 173   size_t replace_length = replace_with.length();
 174
 175   *output = input;
 176
 177   size_t found = output->find_first_of(replace_chars);
 178   while (found != STR::npos) {
 179     removed = true;
 180     output->replace(found, 1, replace_with);
 181     found = output->find_first_of(replace_chars, found + replace_length);
 182   }
 183
 184   return removed;
 185 }
 186
 187 bool ReplaceChars(const string16& input,
 188                   const char16 replace_chars[],
 189                   const string16& replace_with,
 190                   string16* output) {
 191   return ReplaceCharsT(input, replace_chars, replace_with, output);
 192 }
 193
 194 bool ReplaceChars(const std::string& input,
 195                   const char replace_chars[],
 196                   const std::string& replace_with,
 197                   std::string* output) {
 198   return ReplaceCharsT(input, replace_chars, replace_with, output);
 199 }
 200
 201 bool RemoveChars(const string16& input,
 202                  const char16 remove_chars[],
 203                  string16* output) {
 204   return ReplaceChars(input, remove_chars, string16(), output);
 205 }
 206
 207 bool RemoveChars(const std::string& input,
 208                  const char remove_chars[],
 209                  std::string* output) {
 210   return ReplaceChars(input, remove_chars, std::string(), output);
 211 }
 212
 213 template<typename STR>
 214 TrimPositions TrimStringT(const STR& input,
 215                           const typename STR::value_type trim_chars[],
 216                           TrimPositions positions,
 217                           STR* output) {
 218   // Find the edges of leading/trailing whitespace as desired.
 219   const typename STR::size_type last_char = input.length() - 1;
 220   const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ?
 221       input.find_first_not_of(trim_chars) : 0;
 222   const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ?
 223       input.find_last_not_of(trim_chars) : last_char;
 224
 225   // When the string was all whitespace, report that we stripped off whitespace
 226   // from whichever position the caller was interested in.  For empty input, we
 227   // stripped no whitespace, but we still need to clear |output|.
 228   if (input.empty() ||
 229       (first_good_char == STR::npos) || (last_good_char == STR::npos)) {
 230     bool input_was_empty = input.empty();  // in case output == &input
 231     output->clear();
 232     return input_was_empty ? TRIM_NONE : positions;
 233   }
 234
 235   // Trim the whitespace.
 236   *output =
 237       input.substr(first_good_char, last_good_char - first_good_char + 1);
 238
 239   // Return where we trimmed from.
 240   return static_cast<TrimPositions>(
 241       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
 242       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
 243 }
 244
 245 bool TrimString(const std::wstring& input,
 246                 const wchar_t trim_chars[],
 247                 std::wstring* output) {
 248   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
 249 }
 250
 251 #if !defined(WCHAR_T_IS_UTF16)
 252 bool TrimString(const string16& input,
 253                 const char16 trim_chars[],
 254                 string16* output) {
 255   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
 256 }
 257 #endif
 258
 259 bool TrimString(const std::string& input,
 260                 const char trim_chars[],
 261                 std::string* output) {
 262   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
 263 }
 264
 265 void TruncateUTF8ToByteSize(const std::string& input,
 266                             const size_t byte_size,
 267                             std::string* output) {
 268   DCHECK(output);
 269   if (byte_size > input.length()) {
 270     *output = input;
 271     return;
 272   }
 273   DCHECK_LE(byte_size, static_cast<uint32>(kint32max));
 274   // Note: This cast is necessary because CBU8_NEXT uses int32s.
 275   int32 truncation_length = static_cast<int32>(byte_size);
 276   int32 char_index = truncation_length - 1;
 277   const char* data = input.data();
 278
 279   // Using CBU8, we will move backwards from the truncation point
 280   // to the beginning of the string looking for a valid UTF8
 281   // character.  Once a full UTF8 character is found, we will
 282   // truncate the string to the end of that character.
 283   while (char_index >= 0) {
 284     int32 prev = char_index;
 285     uint32 code_point = 0;
 286     CBU8_NEXT(data, char_index, truncation_length, code_point);
 287     if (!base::IsValidCharacter(code_point) ||
 288         !base::IsValidCodepoint(code_point)) {
 289       char_index = prev - 1;
 290     } else {
 291       break;
 292     }
 293   }
 294
 295   if (char_index >= 0 )
 296     *output = input.substr(0, char_index);
 297   else
 298     output->clear();
 299 }
 300
 301 TrimPositions TrimWhitespace(const string16& input,
 302                              TrimPositions positions,
 303                              string16* output) {
 304   return TrimStringT(input, kWhitespaceUTF16, positions, output);
 305 }
 306
 307 TrimPositions TrimWhitespaceASCII(const std::string& input,
 308                                   TrimPositions positions,
 309                                   std::string* output) {
 310   return TrimStringT(input, kWhitespaceASCII, positions, output);
 311 }
 312
 313 // This function is only for backward-compatibility.
 314 // To be removed when all callers are updated.
 315 TrimPositions TrimWhitespace(const std::string& input,
 316                              TrimPositions positions,
 317                              std::string* output) {
 318   return TrimWhitespaceASCII(input, positions, output);
 319 }
 320
 321 template<typename STR>
 322 STR CollapseWhitespaceT(const STR& text,
 323                         bool trim_sequences_with_line_breaks) {
 324   STR result;
 325   result.resize(text.size());
 326
 327   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
 328   // will trim any leading whitespace.
 329   bool in_whitespace = true;
 330   bool already_trimmed = true;
 331
 332   int chars_written = 0;
 333   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
 334     if (IsWhitespace(*i)) {
 335       if (!in_whitespace) {
 336         // Reduce all whitespace sequences to a single space.
 337         in_whitespace = true;
 338         result[chars_written++] = L' ';
 339       }
 340       if (trim_sequences_with_line_breaks && !already_trimmed &&
 341           ((*i == '\n') || (*i == '\r'))) {
 342         // Whitespace sequences containing CR or LF are eliminated entirely.
 343         already_trimmed = true;
 344         --chars_written;
 345       }
 346     } else {
 347       // Non-whitespace chracters are copied straight across.
 348       in_whitespace = false;
 349       already_trimmed = false;
 350       result[chars_written++] = *i;
 351     }
 352   }
 353
 354   if (in_whitespace && !already_trimmed) {
 355     // Any trailing whitespace is eliminated.
 356     --chars_written;
 357   }
 358
 359   result.resize(chars_written);
 360   return result;
 361 }
 362
 363 std::wstring CollapseWhitespace(const std::wstring& text,
 364                                 bool trim_sequences_with_line_breaks) {
 365   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 366 }
 367
 368 #if !defined(WCHAR_T_IS_UTF16)
 369 string16 CollapseWhitespace(const string16& text,
 370                             bool trim_sequences_with_line_breaks) {
 371   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 372 }
 373 #endif
 374
 375 std::string CollapseWhitespaceASCII(const std::string& text,
 376                                     bool trim_sequences_with_line_breaks) {
 377   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
 378 }
 379
 380 bool ContainsOnlyWhitespaceASCII(const std::string& str) {
 381   for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) {
 382     if (!IsAsciiWhitespace(*i))
 383       return false;
 384   }
 385   return true;
 386 }
 387
 388 bool ContainsOnlyWhitespace(const string16& str) {
 389   return str.find_first_not_of(kWhitespaceUTF16) == string16::npos;
 390 }
 391
 392 template<typename STR>
 393 static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
 394   for (typename STR::const_iterator iter = input.begin();
 395        iter != input.end(); ++iter) {
 396     if (characters.find(*iter) == STR::npos)
 397       return false;
 398   }
 399   return true;
 400 }
 401
 402 bool ContainsOnlyChars(const std::wstring& input,
 403                        const std::wstring& characters) {
 404   return ContainsOnlyCharsT(input, characters);
 405 }
 406
 407 #if !defined(WCHAR_T_IS_UTF16)
 408 bool ContainsOnlyChars(const string16& input, const string16& characters) {
 409   return ContainsOnlyCharsT(input, characters);
 410 }
 411 #endif
 412
 413 bool ContainsOnlyChars(const std::string& input,
 414                        const std::string& characters) {
 415   return ContainsOnlyCharsT(input, characters);
 416 }
 417
 418 std::string WideToASCII(const std::wstring& wide) {
 419   DCHECK(IsStringASCII(wide)) << wide;
 420   return std::string(wide.begin(), wide.end());
 421 }
 422
 423 std::string UTF16ToASCII(const string16& utf16) {
 424   DCHECK(IsStringASCII(utf16)) << utf16;
 425   return std::string(utf16.begin(), utf16.end());
 426 }
 427
 428 // Latin1 is just the low range of Unicode, so we can copy directly to convert.
 429 bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
 430   std::string output;
 431   output.resize(wide.size());
 432   latin1->clear();
 433   for (size_t i = 0; i < wide.size(); i++) {
 434     if (wide[i] > 255)
 435       return false;
 436     output[i] = static_cast<char>(wide[i]);
 437   }
 438   latin1->swap(output);
 439   return true;
 440 }
 441
 442 template<class STR>
 443 static bool DoIsStringASCII(const STR& str) {
 444   for (size_t i = 0; i < str.length(); i++) {
 445     typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i];
 446     if (c > 0x7F)
 447       return false;
 448   }
 449   return true;
 450 }
 451
 452 bool IsStringASCII(const std::wstring& str) {
 453   return DoIsStringASCII(str);
 454 }
 455
 456 #if !defined(WCHAR_T_IS_UTF16)
 457 bool IsStringASCII(const string16& str) {
 458   return DoIsStringASCII(str);
 459 }
 460 #endif
 461
 462 bool IsStringASCII(const base::StringPiece& str) {
 463   return DoIsStringASCII(str);
 464 }
 465
 466 bool IsStringUTF8(const std::string& str) {
 467   const char *src = str.data();
 468   int32 src_len = static_cast<int32>(str.length());
 469   int32 char_index = 0;
 470
 471   while (char_index < src_len) {
 472     int32 code_point;
 473     CBU8_NEXT(src, char_index, src_len, code_point);
 474     if (!base::IsValidCharacter(code_point))
 475       return false;
 476   }
 477   return true;
 478 }
 479
 480 template<typename Iter>
 481 static inline bool DoLowerCaseEqualsASCII(Iter a_begin,
 482                                           Iter a_end,
 483                                           const char* b) {
 484   for (Iter it = a_begin; it != a_end; ++it, ++b) {
 485     if (!*b || base::ToLowerASCII(*it) != *b)
 486       return false;
 487   }
 488   return *b == 0;
 489 }
 490
 491 // Front-ends for LowerCaseEqualsASCII.
 492 bool LowerCaseEqualsASCII(const std::string& a, const char* b) {
 493   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 494 }
 495
 496 bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) {
 497   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 498 }
 499
 500 #if !defined(WCHAR_T_IS_UTF16)
 501 bool LowerCaseEqualsASCII(const string16& a, const char* b) {
 502   return DoLowerCaseEqualsASCII(a.begin(), a.end(), b);
 503 }
 504 #endif
 505
 506 bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
 507                           std::string::const_iterator a_end,
 508                           const char* b) {
 509   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 510 }
 511
 512 bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
 513                           std::wstring::const_iterator a_end,
 514                           const char* b) {
 515   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 516 }
 517
 518 #if !defined(WCHAR_T_IS_UTF16)
 519 bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
 520                           string16::const_iterator a_end,
 521                           const char* b) {
 522   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 523 }
 524 #endif
 525
 526 // TODO(port): Resolve wchar_t/iterator issues that require OS_ANDROID here.
 527 #if !defined(OS_ANDROID)
 528 bool LowerCaseEqualsASCII(const char* a_begin,
 529                           const char* a_end,
 530                           const char* b) {
 531   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 532 }
 533
 534 bool LowerCaseEqualsASCII(const wchar_t* a_begin,
 535                           const wchar_t* a_end,
 536                           const char* b) {
 537   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 538 }
 539
 540 #if !defined(WCHAR_T_IS_UTF16)
 541 bool LowerCaseEqualsASCII(const char16* a_begin,
 542                           const char16* a_end,
 543                           const char* b) {
 544   return DoLowerCaseEqualsASCII(a_begin, a_end, b);
 545 }
 546 #endif
 547
 548 #endif  // !defined(OS_ANDROID)
 549
 550 bool EqualsASCII(const string16& a, const base::StringPiece& b) {
 551   if (a.length() != b.length())
 552     return false;
 553   return std::equal(b.begin(), b.end(), a.begin());
 554 }
 555
 556 bool StartsWithASCII(const std::string& str,
 557                      const std::string& search,
 558                      bool case_sensitive) {
 559   if (case_sensitive)
 560     return str.compare(0, search.length(), search) == 0;
 561   else
 562     return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0;
 563 }
 564
 565 template <typename STR>
 566 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) {
 567   if (case_sensitive) {
 568     return str.compare(0, search.length(), search) == 0;
 569   } else {
 570     if (search.size() > str.size())
 571       return false;
 572     return std::equal(search.begin(), search.end(), str.begin(),
 573                       base::CaseInsensitiveCompare<typename STR::value_type>());
 574   }
 575 }
 576
 577 bool StartsWith(const std::wstring& str, const std::wstring& search,
 578                 bool case_sensitive) {
 579   return StartsWithT(str, search, case_sensitive);
 580 }
 581
 582 #if !defined(WCHAR_T_IS_UTF16)
 583 bool StartsWith(const string16& str, const string16& search,
 584                 bool case_sensitive) {
 585   return StartsWithT(str, search, case_sensitive);
 586 }
 587 #endif
 588
 589 template <typename STR>
 590 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) {
 591   typename STR::size_type str_length = str.length();
 592   typename STR::size_type search_length = search.length();
 593   if (search_length > str_length)
 594     return false;
 595   if (case_sensitive) {
 596     return str.compare(str_length - search_length, search_length, search) == 0;
 597   } else {
 598     return std::equal(search.begin(), search.end(),
 599                       str.begin() + (str_length - search_length),
 600                       base::CaseInsensitiveCompare<typename STR::value_type>());
 601   }
 602 }
 603
 604 bool EndsWith(const std::string& str, const std::string& search,
 605               bool case_sensitive) {
 606   return EndsWithT(str, search, case_sensitive);
 607 }
 608
 609 bool EndsWith(const std::wstring& str, const std::wstring& search,
 610               bool case_sensitive) {
 611   return EndsWithT(str, search, case_sensitive);
 612 }
 613
 614 #if !defined(WCHAR_T_IS_UTF16)
 615 bool EndsWith(const string16& str, const string16& search,
 616               bool case_sensitive) {
 617   return EndsWithT(str, search, case_sensitive);
 618 }
 619 #endif
 620
 621 static const char* const kByteStringsUnlocalized[] = {
 622   " B",
 623   " kB",
 624   " MB",
 625   " GB",
 626   " TB",
 627   " PB"
 628 };
 629
 630 string16 FormatBytesUnlocalized(int64 bytes) {
 631   double unit_amount = static_cast<double>(bytes);
 632   size_t dimension = 0;
 633   const int kKilo = 1024;
 634   while (unit_amount >= kKilo &&
 635          dimension < arraysize(kByteStringsUnlocalized) - 1) {
 636     unit_amount /= kKilo;
 637     dimension++;
 638   }
 639
 640   char buf[64];
 641   if (bytes != 0 && dimension > 0 && unit_amount < 100) {
 642     base::snprintf(buf, arraysize(buf), "%.1lf%s", unit_amount,
 643                    kByteStringsUnlocalized[dimension]);
 644   } else {
 645     base::snprintf(buf, arraysize(buf), "%.0lf%s", unit_amount,
 646                    kByteStringsUnlocalized[dimension]);
 647   }
 648
 649   return ASCIIToUTF16(buf);
 650 }
 651
 652 template<class StringType>
 653 void DoReplaceSubstringsAfterOffset(StringType* str,
 654                                     typename StringType::size_type start_offset,
 655                                     const StringType& find_this,
 656                                     const StringType& replace_with,
 657                                     bool replace_all) {
 658   if ((start_offset == StringType::npos) || (start_offset >= str->length()))
 659     return;
 660
 661   DCHECK(!find_this.empty());
 662   for (typename StringType::size_type offs(str->find(find_this, start_offset));
 663       offs != StringType::npos; offs = str->find(find_this, offs)) {
 664     str->replace(offs, find_this.length(), replace_with);
 665     offs += replace_with.length();
 666
 667     if (!replace_all)
 668       break;
 669   }
 670 }
 671
 672 void ReplaceFirstSubstringAfterOffset(string16* str,
 673                                       string16::size_type start_offset,
 674                                       const string16& find_this,
 675                                       const string16& replace_with) {
 676   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 677                                  false);  // replace first instance
 678 }
 679
 680 void ReplaceFirstSubstringAfterOffset(std::string* str,
 681                                       std::string::size_type start_offset,
 682                                       const std::string& find_this,
 683                                       const std::string& replace_with) {
 684   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 685                                  false);  // replace first instance
 686 }
 687
 688 void ReplaceSubstringsAfterOffset(string16* str,
 689                                   string16::size_type start_offset,
 690                                   const string16& find_this,
 691                                   const string16& replace_with) {
 692   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 693                                  true);  // replace all instances
 694 }
 695
 696 void ReplaceSubstringsAfterOffset(std::string* str,
 697                                   std::string::size_type start_offset,
 698                                   const std::string& find_this,
 699                                   const std::string& replace_with) {
 700   DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with,
 701                                  true);  // replace all instances
 702 }
 703
 704
 705 template<typename STR>
 706 static size_t TokenizeT(const STR& str,
 707                         const STR& delimiters,
 708                         std::vector<STR>* tokens) {
 709   tokens->clear();
 710
 711   typename STR::size_type start = str.find_first_not_of(delimiters);
 712   while (start != STR::npos) {
 713     typename STR::size_type end = str.find_first_of(delimiters, start + 1);
 714     if (end == STR::npos) {
 715       tokens->push_back(str.substr(start));
 716       break;
 717     } else {
 718       tokens->push_back(str.substr(start, end - start));
 719       start = str.find_first_not_of(delimiters, end + 1);
 720     }
 721   }
 722
 723   return tokens->size();
 724 }
 725
 726 size_t Tokenize(const std::wstring& str,
 727                 const std::wstring& delimiters,
 728                 std::vector<std::wstring>* tokens) {
 729   return TokenizeT(str, delimiters, tokens);
 730 }
 731
 732 #if !defined(WCHAR_T_IS_UTF16)
 733 size_t Tokenize(const string16& str,
 734                 const string16& delimiters,
 735                 std::vector<string16>* tokens) {
 736   return TokenizeT(str, delimiters, tokens);
 737 }
 738 #endif
 739
 740 size_t Tokenize(const std::string& str,
 741                 const std::string& delimiters,
 742                 std::vector<std::string>* tokens) {
 743   return TokenizeT(str, delimiters, tokens);
 744 }
 745
 746 size_t Tokenize(const base::StringPiece& str,
 747                 const base::StringPiece& delimiters,
 748                 std::vector<base::StringPiece>* tokens) {
 749   return TokenizeT(str, delimiters, tokens);
 750 }
 751
 752 template<typename STR>
 753 static STR JoinStringT(const std::vector<STR>& parts, const STR& sep) {
 754   if (parts.empty())
 755     return STR();
 756
 757   STR result(parts[0]);
 758   typename std::vector<STR>::const_iterator iter = parts.begin();
 759   ++iter;
 760
 761   for (; iter != parts.end(); ++iter) {
 762     result += sep;
 763     result += *iter;
 764   }
 765
 766   return result;
 767 }
 768
 769 std::string JoinString(const std::vector<std::string>& parts, char sep) {
 770   return JoinStringT(parts, std::string(1, sep));
 771 }
 772
 773 string16 JoinString(const std::vector<string16>& parts, char16 sep) {
 774   return JoinStringT(parts, string16(1, sep));
 775 }
 776
 777 std::string JoinString(const std::vector<std::string>& parts,
 778                        const std::string& separator) {
 779   return JoinStringT(parts, separator);
 780 }
 781
 782 string16 JoinString(const std::vector<string16>& parts,
 783                     const string16& separator) {
 784   return JoinStringT(parts, separator);
 785 }
 786
 787 template<class FormatStringType, class OutStringType>
 788 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string,
 789     const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) {
 790   size_t substitutions = subst.size();
 791
 792   size_t sub_length = 0;
 793   for (typename std::vector<OutStringType>::const_iterator iter = subst.begin();
 794        iter != subst.end(); ++iter) {
 795     sub_length += iter->length();
 796   }
 797
 798   OutStringType formatted;
 799   formatted.reserve(format_string.length() + sub_length);
 800
 801   std::vector<ReplacementOffset> r_offsets;
 802   for (typename FormatStringType::const_iterator i = format_string.begin();
 803        i != format_string.end(); ++i) {
 804     if ('$' == *i) {
 805       if (i + 1 != format_string.end()) {
 806         ++i;
 807         DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i;
 808         if ('$' == *i) {
 809           while (i != format_string.end() && '$' == *i) {
 810             formatted.push_back('$');
 811             ++i;
 812           }
 813           --i;
 814         } else {
 815           uintptr_t index = 0;
 816           while (i != format_string.end() && '0' <= *i && *i <= '9') {
 817             index *= 10;
 818             index += *i - '0';
 819             ++i;
 820           }
 821           --i;
 822           index -= 1;
 823           if (offsets) {
 824             ReplacementOffset r_offset(index,
 825                 static_cast<int>(formatted.size()));
 826             r_offsets.insert(std::lower_bound(r_offsets.begin(),
 827                                               r_offsets.end(),
 828                                               r_offset,
 829                                               &CompareParameter),
 830                              r_offset);
 831           }
 832           if (index < substitutions)
 833             formatted.append(subst.at(index));
 834         }
 835       }
 836     } else {
 837       formatted.push_back(*i);
 838     }
 839   }
 840   if (offsets) {
 841     for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin();
 842          i != r_offsets.end(); ++i) {
 843       offsets->push_back(i->offset);
 844     }
 845   }
 846   return formatted;
 847 }
 848
 849 string16 ReplaceStringPlaceholders(const string16& format_string,
 850                                    const std::vector<string16>& subst,
 851                                    std::vector<size_t>* offsets) {
 852   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 853 }
 854
 855 std::string ReplaceStringPlaceholders(const base::StringPiece& format_string,
 856                                       const std::vector<std::string>& subst,
 857                                       std::vector<size_t>* offsets) {
 858   return DoReplaceStringPlaceholders(format_string, subst, offsets);
 859 }
 860
 861 string16 ReplaceStringPlaceholders(const string16& format_string,
 862                                    const string16& a,
 863                                    size_t* offset) {
 864   std::vector<size_t> offsets;
 865   std::vector<string16> subst;
 866   subst.push_back(a);
 867   string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets);
 868
 869   DCHECK(offsets.size() == 1);
 870   if (offset) {
 871     *offset = offsets[0];
 872   }
 873   return result;
 874 }
 875
 876 static bool IsWildcard(base_icu::UChar32 character) {
 877   return character == '*' || character == '?';
 878 }
 879
 880 // Move the strings pointers to the point where they start to differ.
 881 template <typename CHAR, typename NEXT>
 882 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
 883                          const CHAR** string, const CHAR* string_end,
 884                          NEXT next) {
 885   const CHAR* escape = NULL;
 886   while (*pattern != pattern_end && *string != string_end) {
 887     if (!escape && IsWildcard(**pattern)) {
 888       // We don't want to match wildcard here, except if it's escaped.
 889       return;
 890     }
 891
 892     // Check if the escapement char is found. If so, skip it and move to the
 893     // next character.
 894     if (!escape && **pattern == '\\') {
 895       escape = *pattern;
 896       next(pattern, pattern_end);
 897       continue;
 898     }
 899
 900     // Check if the chars match, if so, increment the ptrs.
 901     const CHAR* pattern_next = *pattern;
 902     const CHAR* string_next = *string;
 903     base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
 904     if (pattern_char == next(&string_next, string_end) &&
 905         pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
 906       *pattern = pattern_next;
 907       *string = string_next;
 908     } else {
 909       // Uh ho, it did not match, we are done. If the last char was an
 910       // escapement, that means that it was an error to advance the ptr here,
 911       // let's put it back where it was. This also mean that the MatchPattern
 912       // function will return false because if we can't match an escape char
 913       // here, then no one will.
 914       if (escape) {
 915         *pattern = escape;
 916       }
 917       return;
 918     }
 919
 920     escape = NULL;
 921   }
 922 }
 923
 924 template <typename CHAR, typename NEXT>
 925 static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
 926   while (*pattern != end) {
 927     if (!IsWildcard(**pattern))
 928       return;
 929     next(pattern, end);
 930   }
 931 }
 932
 933 template <typename CHAR, typename NEXT>
 934 static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
 935                           const CHAR* pattern, const CHAR* pattern_end,
 936                           int depth,
 937                           NEXT next) {
 938   const int kMaxDepth = 16;
 939   if (depth > kMaxDepth)
 940     return false;
 941
 942   // Eat all the matching chars.
 943   EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
 944
 945   // If the string is empty, then the pattern must be empty too, or contains
 946   // only wildcards.
 947   if (eval == eval_end) {
 948     EatWildcard(&pattern, pattern_end, next);
 949     return pattern == pattern_end;
 950   }
 951
 952   // Pattern is empty but not string, this is not a match.
 953   if (pattern == pattern_end)
 954     return false;
 955
 956   // If this is a question mark, then we need to compare the rest with
 957   // the current string or the string with one character eaten.
 958   const CHAR* next_pattern = pattern;
 959   next(&next_pattern, pattern_end);
 960   if (pattern[0] == '?') {
 961     if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 962                       depth + 1, next))
 963       return true;
 964     const CHAR* next_eval = eval;
 965     next(&next_eval, eval_end);
 966     if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
 967                       depth + 1, next))
 968       return true;
 969   }
 970
 971   // This is a *, try to match all the possible substrings with the remainder
 972   // of the pattern.
 973   if (pattern[0] == '*') {
 974     // Collapse duplicate wild cards (********** into *) so that the
 975     // method does not recurse unnecessarily. http://crbug.com/52839
 976     EatWildcard(&next_pattern, pattern_end, next);
 977
 978     while (eval != eval_end) {
 979       if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
 980                         depth + 1, next))
 981         return true;
 982       eval++;
 983     }
 984
 985     // We reached the end of the string, let see if the pattern contains only
 986     // wildcards.
 987     if (eval == eval_end) {
 988       EatWildcard(&pattern, pattern_end, next);
 989       if (pattern != pattern_end)
 990         return false;
 991       return true;
 992     }
 993   }
 994
 995   return false;
 996 }
 997
 998 struct NextCharUTF8 {
 999   base_icu::UChar32 operator()(const char** p, const char* end) {
1000     base_icu::UChar32 c;
1001     int offset = 0;
1002     CBU8_NEXT(*p, offset, end - *p, c);
1003     *p += offset;
1004     return c;
1005   }
1006 };
1007
1008 struct NextCharUTF16 {
1009   base_icu::UChar32 operator()(const char16** p, const char16* end) {
1010     base_icu::UChar32 c;
1011     int offset = 0;
1012     CBU16_NEXT(*p, offset, end - *p, c);
1013     *p += offset;
1014     return c;
1015   }
1016 };
1017
1018 bool MatchPattern(const base::StringPiece& eval,
1019                   const base::StringPiece& pattern) {
1020   return MatchPatternT(eval.data(), eval.data() + eval.size(),
1021                        pattern.data(), pattern.data() + pattern.size(),
1022                        0, NextCharUTF8());
1023 }
1024
1025 bool MatchPattern(const string16& eval, const string16& pattern) {
1026   return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
1027                        pattern.c_str(), pattern.c_str() + pattern.size(),
1028                        0, NextCharUTF16());
1029 }
1030
1031 // The following code is compatible with the OpenBSD lcpy interface.  See:
1032 //   http://www.gratisoft.us/todd/papers/strlcpy.html
1033 //   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1034
namespace {

// Copies the NUL-terminated string |src| into |dst|, which has room for
// |dst_size| characters including the terminator. Whenever |dst_size| is
// non-zero the result is NUL-terminated, truncating |src| if necessary.
// Returns the length of |src| in characters, so a return value that is
// >= |dst_size| tells the caller that truncation occurred.
template <typename CHAR>
size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
  size_t pos = 0;
  while (pos < dst_size) {
    const CHAR ch = src[pos];
    dst[pos] = ch;
    if (ch == 0)  // The terminating NUL was reached and copied.
      return pos;
    ++pos;
  }

  // |dst| was filled without seeing the end of |src|: one character too many
  // was copied, so overwrite the last slot with the terminator.
  if (dst_size > 0)
    dst[dst_size - 1] = 0;

  // Walk the remainder of |src| so that its full length can be returned.
  size_t src_length = dst_size;
  while (src[src_length] != 0)
    ++src_length;
  return src_length;
}

}  // namespace
1054
1055 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) {
1056   return lcpyT<char>(dst, src, dst_size);
1057 }
1058 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) {
1059   return lcpyT<wchar_t>(dst, src, dst_size);
1060 }