WebKit roll 98705:98715
[chromium-blink-merge.git] / base / string_util.h
blob740124fb84973f2b29652c67892dbd4d19e0c84b
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This file defines utility functions for working with strings.
7 #ifndef BASE_STRING_UTIL_H_
8 #define BASE_STRING_UTIL_H_
9 #pragma once
11 #include <ctype.h>
12 #include <stdarg.h> // va_list
14 #include <string>
15 #include <vector>
17 #include "base/base_export.h"
18 #include "base/basictypes.h"
19 #include "base/compiler_specific.h"
20 #include "base/string16.h"
21 #include "base/string_piece.h" // For implicit conversions.
23 // Safe standard library wrappers for all platforms.
25 namespace base {
// C standard-library functions like "strncasecmp" and "snprintf" that aren't
// cross-platform are provided as "base::strncasecmp", and their prototypes
// are listed below.  These functions are then implemented as inline calls
// to the platform-specific equivalents in the platform-specific headers.

// Compares the two strings s1 and s2 without regard to case using
// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
// s2 > s1 according to a lexicographic comparison.
int strcasecmp(const char* s1, const char* s2);
// Compares up to count characters of s1 and s2 without regard to case using
// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if
// s2 > s1 according to a lexicographic comparison.
int strncasecmp(const char* s1, const char* s2, size_t count);
42 // Same as strncmp but for char16 strings.
43 int strncmp16(const char16* s1, const char16* s2, size_t count);
45 // Wrapper for vsnprintf that always null-terminates and always returns the
46 // number of characters that would be in an untruncated formatted
47 // string, even when truncation occurs.
48 int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments)
49 PRINTF_FORMAT(3, 0);
51 // vswprintf always null-terminates, but when truncation occurs, it will either
52 // return -1 or the number of characters that would be in an untruncated
53 // formatted string. The actual return value depends on the underlying
54 // C library's vswprintf implementation.
55 int vswprintf(wchar_t* buffer, size_t size,
56 const wchar_t* format, va_list arguments)
57 WPRINTF_FORMAT(3, 0);
59 // Some of these implementations need to be inlined.
61 // We separate the declaration from the implementation of this inline
62 // function just so the PRINTF_FORMAT works.
63 inline int snprintf(char* buffer, size_t size, const char* format, ...)
64 PRINTF_FORMAT(3, 4);
65 inline int snprintf(char* buffer, size_t size, const char* format, ...) {
66 va_list arguments;
67 va_start(arguments, format);
68 int result = vsnprintf(buffer, size, format, arguments);
69 va_end(arguments);
70 return result;
73 // We separate the declaration from the implementation of this inline
74 // function just so the WPRINTF_FORMAT works.
75 inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...)
76 WPRINTF_FORMAT(3, 4);
77 inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) {
78 va_list arguments;
79 va_start(arguments, format);
80 int result = vswprintf(buffer, size, format, arguments);
81 va_end(arguments);
82 return result;
85 // BSD-style safe and consistent string copy functions.
86 // Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
87 // Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
88 // long as |dst_size| is not 0. Returns the length of |src| in characters.
89 // If the return value is >= dst_size, then the output was truncated.
90 // NOTE: All sizes are in number of characters, NOT in bytes.
91 BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size);
92 BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size);
94 // Scan a wprintf format string to determine whether it's portable across a
95 // variety of systems. This function only checks that the conversion
96 // specifiers used by the format string are supported and have the same meaning
97 // on a variety of systems. It doesn't check for other errors that might occur
98 // within a format string.
100 // Nonportable conversion specifiers for wprintf are:
101 // - 's' and 'c' without an 'l' length modifier. %s and %c operate on char
102 // data on all systems except Windows, which treat them as wchar_t data.
103 // Use %ls and %lc for wchar_t data instead.
104 // - 'S' and 'C', which operate on wchar_t data on all systems except Windows,
105 // which treat them as char data. Use %ls and %lc for wchar_t data
106 // instead.
107 // - 'F', which is not identified by Windows wprintf documentation.
108 // - 'D', 'O', and 'U', which are deprecated and not available on all systems.
109 // Use %ld, %lo, and %lu instead.
111 // Note that there is no portable conversion specifier for char data when
112 // working with wprintf.
114 // This function is intended to be called from base::vswprintf.
115 BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format);
// ASCII-specific tolower.  The standard library's tolower is locale sensitive,
// so we don't want to use it here.  Only 'A'..'Z' are mapped (to 'a'..'z');
// every other value is returned unchanged.
template <class Char> inline Char ToLowerASCII(Char c) {
  return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
}
// ASCII-specific toupper.  The standard library's toupper is locale sensitive,
// so we don't want to use it here.  Only 'a'..'z' are mapped (to 'A'..'Z');
// every other value is returned unchanged.
template <class Char> inline Char ToUpperASCII(Char c) {
  return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c;
}
// Function objects to aid in comparing/searching strings.

// Equality predicate: two characters compare equal when the C library's
// tolower() yields the same value for both.
template<typename Char> struct CaseInsensitiveCompare {
 public:
  bool operator()(Char x, Char y) const {
    // TODO(darin): Do we really want to do locale sensitive comparisons here?
    // See http://crbug.com/24917
    return tolower(x) == tolower(y);
  }
};
// Equality predicate: two characters compare equal when ToLowerASCII() yields
// the same value for both, so only the ASCII letters are case-folded.
template<typename Char> struct CaseInsensitiveCompareASCII {
 public:
  bool operator()(Char x, Char y) const {
    return ToLowerASCII(x) == ToLowerASCII(y);
  }
};
147 } // namespace base
149 #if defined(OS_WIN)
150 #include "base/string_util_win.h"
151 #elif defined(OS_POSIX)
152 #include "base/string_util_posix.h"
153 #else
154 #error Define string operations appropriately for your platform
155 #endif
157 // These threadsafe functions return references to globally unique empty
158 // strings.
160 // DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS.
161 // There is only one case where you should use these: functions which need to
162 // return a string by reference (e.g. as a class member accessor), and don't
163 // have an empty string to use (e.g. in an error case). These should not be
164 // used as initializers, function arguments, or return values for functions
165 // which return by value or outparam.
166 BASE_EXPORT const std::string& EmptyString();
167 BASE_EXPORT const std::wstring& EmptyWString();
168 BASE_EXPORT const string16& EmptyString16();
170 BASE_EXPORT extern const wchar_t kWhitespaceWide[];
171 BASE_EXPORT extern const char16 kWhitespaceUTF16[];
172 BASE_EXPORT extern const char kWhitespaceASCII[];
174 BASE_EXPORT extern const char kUtf8ByteOrderMark[];
176 // Removes characters in |remove_chars| from anywhere in input. Returns true if
177 // any characters were removed. |remove_chars| must be null-terminated.
178 // NOTE: Safe to use the same variable for both input and output.
179 BASE_EXPORT bool RemoveChars(const string16& input,
180 const char16 remove_chars[],
181 string16* output);
182 BASE_EXPORT bool RemoveChars(const std::string& input,
183 const char remove_chars[],
184 std::string* output);
186 // Removes characters in |trim_chars| from the beginning and end of input.
187 // |trim_chars| must be null-terminated.
188 // NOTE: Safe to use the same variable for both input and output.
189 BASE_EXPORT bool TrimString(const std::wstring& input,
190 const wchar_t trim_chars[],
191 std::wstring* output);
192 BASE_EXPORT bool TrimString(const string16& input,
193 const char16 trim_chars[],
194 string16* output);
195 BASE_EXPORT bool TrimString(const std::string& input,
196 const char trim_chars[],
197 std::string* output);
199 // Truncates a string to the nearest UTF-8 character that will leave
200 // the string less than or equal to the specified byte size.
201 BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input,
202 const size_t byte_size,
203 std::string* output);
// Trims any whitespace from either end of the input string.  Returns where
// whitespace was found.
// For non-wide (std::string) input, use TrimWhitespaceASCII().  It is meant
// for ASCII strings and only looks for ASCII whitespace.
// NOTE: Safe to use the same variable for both input and output.
// Bit flags naming the ends of a string.  TRIM_ALL is the union of
// TRIM_LEADING and TRIM_TRAILING; TRIM_NONE has no bits set.
enum TrimPositions {
  TRIM_NONE     = 0,
  TRIM_LEADING  = 1 << 0,
  TRIM_TRAILING = 1 << 1,
  TRIM_ALL      = TRIM_LEADING | TRIM_TRAILING,
};
218 BASE_EXPORT TrimPositions TrimWhitespace(const string16& input,
219 TrimPositions positions,
220 string16* output);
221 BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input,
222 TrimPositions positions,
223 std::string* output);
225 // Deprecated. This function is only for backward compatibility and calls
226 // TrimWhitespaceASCII().
227 BASE_EXPORT TrimPositions TrimWhitespace(const std::string& input,
228 TrimPositions positions,
229 std::string* output);
231 // Searches for CR or LF characters. Removes all contiguous whitespace
232 // strings that contain them. This is useful when trying to deal with text
233 // copied from terminals.
234 // Returns |text|, with the following three transformations:
235 // (1) Leading and trailing whitespace is trimmed.
236 // (2) If |trim_sequences_with_line_breaks| is true, any other whitespace
237 // sequences containing a CR or LF are trimmed.
238 // (3) All other whitespace sequences are converted to single spaces.
239 BASE_EXPORT std::wstring CollapseWhitespace(
240 const std::wstring& text,
241 bool trim_sequences_with_line_breaks);
242 BASE_EXPORT string16 CollapseWhitespace(
243 const string16& text,
244 bool trim_sequences_with_line_breaks);
245 BASE_EXPORT std::string CollapseWhitespaceASCII(
246 const std::string& text,
247 bool trim_sequences_with_line_breaks);
249 // Returns true if the passed string is empty or contains only white-space
250 // characters.
251 BASE_EXPORT bool ContainsOnlyWhitespaceASCII(const std::string& str);
252 BASE_EXPORT bool ContainsOnlyWhitespace(const string16& str);
254 // Returns true if |input| is empty or contains only characters found in
255 // |characters|.
256 BASE_EXPORT bool ContainsOnlyChars(const std::wstring& input,
257 const std::wstring& characters);
258 BASE_EXPORT bool ContainsOnlyChars(const string16& input,
259 const string16& characters);
260 BASE_EXPORT bool ContainsOnlyChars(const std::string& input,
261 const std::string& characters);
263 // Converts to 7-bit ASCII by truncating. The result must be known to be ASCII
264 // beforehand.
265 BASE_EXPORT std::string WideToASCII(const std::wstring& wide);
266 BASE_EXPORT std::string UTF16ToASCII(const string16& utf16);
268 // Converts the given wide string to the corresponding Latin1. This will fail
269 // (return false) if any characters are more than 255.
270 BASE_EXPORT bool WideToLatin1(const std::wstring& wide, std::string* latin1);
272 // Returns true if the specified string matches the criteria. How can a wide
273 // string be 8-bit or UTF8? It contains only characters that are < 256 (in the
274 // first case) or characters that use only 8-bits and whose 8-bit
275 // representation looks like a UTF-8 string (the second case).
277 // Note that IsStringUTF8 checks not only if the input is structurally
278 // valid but also if it doesn't contain any non-character codepoint
279 // (e.g. U+FFFE). It's done on purpose because all the existing callers want
280 // to have the maximum 'discriminating' power from other encodings. If
281 // there's a use case for just checking the structural validity, we have to
282 // add a new function for that.
283 BASE_EXPORT bool IsStringUTF8(const std::string& str);
284 BASE_EXPORT bool IsStringASCII(const std::wstring& str);
285 BASE_EXPORT bool IsStringASCII(const base::StringPiece& str);
286 BASE_EXPORT bool IsStringASCII(const string16& str);
288 // Converts the elements of the given string. This version uses a pointer to
289 // clearly differentiate it from the non-pointer variant.
290 template <class str> inline void StringToLowerASCII(str* s) {
291 for (typename str::iterator i = s->begin(); i != s->end(); ++i)
292 *i = base::ToLowerASCII(*i);
295 template <class str> inline str StringToLowerASCII(const str& s) {
296 // for std::string and std::wstring
297 str output(s);
298 StringToLowerASCII(&output);
299 return output;
302 // Converts the elements of the given string. This version uses a pointer to
303 // clearly differentiate it from the non-pointer variant.
304 template <class str> inline void StringToUpperASCII(str* s) {
305 for (typename str::iterator i = s->begin(); i != s->end(); ++i)
306 *i = base::ToUpperASCII(*i);
309 template <class str> inline str StringToUpperASCII(const str& s) {
310 // for std::string and std::wstring
311 str output(s);
312 StringToUpperASCII(&output);
313 return output;
316 // Compare the lower-case form of the given string against the given ASCII
317 // string. This is useful for doing checking if an input string matches some
318 // token, and it is optimized to avoid intermediate string copies. This API is
319 // borrowed from the equivalent APIs in Mozilla.
320 BASE_EXPORT bool LowerCaseEqualsASCII(const std::string& a, const char* b);
321 BASE_EXPORT bool LowerCaseEqualsASCII(const std::wstring& a, const char* b);
322 BASE_EXPORT bool LowerCaseEqualsASCII(const string16& a, const char* b);
324 // Same thing, but with string iterators instead.
325 BASE_EXPORT bool LowerCaseEqualsASCII(std::string::const_iterator a_begin,
326 std::string::const_iterator a_end,
327 const char* b);
328 BASE_EXPORT bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin,
329 std::wstring::const_iterator a_end,
330 const char* b);
331 BASE_EXPORT bool LowerCaseEqualsASCII(string16::const_iterator a_begin,
332 string16::const_iterator a_end,
333 const char* b);
334 BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,
335 const char* a_end,
336 const char* b);
337 BASE_EXPORT bool LowerCaseEqualsASCII(const wchar_t* a_begin,
338 const wchar_t* a_end,
339 const char* b);
340 BASE_EXPORT bool LowerCaseEqualsASCII(const char16* a_begin,
341 const char16* a_end,
342 const char* b);
344 // Performs a case-sensitive string compare. The behavior is undefined if both
345 // strings are not ASCII.
346 BASE_EXPORT bool EqualsASCII(const string16& a, const base::StringPiece& b);
348 // Returns true if str starts with search, or false otherwise.
349 BASE_EXPORT bool StartsWithASCII(const std::string& str,
350 const std::string& search,
351 bool case_sensitive);
352 BASE_EXPORT bool StartsWith(const std::wstring& str,
353 const std::wstring& search,
354 bool case_sensitive);
355 BASE_EXPORT bool StartsWith(const string16& str,
356 const string16& search,
357 bool case_sensitive);
359 // Returns true if str ends with search, or false otherwise.
360 BASE_EXPORT bool EndsWith(const std::string& str,
361 const std::string& search,
362 bool case_sensitive);
363 BASE_EXPORT bool EndsWith(const std::wstring& str,
364 const std::wstring& search,
365 bool case_sensitive);
366 BASE_EXPORT bool EndsWith(const string16& str,
367 const string16& search,
368 bool case_sensitive);
// Determines the type of ASCII character, independent of locale (the C
// library versions will change based on locale).

// Returns true for space, carriage return, line feed and horizontal tab only.
template <typename Char>
inline bool IsAsciiWhitespace(Char c) {
  return c == ' ' || c == '\r' || c == '\n' || c == '\t';
}
// Returns true if |c| is in 'A'..'Z' or 'a'..'z', independent of locale.
template <typename Char>
inline bool IsAsciiAlpha(Char c) {
  return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z'));
}
// Returns true if |c| is in '0'..'9', independent of locale.
template <typename Char>
inline bool IsAsciiDigit(Char c) {
  return c >= '0' && c <= '9';
}
// Returns true if |c| is in '0'..'9', 'A'..'F' or 'a'..'f', independent of
// locale.
template <typename Char>
inline bool IsHexDigit(Char c) {
  return (c >= '0' && c <= '9') ||
         (c >= 'A' && c <= 'F') ||
         (c >= 'a' && c <= 'f');
}
// Returns the numeric value (0-15) of the hexadecimal digit |c|, in either
// letter case.  |c| must satisfy IsHexDigit(); this is DCHECKed, and the
// final fall-through returns 0 for any other value.
template <typename Char>
inline Char HexDigitToInt(Char c) {
  DCHECK(IsHexDigit(c));
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  return 0;
}
405 // Returns true if it's a whitespace character.
406 inline bool IsWhitespace(wchar_t c) {
407 return wcschr(kWhitespaceWide, c) != NULL;
410 // Return a byte string in human-readable format with a unit suffix. Not
411 // appropriate for use in any UI; use of FormatBytes and friends in ui/base is
412 // highly recommended instead. TODO(avi): Figure out how to get callers to use
413 // FormatBytes instead; remove this.
414 BASE_EXPORT string16 FormatBytesUnlocalized(int64 bytes);
416 // Starting at |start_offset| (usually 0), replace the first instance of
417 // |find_this| with |replace_with|.
418 BASE_EXPORT void ReplaceFirstSubstringAfterOffset(
419 string16* str,
420 string16::size_type start_offset,
421 const string16& find_this,
422 const string16& replace_with);
423 BASE_EXPORT void ReplaceFirstSubstringAfterOffset(
424 std::string* str,
425 std::string::size_type start_offset,
426 const std::string& find_this,
427 const std::string& replace_with);
429 // Starting at |start_offset| (usually 0), look through |str| and replace all
430 // instances of |find_this| with |replace_with|.
432 // This does entire substrings; use std::replace in <algorithm> for single
433 // characters, for example:
434 // std::replace(str.begin(), str.end(), 'a', 'b');
435 BASE_EXPORT void ReplaceSubstringsAfterOffset(
436 string16* str,
437 string16::size_type start_offset,
438 const string16& find_this,
439 const string16& replace_with);
440 BASE_EXPORT void ReplaceSubstringsAfterOffset(
441 std::string* str,
442 std::string::size_type start_offset,
443 const std::string& find_this,
444 const std::string& replace_with);
446 // Reserves enough memory in |str| to accommodate |length_with_null|
447 // characters, sets the size of |str| to |length_with_null - 1| characters,
448 // and returns a pointer to the underlying contiguous array of characters.
450 // This is typically used when calling a function that writes results into a
451 // character array, but the caller wants the data to be managed by a
452 // string-like object.
454 // |length_with_null| must be >= 1. In the |length_with_null| == 1 case,
455 // NULL is returned rather than a pointer to the array, since there is no way
456 // to provide access to the underlying character array of a 0-length
457 // string-like object without breaking guarantees provided by the C++
458 // standards.
460 // Internally, this takes linear time because the underlying array needs to
461 // be 0-filled for all |length_with_null - 1| * sizeof(character) bytes.
462 template <class string_type>
463 inline typename string_type::value_type* WriteInto(string_type* str,
464 size_t length_with_null) {
465 DCHECK_NE(0u, length_with_null);
466 str->reserve(length_with_null);
467 str->resize(length_with_null - 1);
469 // If |length_with_null| is 1, calling (*str)[0] is invalid since the
470 // size() is 0. In some implementations this triggers an assertion.
472 // Trying to access the underlying byte array by casting away const
473 // when calling str->data() or str->c_str() is also incorrect.
474 // Some implementations of basic_string use a copy-on-write approach and
475 // this could end up mutating the data that is shared across multiple string
476 // objects.
477 if (length_with_null <= 1)
478 return NULL;
480 return &((*str)[0]);
483 //-----------------------------------------------------------------------------
485 // Splits a string into its fields delimited by any of the characters in
486 // |delimiters|. Each field is added to the |tokens| vector. Returns the
487 // number of tokens found.
488 BASE_EXPORT size_t Tokenize(const std::wstring& str,
489 const std::wstring& delimiters,
490 std::vector<std::wstring>* tokens);
491 BASE_EXPORT size_t Tokenize(const string16& str,
492 const string16& delimiters,
493 std::vector<string16>* tokens);
494 BASE_EXPORT size_t Tokenize(const std::string& str,
495 const std::string& delimiters,
496 std::vector<std::string>* tokens);
497 BASE_EXPORT size_t Tokenize(const base::StringPiece& str,
498 const base::StringPiece& delimiters,
499 std::vector<base::StringPiece>* tokens);
501 // Does the opposite of SplitString().
502 BASE_EXPORT string16 JoinString(const std::vector<string16>& parts, char16 s);
503 BASE_EXPORT std::string JoinString(
504 const std::vector<std::string>& parts, char s);
506 // Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively.
507 // Additionally, any number of consecutive '$' characters is replaced by that
508 // number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be
509 // NULL. This only allows you to use up to nine replacements.
510 BASE_EXPORT string16 ReplaceStringPlaceholders(
511 const string16& format_string,
512 const std::vector<string16>& subst,
513 std::vector<size_t>* offsets);
515 BASE_EXPORT std::string ReplaceStringPlaceholders(
516 const base::StringPiece& format_string,
517 const std::vector<std::string>& subst,
518 std::vector<size_t>* offsets);
520 // Single-string shortcut for ReplaceStringHolders. |offset| may be NULL.
521 BASE_EXPORT string16 ReplaceStringPlaceholders(const string16& format_string,
522 const string16& a,
523 size_t* offset);
525 // Returns true if the string passed in matches the pattern. The pattern
526 // string can contain wildcards like * and ?
527 // The backslash character (\) is an escape character for * and ?
528 // We limit the patterns to having a max of 16 * or ? characters.
529 // ? matches 0 or 1 character, while * matches 0 or more characters.
530 BASE_EXPORT bool MatchPattern(const base::StringPiece& string,
531 const base::StringPiece& pattern);
532 BASE_EXPORT bool MatchPattern(const string16& string, const string16& pattern);
// Hack to convert any char-like type to its unsigned counterpart.
// For example, it will convert char, signed char and unsigned char to unsigned
// char.
//
// The primary template maps a type to itself; the specializations below
// override that for char, signed char, wchar_t and short.
template<typename T>
struct ToUnsigned {
  typedef T Unsigned;
};

template<>
struct ToUnsigned<char> {
  typedef unsigned char Unsigned;
};
template<>
struct ToUnsigned<signed char> {
  typedef unsigned char Unsigned;
};
// The unsigned counterpart of wchar_t is chosen by the WCHAR_T_IS_UTF16 /
// WCHAR_T_IS_UTF32 macros; if neither is defined, no |Unsigned| member is
// declared.
template<>
struct ToUnsigned<wchar_t> {
#if defined(WCHAR_T_IS_UTF16)
  typedef unsigned short Unsigned;
#elif defined(WCHAR_T_IS_UTF32)
  typedef uint32 Unsigned;
#endif
};
template<>
struct ToUnsigned<short> {
  typedef unsigned short Unsigned;
};
563 #endif // BASE_STRING_UTIL_H_