libs/xml2/xmlstring.c

   1 /*
   2  * string.c : an XML string utilities module
   3  *
   4  * This module provides various utility functions for manipulating
   5  * the xmlChar* type. All functions named xmlStr* have been moved here
   6  * from the parser.c file (their original home).
   7  *
   8  * See Copyright for the status of this software.
   9  *
  10  * UTF8 string routines from:
  11  * William Brack <wbrack@mmm.com.hk>
  12  *
  13  * daniel@veillard.com
  14  */
  15
  16 #define IN_LIBXML
  17 #include "libxml.h"
  18
  19 #include <stdlib.h>
  20 #include <string.h>
  21 #include <limits.h>
  22 #include <libxml/xmlmemory.h>
  23 #include <libxml/parserInternals.h>
  24 #include <libxml/xmlstring.h>
  25
  26 /************************************************************************
  27  *                                                                      *
  28  *                Commodity functions to handle xmlChars                *
  29  *                                                                      *
  30  ************************************************************************/
  31
  32 /**
  33  * xmlStrndup:
  34  * @cur:  the input xmlChar *
  35  * @len:  the len of @cur
  36  *
  37  * a strndup for array of xmlChar's
  38  *
  39  * Returns a new xmlChar * or NULL
  40  */
  41 xmlChar *
  42 xmlStrndup(const xmlChar *cur, int len) {
  43     xmlChar *ret;
  44
  45     if ((cur == NULL) || (len < 0)) return(NULL);
  46     ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
  47     if (ret == NULL) {
  48         xmlErrMemory(NULL, NULL);
  49         return(NULL);
  50     }
  51     memcpy(ret, cur, len * sizeof(xmlChar));
  52     ret[len] = 0;
  53     return(ret);
  54 }
  55
  56 /**
  57  * xmlStrdup:
  58  * @cur:  the input xmlChar *
  59  *
  60  * a strdup for array of xmlChar's. Since they are supposed to be
  61  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
  62  * a termination mark of '0'.
  63  *
  64  * Returns a new xmlChar * or NULL
  65  */
  66 xmlChar *
  67 xmlStrdup(const xmlChar *cur) {
  68     const xmlChar *p = cur;
  69
  70     if (cur == NULL) return(NULL);
  71     while (*p != 0) p++; /* non input consuming */
  72     return(xmlStrndup(cur, p - cur));
  73 }
  74
  75 /**
  76  * xmlCharStrndup:
  77  * @cur:  the input char *
  78  * @len:  the len of @cur
  79  *
  80  * a strndup for char's to xmlChar's
  81  *
  82  * Returns a new xmlChar * or NULL
  83  */
  84
  85 xmlChar *
  86 xmlCharStrndup(const char *cur, int len) {
  87     int i;
  88     xmlChar *ret;
  89
  90     if ((cur == NULL) || (len < 0)) return(NULL);
  91     ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
  92     if (ret == NULL) {
  93         xmlErrMemory(NULL, NULL);
  94         return(NULL);
  95     }
  96     for (i = 0;i < len;i++) {
  97         ret[i] = (xmlChar) cur[i];
  98         if (ret[i] == 0) return(ret);
  99     }
 100     ret[len] = 0;
 101     return(ret);
 102 }
 103
 104 /**
 105  * xmlCharStrdup:
 106  * @cur:  the input char *
 107  *
 108  * a strdup for char's to xmlChar's
 109  *
 110  * Returns a new xmlChar * or NULL
 111  */
 112
 113 xmlChar *
 114 xmlCharStrdup(const char *cur) {
 115     const char *p = cur;
 116
 117     if (cur == NULL) return(NULL);
 118     while (*p != '\0') p++; /* non input consuming */
 119     return(xmlCharStrndup(cur, p - cur));
 120 }
 121
 122 /**
 123  * xmlStrcmp:
 124  * @str1:  the first xmlChar *
 125  * @str2:  the second xmlChar *
 126  *
 127  * a strcmp for xmlChar's
 128  *
 129  * Returns the integer result of the comparison
 130  */
 131
 132 int
 133 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
 134     if (str1 == str2) return(0);
 135     if (str1 == NULL) return(-1);
 136     if (str2 == NULL) return(1);
 137 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 138     return(strcmp((const char *)str1, (const char *)str2));
 139 #else
 140     do {
 141         int tmp = *str1++ - *str2;
 142         if (tmp != 0) return(tmp);
 143     } while (*str2++ != 0);
 144     return 0;
 145 #endif
 146 }
 147
 148 /**
 149  * xmlStrEqual:
 150  * @str1:  the first xmlChar *
 151  * @str2:  the second xmlChar *
 152  *
 153  * Check if both strings are equal of have same content.
 154  * Should be a bit more readable and faster than xmlStrcmp()
 155  *
 156  * Returns 1 if they are equal, 0 if they are different
 157  */
 158
 159 int
 160 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
 161     if (str1 == str2) return(1);
 162     if (str1 == NULL) return(0);
 163     if (str2 == NULL) return(0);
 164 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 165     return(strcmp((const char *)str1, (const char *)str2) == 0);
 166 #else
 167     do {
 168         if (*str1++ != *str2) return(0);
 169     } while (*str2++);
 170     return(1);
 171 #endif
 172 }
 173
 174 /**
 175  * xmlStrQEqual:
 176  * @pref:  the prefix of the QName
 177  * @name:  the localname of the QName
 178  * @str:  the second xmlChar *
 179  *
 180  * Check if a QName is Equal to a given string
 181  *
 182  * Returns 1 if they are equal, 0 if they are different
 183  */
 184
 185 int
 186 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
 187     if (pref == NULL) return(xmlStrEqual(name, str));
 188     if (name == NULL) return(0);
 189     if (str == NULL) return(0);
 190
 191     do {
 192         if (*pref++ != *str) return(0);
 193     } while ((*str++) && (*pref));
 194     if (*str++ != ':') return(0);
 195     do {
 196         if (*name++ != *str) return(0);
 197     } while (*str++);
 198     return(1);
 199 }
 200
 201 /**
 202  * xmlStrncmp:
 203  * @str1:  the first xmlChar *
 204  * @str2:  the second xmlChar *
 205  * @len:  the max comparison length
 206  *
 207  * a strncmp for xmlChar's
 208  *
 209  * Returns the integer result of the comparison
 210  */
 211
 212 int
 213 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
 214     if (len <= 0) return(0);
 215     if (str1 == str2) return(0);
 216     if (str1 == NULL) return(-1);
 217     if (str2 == NULL) return(1);
 218 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 219     return(strncmp((const char *)str1, (const char *)str2, len));
 220 #else
 221     do {
 222         int tmp = *str1++ - *str2;
 223         if (tmp != 0 || --len == 0) return(tmp);
 224     } while (*str2++ != 0);
 225     return 0;
 226 #endif
 227 }
 228
 229 static const xmlChar casemap[256] = {
 230     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 231     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
 232     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 233     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
 234     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
 235     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
 236     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
 237     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
 238     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 239     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 240     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 241     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
 242     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 243     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 244     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 245     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
 246     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
 247     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
 248     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
 249     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
 250     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
 251     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
 252     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
 253     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
 254     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
 255     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
 256     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
 257     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
 258     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
 259     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
 260     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
 261     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
 262 };
 263
 264 /**
 265  * xmlStrcasecmp:
 266  * @str1:  the first xmlChar *
 267  * @str2:  the second xmlChar *
 268  *
 269  * a strcasecmp for xmlChar's
 270  *
 271  * Returns the integer result of the comparison
 272  */
 273
 274 int
 275 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
 276     register int tmp;
 277
 278     if (str1 == str2) return(0);
 279     if (str1 == NULL) return(-1);
 280     if (str2 == NULL) return(1);
 281     do {
 282         tmp = casemap[*str1++] - casemap[*str2];
 283         if (tmp != 0) return(tmp);
 284     } while (*str2++ != 0);
 285     return 0;
 286 }
 287
 288 /**
 289  * xmlStrncasecmp:
 290  * @str1:  the first xmlChar *
 291  * @str2:  the second xmlChar *
 292  * @len:  the max comparison length
 293  *
 294  * a strncasecmp for xmlChar's
 295  *
 296  * Returns the integer result of the comparison
 297  */
 298
 299 int
 300 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
 301     register int tmp;
 302
 303     if (len <= 0) return(0);
 304     if (str1 == str2) return(0);
 305     if (str1 == NULL) return(-1);
 306     if (str2 == NULL) return(1);
 307     do {
 308         tmp = casemap[*str1++] - casemap[*str2];
 309         if (tmp != 0 || --len == 0) return(tmp);
 310     } while (*str2++ != 0);
 311     return 0;
 312 }
 313
 314 /**
 315  * xmlStrchr:
 316  * @str:  the xmlChar * array
 317  * @val:  the xmlChar to search
 318  *
 319  * a strchr for xmlChar's
 320  *
 321  * Returns the xmlChar * for the first occurrence or NULL.
 322  */
 323
 324 const xmlChar *
 325 xmlStrchr(const xmlChar *str, xmlChar val) {
 326     if (str == NULL) return(NULL);
 327     while (*str != 0) { /* non input consuming */
 328         if (*str == val) return((xmlChar *) str);
 329         str++;
 330     }
 331     return(NULL);
 332 }
 333
 334 /**
 335  * xmlStrstr:
 336  * @str:  the xmlChar * array (haystack)
 337  * @val:  the xmlChar to search (needle)
 338  *
 339  * a strstr for xmlChar's
 340  *
 341  * Returns the xmlChar * for the first occurrence or NULL.
 342  */
 343
 344 const xmlChar *
 345 xmlStrstr(const xmlChar *str, const xmlChar *val) {
 346     int n;
 347
 348     if (str == NULL) return(NULL);
 349     if (val == NULL) return(NULL);
 350     n = xmlStrlen(val);
 351
 352     if (n == 0) return(str);
 353     while (*str != 0) { /* non input consuming */
 354         if (*str == *val) {
 355             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
 356         }
 357         str++;
 358     }
 359     return(NULL);
 360 }
 361
 362 /**
 363  * xmlStrcasestr:
 364  * @str:  the xmlChar * array (haystack)
 365  * @val:  the xmlChar to search (needle)
 366  *
 367  * a case-ignoring strstr for xmlChar's
 368  *
 369  * Returns the xmlChar * for the first occurrence or NULL.
 370  */
 371
 372 const xmlChar *
 373 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
 374     int n;
 375
 376     if (str == NULL) return(NULL);
 377     if (val == NULL) return(NULL);
 378     n = xmlStrlen(val);
 379
 380     if (n == 0) return(str);
 381     while (*str != 0) { /* non input consuming */
 382         if (casemap[*str] == casemap[*val])
 383             if (!xmlStrncasecmp(str, val, n)) return(str);
 384         str++;
 385     }
 386     return(NULL);
 387 }
 388
 389 /**
 390  * xmlStrsub:
 391  * @str:  the xmlChar * array (haystack)
 392  * @start:  the index of the first char (zero based)
 393  * @len:  the length of the substring
 394  *
 395  * Extract a substring of a given string
 396  *
 397  * Returns the xmlChar * for the first occurrence or NULL.
 398  */
 399
 400 xmlChar *
 401 xmlStrsub(const xmlChar *str, int start, int len) {
 402     int i;
 403
 404     if (str == NULL) return(NULL);
 405     if (start < 0) return(NULL);
 406     if (len < 0) return(NULL);
 407
 408     for (i = 0;i < start;i++) {
 409         if (*str == 0) return(NULL);
 410         str++;
 411     }
 412     if (*str == 0) return(NULL);
 413     return(xmlStrndup(str, len));
 414 }
 415
 416 /**
 417  * xmlStrlen:
 418  * @str:  the xmlChar * array
 419  *
 420  * length of a xmlChar's string
 421  *
 422  * Returns the number of xmlChar contained in the ARRAY.
 423  */
 424
 425 int
 426 xmlStrlen(const xmlChar *str) {
 427     size_t len = str ? strlen((const char *)str) : 0;
 428     return(len > INT_MAX ? 0 : len);
 429 }
 430
 431 /**
 432  * xmlStrncat:
 433  * @cur:  the original xmlChar * array
 434  * @add:  the xmlChar * array added
 435  * @len:  the length of @add
 436  *
 437  * a strncat for array of xmlChar's, it will extend @cur with the len
 438  * first bytes of @add. Note that if @len < 0 then this is an API error
 439  * and NULL will be returned.
 440  *
 441  * Returns a new xmlChar *, the original @cur is reallocated and should
 442  * not be freed.
 443  */
 444
 445 xmlChar *
 446 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
 447     int size;
 448     xmlChar *ret;
 449
 450     if ((add == NULL) || (len == 0))
 451         return(cur);
 452     if (len < 0)
 453         return(NULL);
 454     if (cur == NULL)
 455         return(xmlStrndup(add, len));
 456
 457     size = xmlStrlen(cur);
 458     if ((size < 0) || (size > INT_MAX - len))
 459         return(NULL);
 460     ret = (xmlChar *) xmlRealloc(cur, ((size_t) size + len + 1) * sizeof(xmlChar));
 461     if (ret == NULL) {
 462         xmlErrMemory(NULL, NULL);
 463         return(cur);
 464     }
 465     memcpy(&ret[size], add, len * sizeof(xmlChar));
 466     ret[size + len] = 0;
 467     return(ret);
 468 }
 469
 470 /**
 471  * xmlStrncatNew:
 472  * @str1:  first xmlChar string
 473  * @str2:  second xmlChar string
 474  * @len:  the len of @str2 or < 0
 475  *
 476  * same as xmlStrncat, but creates a new string.  The original
 477  * two strings are not freed. If @len is < 0 then the length
 478  * will be calculated automatically.
 479  *
 480  * Returns a new xmlChar * or NULL
 481  */
 482 xmlChar *
 483 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
 484     int size;
 485     xmlChar *ret;
 486
 487     if (len < 0) {
 488         len = xmlStrlen(str2);
 489         if (len < 0)
 490             return(NULL);
 491     }
 492     if ((str2 == NULL) || (len == 0))
 493         return(xmlStrdup(str1));
 494     if (str1 == NULL)
 495         return(xmlStrndup(str2, len));
 496
 497     size = xmlStrlen(str1);
 498     if ((size < 0) || (size > INT_MAX - len))
 499         return(NULL);
 500     ret = (xmlChar *) xmlMalloc(((size_t) size + len + 1) * sizeof(xmlChar));
 501     if (ret == NULL) {
 502         xmlErrMemory(NULL, NULL);
 503         return(xmlStrndup(str1, size));
 504     }
 505     memcpy(ret, str1, size * sizeof(xmlChar));
 506     memcpy(&ret[size], str2, len * sizeof(xmlChar));
 507     ret[size + len] = 0;
 508     return(ret);
 509 }
 510
 511 /**
 512  * xmlStrcat:
 513  * @cur:  the original xmlChar * array
 514  * @add:  the xmlChar * array added
 515  *
 516  * a strcat for array of xmlChar's. Since they are supposed to be
 517  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
 518  * a termination mark of '0'.
 519  *
 520  * Returns a new xmlChar * containing the concatenated string. The original
 521  * @cur is reallocated and should not be freed.
 522  */
 523 xmlChar *
 524 xmlStrcat(xmlChar *cur, const xmlChar *add) {
 525     const xmlChar *p = add;
 526
 527     if (add == NULL) return(cur);
 528     if (cur == NULL)
 529         return(xmlStrdup(add));
 530
 531     while (*p != 0) p++; /* non input consuming */
 532     return(xmlStrncat(cur, add, p - add));
 533 }
 534
 535 /**
 536  * xmlStrPrintf:
 537  * @buf:   the result buffer.
 538  * @len:   the result buffer length.
 539  * @msg:   the message with printf formatting.
 540  * @...:   extra parameters for the message.
 541  *
 542  * Formats @msg and places result into @buf.
 543  *
 544  * Returns the number of characters written to @buf or -1 if an error occurs.
 545  */
 546 int XMLCDECL
 547 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
 548     va_list args;
 549     int ret;
 550
 551     if((buf == NULL) || (msg == NULL)) {
 552         return(-1);
 553     }
 554
 555     va_start(args, msg);
 556     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
 557     va_end(args);
 558     buf[len - 1] = 0; /* be safe ! */
 559
 560     return(ret);
 561 }
 562
 563 /**
 564  * xmlStrVPrintf:
 565  * @buf:   the result buffer.
 566  * @len:   the result buffer length.
 567  * @msg:   the message with printf formatting.
 568  * @ap:    extra parameters for the message.
 569  *
 570  * Formats @msg and places result into @buf.
 571  *
 572  * Returns the number of characters written to @buf or -1 if an error occurs.
 573  */
 574 int
 575 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
 576     int ret;
 577
 578     if((buf == NULL) || (msg == NULL)) {
 579         return(-1);
 580     }
 581
 582     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
 583     buf[len - 1] = 0; /* be safe ! */
 584
 585     return(ret);
 586 }
 587
 588 /************************************************************************
 589  *                                                                      *
 590  *              Generic UTF8 handling routines                          *
 591  *                                                                      *
 592  * From rfc2044: encoding of the Unicode values on UTF-8:               *
 593  *                                                                      *
 594  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 595  * 0000 0000-0000 007F   0xxxxxxx                                       *
 596  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 597  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 598  *                                                                      *
 599  * I hope we won't use values > 0xFFFF anytime soon !                   *
 600  *                                                                      *
 601  ************************************************************************/
 602
 603
 604 /**
 605  * xmlUTF8Size:
 606  * @utf: pointer to the UTF8 character
 607  *
 608  * calculates the internal size of a UTF8 character
 609  *
 610  * returns the numbers of bytes in the character, -1 on format error
 611  */
 612 int
 613 xmlUTF8Size(const xmlChar *utf) {
 614     xmlChar mask;
 615     int len;
 616
 617     if (utf == NULL)
 618         return -1;
 619     if (*utf < 0x80)
 620         return 1;
 621     /* check valid UTF8 character */
 622     if (!(*utf & 0x40))
 623         return -1;
 624     /* determine number of bytes in char */
 625     len = 2;
 626     for (mask=0x20; mask != 0; mask>>=1) {
 627         if (!(*utf & mask))
 628             return len;
 629         len++;
 630     }
 631     return -1;
 632 }
 633
 634 /**
 635  * xmlUTF8Charcmp:
 636  * @utf1: pointer to first UTF8 char
 637  * @utf2: pointer to second UTF8 char
 638  *
 639  * compares the two UCS4 values
 640  *
 641  * returns result of the compare as with xmlStrncmp
 642  */
 643 int
 644 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
 645
 646     if (utf1 == NULL ) {
 647         if (utf2 == NULL)
 648             return 0;
 649         return -1;
 650     }
 651     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
 652 }
 653
 654 /**
 655  * xmlUTF8Strlen:
 656  * @utf:  a sequence of UTF-8 encoded bytes
 657  *
 658  * compute the length of an UTF8 string, it doesn't do a full UTF8
 659  * checking of the content of the string.
 660  *
 661  * Returns the number of characters in the string or -1 in case of error
 662  */
 663 int
 664 xmlUTF8Strlen(const xmlChar *utf) {
 665     size_t ret = 0;
 666
 667     if (utf == NULL)
 668         return(-1);
 669
 670     while (*utf != 0) {
 671         if (utf[0] & 0x80) {
 672             if ((utf[1] & 0xc0) != 0x80)
 673                 return(-1);
 674             if ((utf[0] & 0xe0) == 0xe0) {
 675                 if ((utf[2] & 0xc0) != 0x80)
 676                     return(-1);
 677                 if ((utf[0] & 0xf0) == 0xf0) {
 678                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 679                         return(-1);
 680                     utf += 4;
 681                 } else {
 682                     utf += 3;
 683                 }
 684             } else {
 685                 utf += 2;
 686             }
 687         } else {
 688             utf++;
 689         }
 690         ret++;
 691     }
 692     return(ret > INT_MAX ? 0 : ret);
 693 }
 694
 695 /**
 696  * xmlGetUTF8Char:
 697  * @utf:  a sequence of UTF-8 encoded bytes
 698  * @len:  a pointer to the minimum number of bytes present in
 699  *        the sequence.  This is used to assure the next character
 700  *        is completely contained within the sequence.
 701  *
 702  * Read the first UTF8 character from @utf
 703  *
 704  * Returns the char value or -1 in case of error, and sets *len to
 705  *        the actual number of bytes consumed (0 in case of error)
 706  */
 707 int
 708 xmlGetUTF8Char(const unsigned char *utf, int *len) {
 709     unsigned int c;
 710
 711     if (utf == NULL)
 712         goto error;
 713     if (len == NULL)
 714         goto error;
 715     if (*len < 1)
 716         goto error;
 717
 718     c = utf[0];
 719     if (c & 0x80) {
 720         if (*len < 2)
 721             goto error;
 722         if ((utf[1] & 0xc0) != 0x80)
 723             goto error;
 724         if ((c & 0xe0) == 0xe0) {
 725             if (*len < 3)
 726                 goto error;
 727             if ((utf[2] & 0xc0) != 0x80)
 728                 goto error;
 729             if ((c & 0xf0) == 0xf0) {
 730                 if (*len < 4)
 731                     goto error;
 732                 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 733                     goto error;
 734                 *len = 4;
 735                 /* 4-byte code */
 736                 c = (utf[0] & 0x7) << 18;
 737                 c |= (utf[1] & 0x3f) << 12;
 738                 c |= (utf[2] & 0x3f) << 6;
 739                 c |= utf[3] & 0x3f;
 740             } else {
 741               /* 3-byte code */
 742                 *len = 3;
 743                 c = (utf[0] & 0xf) << 12;
 744                 c |= (utf[1] & 0x3f) << 6;
 745                 c |= utf[2] & 0x3f;
 746             }
 747         } else {
 748           /* 2-byte code */
 749             *len = 2;
 750             c = (utf[0] & 0x1f) << 6;
 751             c |= utf[1] & 0x3f;
 752         }
 753     } else {
 754         /* 1-byte code */
 755         *len = 1;
 756     }
 757     return(c);
 758
 759 error:
 760     if (len != NULL)
 761         *len = 0;
 762     return(-1);
 763 }
 764
 765 /**
 766  * xmlCheckUTF8:
 767  * @utf: Pointer to putative UTF-8 encoded string.
 768  *
 769  * Checks @utf for being valid UTF-8. @utf is assumed to be
 770  * null-terminated. This function is not super-strict, as it will
 771  * allow longer UTF-8 sequences than necessary. Note that Java is
 772  * capable of producing these sequences if provoked. Also note, this
 773  * routine checks for the 4-byte maximum size, but does not check for
 774  * 0x10ffff maximum value.
 775  *
 776  * Return value: true if @utf is valid.
 777  **/
 778 int
 779 xmlCheckUTF8(const unsigned char *utf)
 780 {
 781     int ix;
 782     unsigned char c;
 783
 784     if (utf == NULL)
 785         return(0);
 786     /*
 787      * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
 788      * are as follows (in "bit format"):
 789      *    0xxxxxxx                                      valid 1-byte
 790      *    110xxxxx 10xxxxxx                             valid 2-byte
 791      *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 792      *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 793      */
 794     while ((c = utf[0])) {      /* string is 0-terminated */
 795         ix = 0;
 796         if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 10 */
 797             ix = 1;
 798         } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
 799             if ((utf[1] & 0xc0 ) != 0x80)
 800                 return 0;
 801             ix = 2;
 802         } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
 803             if (((utf[1] & 0xc0) != 0x80) ||
 804                 ((utf[2] & 0xc0) != 0x80))
 805                     return 0;
 806             ix = 3;
 807         } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
 808             if (((utf[1] & 0xc0) != 0x80) ||
 809                 ((utf[2] & 0xc0) != 0x80) ||
 810                 ((utf[3] & 0xc0) != 0x80))
 811                     return 0;
 812             ix = 4;
 813         } else                          /* unknown encoding */
 814             return 0;
 815         utf += ix;
 816       }
 817       return(1);
 818 }
 819
 820 /**
 821  * xmlUTF8Strsize:
 822  * @utf:  a sequence of UTF-8 encoded bytes
 823  * @len:  the number of characters in the array
 824  *
 825  * storage size of an UTF8 string
 826  * the behaviour is not guaranteed if the input string is not UTF-8
 827  *
 828  * Returns the storage size of
 829  * the first 'len' characters of ARRAY
 830  */
 831
 832 int
 833 xmlUTF8Strsize(const xmlChar *utf, int len) {
 834     const xmlChar *ptr=utf;
 835     int ch;
 836     size_t ret;
 837
 838     if (utf == NULL)
 839         return(0);
 840
 841     if (len <= 0)
 842         return(0);
 843
 844     while ( len-- > 0) {
 845         if ( !*ptr )
 846             break;
 847         if ( (ch = *ptr++) & 0x80)
 848             while ((ch<<=1) & 0x80 ) {
 849                 if (*ptr == 0) break;
 850                 ptr++;
 851             }
 852     }
 853     ret = ptr - utf;
 854     return (ret > INT_MAX ? 0 : ret);
 855 }
 856
 857
 858 /**
 859  * xmlUTF8Strndup:
 860  * @utf:  the input UTF8 *
 861  * @len:  the len of @utf (in chars)
 862  *
 863  * a strndup for array of UTF8's
 864  *
 865  * Returns a new UTF8 * or NULL
 866  */
 867 xmlChar *
 868 xmlUTF8Strndup(const xmlChar *utf, int len) {
 869     xmlChar *ret;
 870     int i;
 871
 872     if ((utf == NULL) || (len < 0)) return(NULL);
 873     i = xmlUTF8Strsize(utf, len);
 874     ret = (xmlChar *) xmlMallocAtomic(((size_t) i + 1) * sizeof(xmlChar));
 875     if (ret == NULL) {
 876         return(NULL);
 877     }
 878     memcpy(ret, utf, i * sizeof(xmlChar));
 879     ret[i] = 0;
 880     return(ret);
 881 }
 882
 883 /**
 884  * xmlUTF8Strpos:
 885  * @utf:  the input UTF8 *
 886  * @pos:  the position of the desired UTF8 char (in chars)
 887  *
 888  * a function to provide the equivalent of fetching a
 889  * character from a string array
 890  *
 891  * Returns a pointer to the UTF8 character or NULL
 892  */
 893 const xmlChar *
 894 xmlUTF8Strpos(const xmlChar *utf, int pos) {
 895     int ch;
 896
 897     if (utf == NULL) return(NULL);
 898     if (pos < 0)
 899         return(NULL);
 900     while (pos--) {
 901         if ((ch=*utf++) == 0) return(NULL);
 902         if ( ch & 0x80 ) {
 903             /* if not simple ascii, verify proper format */
 904             if ( (ch & 0xc0) != 0xc0 )
 905                 return(NULL);
 906             /* then skip over remaining bytes for this char */
 907             while ( (ch <<= 1) & 0x80 )
 908                 if ( (*utf++ & 0xc0) != 0x80 )
 909                     return(NULL);
 910         }
 911     }
 912     return((xmlChar *)utf);
 913 }
 914
 915 /**
 916  * xmlUTF8Strloc:
 917  * @utf:  the input UTF8 *
 918  * @utfchar:  the UTF8 character to be found
 919  *
 920  * a function to provide the relative location of a UTF8 char
 921  *
 922  * Returns the relative character position of the desired char
 923  * or -1 if not found
 924  */
 925 int
 926 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
 927     size_t i;
 928     int size;
 929     int ch;
 930
 931     if (utf==NULL || utfchar==NULL) return -1;
 932     size = xmlUTF8Strsize(utfchar, 1);
 933         for(i=0; (ch=*utf) != 0; i++) {
 934             if (xmlStrncmp(utf, utfchar, size)==0)
 935                 return(i > INT_MAX ? 0 : i);
 936             utf++;
 937             if ( ch & 0x80 ) {
 938                 /* if not simple ascii, verify proper format */
 939                 if ( (ch & 0xc0) != 0xc0 )
 940                     return(-1);
 941                 /* then skip over remaining bytes for this char */
 942                 while ( (ch <<= 1) & 0x80 )
 943                     if ( (*utf++ & 0xc0) != 0x80 )
 944                         return(-1);
 945             }
 946         }
 947
 948     return(-1);
 949 }
 950 /**
 951  * xmlUTF8Strsub:
 952  * @utf:  a sequence of UTF-8 encoded bytes
 953  * @start: relative pos of first char
 954  * @len:   total number to copy
 955  *
 956  * Create a substring from a given UTF-8 string
 957  * Note:  positions are given in units of UTF-8 chars
 958  *
 959  * Returns a pointer to a newly created string
 960  * or NULL if any problem
 961  */
 962
 963 xmlChar *
 964 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
 965     int i;
 966     int ch;
 967
 968     if (utf == NULL) return(NULL);
 969     if (start < 0) return(NULL);
 970     if (len < 0) return(NULL);
 971
 972     /*
 973      * Skip over any leading chars
 974      */
 975     for (i = 0;i < start;i++) {
 976         if ((ch=*utf++) == 0) return(NULL);
 977         if ( ch & 0x80 ) {
 978             /* if not simple ascii, verify proper format */
 979             if ( (ch & 0xc0) != 0xc0 )
 980                 return(NULL);
 981             /* then skip over remaining bytes for this char */
 982             while ( (ch <<= 1) & 0x80 )
 983                 if ( (*utf++ & 0xc0) != 0x80 )
 984                     return(NULL);
 985         }
 986     }
 987
 988     return(xmlUTF8Strndup(utf, len));
 989 }
 990
 991 /**
 992  * xmlEscapeFormatString:
 993  * @msg:  a pointer to the string in which to escape '%' characters.
 994  * Must be a heap-allocated buffer created by libxml2 that may be
 995  * returned, or that may be freed and replaced.
 996  *
 997  * Replaces the string pointed to by 'msg' with an escaped string.
 998  * Returns the same string with all '%' characters escaped.
 999  */
1000 xmlChar *
1001 xmlEscapeFormatString(xmlChar **msg)
1002 {
1003     xmlChar *msgPtr = NULL;
1004     xmlChar *result = NULL;
1005     xmlChar *resultPtr = NULL;
1006     size_t count = 0;
1007     size_t msgLen = 0;
1008     size_t resultLen = 0;
1009
1010     if (!msg || !*msg)
1011         return(NULL);
1012
1013     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1014         ++msgLen;
1015         if (*msgPtr == '%')
1016             ++count;
1017     }
1018
1019     if (count == 0)
1020         return(*msg);
1021
1022     if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1023         return(NULL);
1024     resultLen = msgLen + count + 1;
1025     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1026     if (result == NULL) {
1027         /* Clear *msg to prevent format string vulnerabilities in
1028            out-of-memory situations. */
1029         xmlFree(*msg);
1030         *msg = NULL;
1031         xmlErrMemory(NULL, NULL);
1032         return(NULL);
1033     }
1034
1035     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1036         *resultPtr = *msgPtr;
1037         if (*msgPtr == '%')
1038             *(++resultPtr) = '%';
1039     }
1040     result[resultLen - 1] = '\0';
1041
1042     xmlFree(*msg);
1043     *msg = result;
1044
1045     return *msg;
1046 }
1047