libs/xml2/xmlstring.c

   1 /*
   2  * string.c : an XML string utilities module
   3  *
   4  * This module provides various utility functions for manipulating
   5  * the xmlChar* type. All functions named xmlStr* have been moved here
   6  * from the parser.c file (their original home).
   7  *
   8  * See Copyright for the status of this software.
   9  *
  10  * UTF8 string routines from:
  11  * William Brack <wbrack@mmm.com.hk>
  12  *
  13  * daniel@veillard.com
  14  */
  15
  16 #define IN_LIBXML
  17 #include "libxml.h"
  18
  19 #include <stdlib.h>
  20 #include <string.h>
  21 #include <limits.h>
  22 #include <libxml/xmlmemory.h>
  23 #include <libxml/parserInternals.h>
  24 #include <libxml/xmlstring.h>
  25
  26 /************************************************************************
  27  *                                                                      *
  28  *                Commodity functions to handle xmlChars                *
  29  *                                                                      *
  30  ************************************************************************/
  31
  32 /**
  33  * xmlStrndup:
  34  * @cur:  the input xmlChar *
  35  * @len:  the len of @cur
  36  *
  37  * a strndup for array of xmlChar's
  38  *
  39  * Returns a new xmlChar * or NULL
  40  */
  41 xmlChar *
  42 xmlStrndup(const xmlChar *cur, int len) {
  43     xmlChar *ret;
  44
  45     if ((cur == NULL) || (len < 0)) return(NULL);
  46     ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
  47     if (ret == NULL) {
  48         xmlErrMemory(NULL, NULL);
  49         return(NULL);
  50     }
  51     memcpy(ret, cur, len * sizeof(xmlChar));
  52     ret[len] = 0;
  53     return(ret);
  54 }
  55
  56 /**
  57  * xmlStrdup:
  58  * @cur:  the input xmlChar *
  59  *
  60  * a strdup for array of xmlChar's. Since they are supposed to be
  61  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
  62  * a termination mark of '0'.
  63  *
  64  * Returns a new xmlChar * or NULL
  65  */
  66 xmlChar *
  67 xmlStrdup(const xmlChar *cur) {
  68     const xmlChar *p = cur;
  69
  70     if (cur == NULL) return(NULL);
  71     while (*p != 0) p++; /* non input consuming */
  72     return(xmlStrndup(cur, p - cur));
  73 }
  74
  75 /**
  76  * xmlCharStrndup:
  77  * @cur:  the input char *
  78  * @len:  the len of @cur
  79  *
  80  * a strndup for char's to xmlChar's
  81  *
  82  * Returns a new xmlChar * or NULL
  83  */
  84
  85 xmlChar *
  86 xmlCharStrndup(const char *cur, int len) {
  87     int i;
  88     xmlChar *ret;
  89
  90     if ((cur == NULL) || (len < 0)) return(NULL);
  91     ret = (xmlChar *) xmlMallocAtomic(((size_t) len + 1) * sizeof(xmlChar));
  92     if (ret == NULL) {
  93         xmlErrMemory(NULL, NULL);
  94         return(NULL);
  95     }
  96     for (i = 0;i < len;i++) {
  97         ret[i] = (xmlChar) cur[i];
  98         if (ret[i] == 0) return(ret);
  99     }
 100     ret[len] = 0;
 101     return(ret);
 102 }
 103
 104 /**
 105  * xmlCharStrdup:
 106  * @cur:  the input char *
 107  *
 108  * a strdup for char's to xmlChar's
 109  *
 110  * Returns a new xmlChar * or NULL
 111  */
 112
 113 xmlChar *
 114 xmlCharStrdup(const char *cur) {
 115     const char *p = cur;
 116
 117     if (cur == NULL) return(NULL);
 118     while (*p != '\0') p++; /* non input consuming */
 119     return(xmlCharStrndup(cur, p - cur));
 120 }
 121
 122 /**
 123  * xmlStrcmp:
 124  * @str1:  the first xmlChar *
 125  * @str2:  the second xmlChar *
 126  *
 127  * a strcmp for xmlChar's
 128  *
 129  * Returns the integer result of the comparison
 130  */
 131
 132 int
 133 xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
 134     if (str1 == str2) return(0);
 135     if (str1 == NULL) return(-1);
 136     if (str2 == NULL) return(1);
 137 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 138     return(strcmp((const char *)str1, (const char *)str2));
 139 #else
 140     do {
 141         int tmp = *str1++ - *str2;
 142         if (tmp != 0) return(tmp);
 143     } while (*str2++ != 0);
 144     return 0;
 145 #endif
 146 }
 147
 148 /**
 149  * xmlStrEqual:
 150  * @str1:  the first xmlChar *
 151  * @str2:  the second xmlChar *
 152  *
 153  * Check if both strings are equal of have same content.
 154  * Should be a bit more readable and faster than xmlStrcmp()
 155  *
 156  * Returns 1 if they are equal, 0 if they are different
 157  */
 158
 159 int
 160 xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
 161     if (str1 == str2) return(1);
 162     if (str1 == NULL) return(0);
 163     if (str2 == NULL) return(0);
 164 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 165     return(strcmp((const char *)str1, (const char *)str2) == 0);
 166 #else
 167     do {
 168         if (*str1++ != *str2) return(0);
 169     } while (*str2++);
 170     return(1);
 171 #endif
 172 }
 173
 174 /**
 175  * xmlStrQEqual:
 176  * @pref:  the prefix of the QName
 177  * @name:  the localname of the QName
 178  * @str:  the second xmlChar *
 179  *
 180  * Check if a QName is Equal to a given string
 181  *
 182  * Returns 1 if they are equal, 0 if they are different
 183  */
 184
 185 int
 186 xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
 187     if (pref == NULL) return(xmlStrEqual(name, str));
 188     if (name == NULL) return(0);
 189     if (str == NULL) return(0);
 190
 191     do {
 192         if (*pref++ != *str) return(0);
 193     } while ((*str++) && (*pref));
 194     if (*str++ != ':') return(0);
 195     do {
 196         if (*name++ != *str) return(0);
 197     } while (*str++);
 198     return(1);
 199 }
 200
 201 /**
 202  * xmlStrncmp:
 203  * @str1:  the first xmlChar *
 204  * @str2:  the second xmlChar *
 205  * @len:  the max comparison length
 206  *
 207  * a strncmp for xmlChar's
 208  *
 209  * Returns the integer result of the comparison
 210  */
 211
 212 int
 213 xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
 214     if (len <= 0) return(0);
 215     if (str1 == str2) return(0);
 216     if (str1 == NULL) return(-1);
 217     if (str2 == NULL) return(1);
 218 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 219     return(strncmp((const char *)str1, (const char *)str2, len));
 220 #else
 221     do {
 222         int tmp = *str1++ - *str2;
 223         if (tmp != 0 || --len == 0) return(tmp);
 224     } while (*str2++ != 0);
 225     return 0;
 226 #endif
 227 }
 228
 229 static const xmlChar casemap[256] = {
 230     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 231     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
 232     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 233     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
 234     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
 235     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
 236     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
 237     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
 238     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 239     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 240     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 241     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
 242     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 243     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 244     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 245     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
 246     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
 247     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
 248     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
 249     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
 250     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
 251     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
 252     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
 253     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
 254     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
 255     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
 256     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
 257     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
 258     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
 259     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
 260     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
 261     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
 262 };
 263
 264 /**
 265  * xmlStrcasecmp:
 266  * @str1:  the first xmlChar *
 267  * @str2:  the second xmlChar *
 268  *
 269  * a strcasecmp for xmlChar's
 270  *
 271  * Returns the integer result of the comparison
 272  */
 273
 274 int
 275 xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
 276     register int tmp;
 277
 278     if (str1 == str2) return(0);
 279     if (str1 == NULL) return(-1);
 280     if (str2 == NULL) return(1);
 281     do {
 282         tmp = casemap[*str1++] - casemap[*str2];
 283         if (tmp != 0) return(tmp);
 284     } while (*str2++ != 0);
 285     return 0;
 286 }
 287
 288 /**
 289  * xmlStrncasecmp:
 290  * @str1:  the first xmlChar *
 291  * @str2:  the second xmlChar *
 292  * @len:  the max comparison length
 293  *
 294  * a strncasecmp for xmlChar's
 295  *
 296  * Returns the integer result of the comparison
 297  */
 298
 299 int
 300 xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
 301     register int tmp;
 302
 303     if (len <= 0) return(0);
 304     if (str1 == str2) return(0);
 305     if (str1 == NULL) return(-1);
 306     if (str2 == NULL) return(1);
 307     do {
 308         tmp = casemap[*str1++] - casemap[*str2];
 309         if (tmp != 0 || --len == 0) return(tmp);
 310     } while (*str2++ != 0);
 311     return 0;
 312 }
 313
 314 /**
 315  * xmlStrchr:
 316  * @str:  the xmlChar * array
 317  * @val:  the xmlChar to search
 318  *
 319  * a strchr for xmlChar's
 320  *
 321  * Returns the xmlChar * for the first occurrence or NULL.
 322  */
 323
 324 const xmlChar *
 325 xmlStrchr(const xmlChar *str, xmlChar val) {
 326     if (str == NULL) return(NULL);
 327     while (*str != 0) { /* non input consuming */
 328         if (*str == val) return((xmlChar *) str);
 329         str++;
 330     }
 331     return(NULL);
 332 }
 333
 334 /**
 335  * xmlStrstr:
 336  * @str:  the xmlChar * array (haystack)
 337  * @val:  the xmlChar to search (needle)
 338  *
 339  * a strstr for xmlChar's
 340  *
 341  * Returns the xmlChar * for the first occurrence or NULL.
 342  */
 343
 344 const xmlChar *
 345 xmlStrstr(const xmlChar *str, const xmlChar *val) {
 346     int n;
 347
 348     if (str == NULL) return(NULL);
 349     if (val == NULL) return(NULL);
 350     n = xmlStrlen(val);
 351
 352     if (n == 0) return(str);
 353     while (*str != 0) { /* non input consuming */
 354         if (*str == *val) {
 355             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
 356         }
 357         str++;
 358     }
 359     return(NULL);
 360 }
 361
 362 /**
 363  * xmlStrcasestr:
 364  * @str:  the xmlChar * array (haystack)
 365  * @val:  the xmlChar to search (needle)
 366  *
 367  * a case-ignoring strstr for xmlChar's
 368  *
 369  * Returns the xmlChar * for the first occurrence or NULL.
 370  */
 371
 372 const xmlChar *
 373 xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
 374     int n;
 375
 376     if (str == NULL) return(NULL);
 377     if (val == NULL) return(NULL);
 378     n = xmlStrlen(val);
 379
 380     if (n == 0) return(str);
 381     while (*str != 0) { /* non input consuming */
 382         if (casemap[*str] == casemap[*val])
 383             if (!xmlStrncasecmp(str, val, n)) return(str);
 384         str++;
 385     }
 386     return(NULL);
 387 }
 388
 389 /**
 390  * xmlStrsub:
 391  * @str:  the xmlChar * array (haystack)
 392  * @start:  the index of the first char (zero based)
 393  * @len:  the length of the substring
 394  *
 395  * Extract a substring of a given string
 396  *
 397  * Returns the xmlChar * for the first occurrence or NULL.
 398  */
 399
 400 xmlChar *
 401 xmlStrsub(const xmlChar *str, int start, int len) {
 402     int i;
 403
 404     if (str == NULL) return(NULL);
 405     if (start < 0) return(NULL);
 406     if (len < 0) return(NULL);
 407
 408     for (i = 0;i < start;i++) {
 409         if (*str == 0) return(NULL);
 410         str++;
 411     }
 412     if (*str == 0) return(NULL);
 413     return(xmlStrndup(str, len));
 414 }
 415
 416 /**
 417  * xmlStrlen:
 418  * @str:  the xmlChar * array
 419  *
 420  * length of a xmlChar's string
 421  *
 422  * Returns the number of xmlChar contained in the ARRAY.
 423  */
 424
 425 int
 426 xmlStrlen(const xmlChar *str) {
 427     size_t len = 0;
 428
 429     if (str == NULL) return(0);
 430     while (*str != 0) { /* non input consuming */
 431         str++;
 432         len++;
 433     }
 434     return(len > INT_MAX ? 0 : len);
 435 }
 436
 437 /**
 438  * xmlStrncat:
 439  * @cur:  the original xmlChar * array
 440  * @add:  the xmlChar * array added
 441  * @len:  the length of @add
 442  *
 443  * a strncat for array of xmlChar's, it will extend @cur with the len
 444  * first bytes of @add. Note that if @len < 0 then this is an API error
 445  * and NULL will be returned.
 446  *
 447  * Returns a new xmlChar *, the original @cur is reallocated and should
 448  * not be freed.
 449  */
 450
 451 xmlChar *
 452 xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
 453     int size;
 454     xmlChar *ret;
 455
 456     if ((add == NULL) || (len == 0))
 457         return(cur);
 458     if (len < 0)
 459         return(NULL);
 460     if (cur == NULL)
 461         return(xmlStrndup(add, len));
 462
 463     size = xmlStrlen(cur);
 464     if ((size < 0) || (size > INT_MAX - len))
 465         return(NULL);
 466     ret = (xmlChar *) xmlRealloc(cur, ((size_t) size + len + 1) * sizeof(xmlChar));
 467     if (ret == NULL) {
 468         xmlErrMemory(NULL, NULL);
 469         return(cur);
 470     }
 471     memcpy(&ret[size], add, len * sizeof(xmlChar));
 472     ret[size + len] = 0;
 473     return(ret);
 474 }
 475
 476 /**
 477  * xmlStrncatNew:
 478  * @str1:  first xmlChar string
 479  * @str2:  second xmlChar string
 480  * @len:  the len of @str2 or < 0
 481  *
 482  * same as xmlStrncat, but creates a new string.  The original
 483  * two strings are not freed. If @len is < 0 then the length
 484  * will be calculated automatically.
 485  *
 486  * Returns a new xmlChar * or NULL
 487  */
 488 xmlChar *
 489 xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
 490     int size;
 491     xmlChar *ret;
 492
 493     if (len < 0) {
 494         len = xmlStrlen(str2);
 495         if (len < 0)
 496             return(NULL);
 497     }
 498     if ((str2 == NULL) || (len == 0))
 499         return(xmlStrdup(str1));
 500     if (str1 == NULL)
 501         return(xmlStrndup(str2, len));
 502
 503     size = xmlStrlen(str1);
 504     if ((size < 0) || (size > INT_MAX - len))
 505         return(NULL);
 506     ret = (xmlChar *) xmlMalloc(((size_t) size + len + 1) * sizeof(xmlChar));
 507     if (ret == NULL) {
 508         xmlErrMemory(NULL, NULL);
 509         return(xmlStrndup(str1, size));
 510     }
 511     memcpy(ret, str1, size * sizeof(xmlChar));
 512     memcpy(&ret[size], str2, len * sizeof(xmlChar));
 513     ret[size + len] = 0;
 514     return(ret);
 515 }
 516
 517 /**
 518  * xmlStrcat:
 519  * @cur:  the original xmlChar * array
 520  * @add:  the xmlChar * array added
 521  *
 522  * a strcat for array of xmlChar's. Since they are supposed to be
 523  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
 524  * a termination mark of '0'.
 525  *
 526  * Returns a new xmlChar * containing the concatenated string. The original
 527  * @cur is reallocated and should not be freed.
 528  */
 529 xmlChar *
 530 xmlStrcat(xmlChar *cur, const xmlChar *add) {
 531     const xmlChar *p = add;
 532
 533     if (add == NULL) return(cur);
 534     if (cur == NULL)
 535         return(xmlStrdup(add));
 536
 537     while (*p != 0) p++; /* non input consuming */
 538     return(xmlStrncat(cur, add, p - add));
 539 }
 540
 541 /**
 542  * xmlStrPrintf:
 543  * @buf:   the result buffer.
 544  * @len:   the result buffer length.
 545  * @msg:   the message with printf formatting.
 546  * @...:   extra parameters for the message.
 547  *
 548  * Formats @msg and places result into @buf.
 549  *
 550  * Returns the number of characters written to @buf or -1 if an error occurs.
 551  */
 552 int XMLCDECL
 553 xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
 554     va_list args;
 555     int ret;
 556
 557     if((buf == NULL) || (msg == NULL)) {
 558         return(-1);
 559     }
 560
 561     va_start(args, msg);
 562     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
 563     va_end(args);
 564     buf[len - 1] = 0; /* be safe ! */
 565
 566     return(ret);
 567 }
 568
 569 /**
 570  * xmlStrVPrintf:
 571  * @buf:   the result buffer.
 572  * @len:   the result buffer length.
 573  * @msg:   the message with printf formatting.
 574  * @ap:    extra parameters for the message.
 575  *
 576  * Formats @msg and places result into @buf.
 577  *
 578  * Returns the number of characters written to @buf or -1 if an error occurs.
 579  */
 580 int
 581 xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
 582     int ret;
 583
 584     if((buf == NULL) || (msg == NULL)) {
 585         return(-1);
 586     }
 587
 588     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
 589     buf[len - 1] = 0; /* be safe ! */
 590
 591     return(ret);
 592 }
 593
 594 /************************************************************************
 595  *                                                                      *
 596  *              Generic UTF8 handling routines                          *
 597  *                                                                      *
 598  * From rfc2044: encoding of the Unicode values on UTF-8:               *
 599  *                                                                      *
 600  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 601  * 0000 0000-0000 007F   0xxxxxxx                                       *
 602  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 603  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 604  *                                                                      *
 605  * I hope we won't use values > 0xFFFF anytime soon !                   *
 606  *                                                                      *
 607  ************************************************************************/
 608
 609
 610 /**
 611  * xmlUTF8Size:
 612  * @utf: pointer to the UTF8 character
 613  *
 614  * calculates the internal size of a UTF8 character
 615  *
 616  * returns the numbers of bytes in the character, -1 on format error
 617  */
 618 int
 619 xmlUTF8Size(const xmlChar *utf) {
 620     xmlChar mask;
 621     int len;
 622
 623     if (utf == NULL)
 624         return -1;
 625     if (*utf < 0x80)
 626         return 1;
 627     /* check valid UTF8 character */
 628     if (!(*utf & 0x40))
 629         return -1;
 630     /* determine number of bytes in char */
 631     len = 2;
 632     for (mask=0x20; mask != 0; mask>>=1) {
 633         if (!(*utf & mask))
 634             return len;
 635         len++;
 636     }
 637     return -1;
 638 }
 639
 640 /**
 641  * xmlUTF8Charcmp:
 642  * @utf1: pointer to first UTF8 char
 643  * @utf2: pointer to second UTF8 char
 644  *
 645  * compares the two UCS4 values
 646  *
 647  * returns result of the compare as with xmlStrncmp
 648  */
 649 int
 650 xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
 651
 652     if (utf1 == NULL ) {
 653         if (utf2 == NULL)
 654             return 0;
 655         return -1;
 656     }
 657     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
 658 }
 659
 660 /**
 661  * xmlUTF8Strlen:
 662  * @utf:  a sequence of UTF-8 encoded bytes
 663  *
 664  * compute the length of an UTF8 string, it doesn't do a full UTF8
 665  * checking of the content of the string.
 666  *
 667  * Returns the number of characters in the string or -1 in case of error
 668  */
 669 int
 670 xmlUTF8Strlen(const xmlChar *utf) {
 671     size_t ret = 0;
 672
 673     if (utf == NULL)
 674         return(-1);
 675
 676     while (*utf != 0) {
 677         if (utf[0] & 0x80) {
 678             if ((utf[1] & 0xc0) != 0x80)
 679                 return(-1);
 680             if ((utf[0] & 0xe0) == 0xe0) {
 681                 if ((utf[2] & 0xc0) != 0x80)
 682                     return(-1);
 683                 if ((utf[0] & 0xf0) == 0xf0) {
 684                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 685                         return(-1);
 686                     utf += 4;
 687                 } else {
 688                     utf += 3;
 689                 }
 690             } else {
 691                 utf += 2;
 692             }
 693         } else {
 694             utf++;
 695         }
 696         ret++;
 697     }
 698     return(ret > INT_MAX ? 0 : ret);
 699 }
 700
 701 /**
 702  * xmlGetUTF8Char:
 703  * @utf:  a sequence of UTF-8 encoded bytes
 704  * @len:  a pointer to the minimum number of bytes present in
 705  *        the sequence.  This is used to assure the next character
 706  *        is completely contained within the sequence.
 707  *
 708  * Read the first UTF8 character from @utf
 709  *
 710  * Returns the char value or -1 in case of error, and sets *len to
 711  *        the actual number of bytes consumed (0 in case of error)
 712  */
 713 int
 714 xmlGetUTF8Char(const unsigned char *utf, int *len) {
 715     unsigned int c;
 716
 717     if (utf == NULL)
 718         goto error;
 719     if (len == NULL)
 720         goto error;
 721     if (*len < 1)
 722         goto error;
 723
 724     c = utf[0];
 725     if (c & 0x80) {
 726         if (*len < 2)
 727             goto error;
 728         if ((utf[1] & 0xc0) != 0x80)
 729             goto error;
 730         if ((c & 0xe0) == 0xe0) {
 731             if (*len < 3)
 732                 goto error;
 733             if ((utf[2] & 0xc0) != 0x80)
 734                 goto error;
 735             if ((c & 0xf0) == 0xf0) {
 736                 if (*len < 4)
 737                     goto error;
 738                 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 739                     goto error;
 740                 *len = 4;
 741                 /* 4-byte code */
 742                 c = (utf[0] & 0x7) << 18;
 743                 c |= (utf[1] & 0x3f) << 12;
 744                 c |= (utf[2] & 0x3f) << 6;
 745                 c |= utf[3] & 0x3f;
 746             } else {
 747               /* 3-byte code */
 748                 *len = 3;
 749                 c = (utf[0] & 0xf) << 12;
 750                 c |= (utf[1] & 0x3f) << 6;
 751                 c |= utf[2] & 0x3f;
 752             }
 753         } else {
 754           /* 2-byte code */
 755             *len = 2;
 756             c = (utf[0] & 0x1f) << 6;
 757             c |= utf[1] & 0x3f;
 758         }
 759     } else {
 760         /* 1-byte code */
 761         *len = 1;
 762     }
 763     return(c);
 764
 765 error:
 766     if (len != NULL)
 767         *len = 0;
 768     return(-1);
 769 }
 770
 771 /**
 772  * xmlCheckUTF8:
 773  * @utf: Pointer to putative UTF-8 encoded string.
 774  *
 775  * Checks @utf for being valid UTF-8. @utf is assumed to be
 776  * null-terminated. This function is not super-strict, as it will
 777  * allow longer UTF-8 sequences than necessary. Note that Java is
 778  * capable of producing these sequences if provoked. Also note, this
 779  * routine checks for the 4-byte maximum size, but does not check for
 780  * 0x10ffff maximum value.
 781  *
 782  * Return value: true if @utf is valid.
 783  **/
 784 int
 785 xmlCheckUTF8(const unsigned char *utf)
 786 {
 787     int ix;
 788     unsigned char c;
 789
 790     if (utf == NULL)
 791         return(0);
 792     /*
 793      * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
 794      * are as follows (in "bit format"):
 795      *    0xxxxxxx                                      valid 1-byte
 796      *    110xxxxx 10xxxxxx                             valid 2-byte
 797      *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 798      *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 799      */
 800     while ((c = utf[0])) {      /* string is 0-terminated */
 801         ix = 0;
 802         if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 10 */
 803             ix = 1;
 804         } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
 805             if ((utf[1] & 0xc0 ) != 0x80)
 806                 return 0;
 807             ix = 2;
 808         } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
 809             if (((utf[1] & 0xc0) != 0x80) ||
 810                 ((utf[2] & 0xc0) != 0x80))
 811                     return 0;
 812             ix = 3;
 813         } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
 814             if (((utf[1] & 0xc0) != 0x80) ||
 815                 ((utf[2] & 0xc0) != 0x80) ||
 816                 ((utf[3] & 0xc0) != 0x80))
 817                     return 0;
 818             ix = 4;
 819         } else                          /* unknown encoding */
 820             return 0;
 821         utf += ix;
 822       }
 823       return(1);
 824 }
 825
 826 /**
 827  * xmlUTF8Strsize:
 828  * @utf:  a sequence of UTF-8 encoded bytes
 829  * @len:  the number of characters in the array
 830  *
 831  * storage size of an UTF8 string
 832  * the behaviour is not guaranteed if the input string is not UTF-8
 833  *
 834  * Returns the storage size of
 835  * the first 'len' characters of ARRAY
 836  */
 837
 838 int
 839 xmlUTF8Strsize(const xmlChar *utf, int len) {
 840     const xmlChar *ptr=utf;
 841     int ch;
 842     size_t ret;
 843
 844     if (utf == NULL)
 845         return(0);
 846
 847     if (len <= 0)
 848         return(0);
 849
 850     while ( len-- > 0) {
 851         if ( !*ptr )
 852             break;
 853         if ( (ch = *ptr++) & 0x80)
 854             while ((ch<<=1) & 0x80 ) {
 855                 if (*ptr == 0) break;
 856                 ptr++;
 857             }
 858     }
 859     ret = ptr - utf;
 860     return (ret > INT_MAX ? 0 : ret);
 861 }
 862
 863
 864 /**
 865  * xmlUTF8Strndup:
 866  * @utf:  the input UTF8 *
 867  * @len:  the len of @utf (in chars)
 868  *
 869  * a strndup for array of UTF8's
 870  *
 871  * Returns a new UTF8 * or NULL
 872  */
 873 xmlChar *
 874 xmlUTF8Strndup(const xmlChar *utf, int len) {
 875     xmlChar *ret;
 876     int i;
 877
 878     if ((utf == NULL) || (len < 0)) return(NULL);
 879     i = xmlUTF8Strsize(utf, len);
 880     ret = (xmlChar *) xmlMallocAtomic(((size_t) i + 1) * sizeof(xmlChar));
 881     if (ret == NULL) {
 882         return(NULL);
 883     }
 884     memcpy(ret, utf, i * sizeof(xmlChar));
 885     ret[i] = 0;
 886     return(ret);
 887 }
 888
 889 /**
 890  * xmlUTF8Strpos:
 891  * @utf:  the input UTF8 *
 892  * @pos:  the position of the desired UTF8 char (in chars)
 893  *
 894  * a function to provide the equivalent of fetching a
 895  * character from a string array
 896  *
 897  * Returns a pointer to the UTF8 character or NULL
 898  */
 899 const xmlChar *
 900 xmlUTF8Strpos(const xmlChar *utf, int pos) {
 901     int ch;
 902
 903     if (utf == NULL) return(NULL);
 904     if (pos < 0)
 905         return(NULL);
 906     while (pos--) {
 907         if ((ch=*utf++) == 0) return(NULL);
 908         if ( ch & 0x80 ) {
 909             /* if not simple ascii, verify proper format */
 910             if ( (ch & 0xc0) != 0xc0 )
 911                 return(NULL);
 912             /* then skip over remaining bytes for this char */
 913             while ( (ch <<= 1) & 0x80 )
 914                 if ( (*utf++ & 0xc0) != 0x80 )
 915                     return(NULL);
 916         }
 917     }
 918     return((xmlChar *)utf);
 919 }
 920
 921 /**
 922  * xmlUTF8Strloc:
 923  * @utf:  the input UTF8 *
 924  * @utfchar:  the UTF8 character to be found
 925  *
 926  * a function to provide the relative location of a UTF8 char
 927  *
 928  * Returns the relative character position of the desired char
 929  * or -1 if not found
 930  */
 931 int
 932 xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
 933     size_t i;
 934     int size;
 935     int ch;
 936
 937     if (utf==NULL || utfchar==NULL) return -1;
 938     size = xmlUTF8Strsize(utfchar, 1);
 939         for(i=0; (ch=*utf) != 0; i++) {
 940             if (xmlStrncmp(utf, utfchar, size)==0)
 941                 return(i > INT_MAX ? 0 : i);
 942             utf++;
 943             if ( ch & 0x80 ) {
 944                 /* if not simple ascii, verify proper format */
 945                 if ( (ch & 0xc0) != 0xc0 )
 946                     return(-1);
 947                 /* then skip over remaining bytes for this char */
 948                 while ( (ch <<= 1) & 0x80 )
 949                     if ( (*utf++ & 0xc0) != 0x80 )
 950                         return(-1);
 951             }
 952         }
 953
 954     return(-1);
 955 }
 956 /**
 957  * xmlUTF8Strsub:
 958  * @utf:  a sequence of UTF-8 encoded bytes
 959  * @start: relative pos of first char
 960  * @len:   total number to copy
 961  *
 962  * Create a substring from a given UTF-8 string
 963  * Note:  positions are given in units of UTF-8 chars
 964  *
 965  * Returns a pointer to a newly created string
 966  * or NULL if any problem
 967  */
 968
 969 xmlChar *
 970 xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
 971     int i;
 972     int ch;
 973
 974     if (utf == NULL) return(NULL);
 975     if (start < 0) return(NULL);
 976     if (len < 0) return(NULL);
 977
 978     /*
 979      * Skip over any leading chars
 980      */
 981     for (i = 0;i < start;i++) {
 982         if ((ch=*utf++) == 0) return(NULL);
 983         if ( ch & 0x80 ) {
 984             /* if not simple ascii, verify proper format */
 985             if ( (ch & 0xc0) != 0xc0 )
 986                 return(NULL);
 987             /* then skip over remaining bytes for this char */
 988             while ( (ch <<= 1) & 0x80 )
 989                 if ( (*utf++ & 0xc0) != 0x80 )
 990                     return(NULL);
 991         }
 992     }
 993
 994     return(xmlUTF8Strndup(utf, len));
 995 }
 996
 997 /**
 998  * xmlEscapeFormatString:
 999  * @msg:  a pointer to the string in which to escape '%' characters.
1000  * Must be a heap-allocated buffer created by libxml2 that may be
1001  * returned, or that may be freed and replaced.
1002  *
1003  * Replaces the string pointed to by 'msg' with an escaped string.
1004  * Returns the same string with all '%' characters escaped.
1005  */
1006 xmlChar *
1007 xmlEscapeFormatString(xmlChar **msg)
1008 {
1009     xmlChar *msgPtr = NULL;
1010     xmlChar *result = NULL;
1011     xmlChar *resultPtr = NULL;
1012     size_t count = 0;
1013     size_t msgLen = 0;
1014     size_t resultLen = 0;
1015
1016     if (!msg || !*msg)
1017         return(NULL);
1018
1019     for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1020         ++msgLen;
1021         if (*msgPtr == '%')
1022             ++count;
1023     }
1024
1025     if (count == 0)
1026         return(*msg);
1027
1028     if ((count > INT_MAX) || (msgLen > INT_MAX - count))
1029         return(NULL);
1030     resultLen = msgLen + count + 1;
1031     result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1032     if (result == NULL) {
1033         /* Clear *msg to prevent format string vulnerabilities in
1034            out-of-memory situations. */
1035         xmlFree(*msg);
1036         *msg = NULL;
1037         xmlErrMemory(NULL, NULL);
1038         return(NULL);
1039     }
1040
1041     for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1042         *resultPtr = *msgPtr;
1043         if (*msgPtr == '%')
1044             *(++resultPtr) = '%';
1045     }
1046     result[resultLen - 1] = '\0';
1047
1048     xmlFree(*msg);
1049     *msg = result;
1050
1051     return *msg;
1052 }
1053
1054 #define bottom_xmlstring
1055 #include "elfgcchack.h"