source/texk/web2c/luatexdir/unilib/ustring.c

   1 /* Copyright (C) 2000-2012 by George Williams */
   2 /*
   3  * Redistribution and use in source and binary forms, with or without
   4  * modification, are permitted provided that the following conditions are met:
   5
   6  * Redistributions of source code must retain the above copyright notice, this
   7  * list of conditions and the following disclaimer.
   8
   9  * Redistributions in binary form must reproduce the above copyright notice,
  10  * this list of conditions and the following disclaimer in the documentation
  11  * and/or other materials provided with the distribution.
  12
  13  * The name of the author may not be used to endorse or promote products
  14  * derived from this software without specific prior written permission.
  15
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  17  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  18  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  19  * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  22  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  24  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  25  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27 #include "ustring.h"
  28 #include "utype.h"
  29 #include <stddef.h>
  30
  31 long uc_strcmp(const unichar_t *str1,const char *str2) {
  32     long ch1, ch2;
  33     for (;;) {
  34         ch1 = *str1++; ch2 = *(unsigned char *) str2++ ;
  35         if ( ch1!=ch2 || ch1=='\0' )
  36 return(ch1-ch2);
  37     }
  38 }
  39
  40 long uc_strncmp(const unichar_t *str1,const char *str2,int n) {
  41     long ch1, ch2;
  42     while ( --n>=0 ) {
  43         ch1 = *str1++; ch2 = *(unsigned char *) str2++ ;
  44         if ( ch1!=ch2 || ch1=='\0' )
  45 return(ch1-ch2);
  46     }
  47 return( 0 );
  48 }
  49
  50 long uc_strmatch(const unichar_t *str1, const char *str2) {
  51     long ch1, ch2;
  52     for (;;) {
  53         ch1 = *str1++; ch2 = *(unsigned char *) str2++ ;
  54         ch1 = tolower(ch1);
  55         ch2 = tolower(ch2);
  56         if ( ch1!=ch2 || ch1=='\0' )
  57 return(ch1-ch2);
  58     }
  59 }
  60
  61 long uc_strnmatch(const unichar_t *str1, const char *str2, int len) {
  62     long ch1, ch2;
  63     for (;--len>=0;) {
  64         ch1 = *str1++; ch2 = *(unsigned char *) str2++ ;
  65         ch1 = tolower(ch1);
  66         ch2 = tolower(ch2);
  67         if ( ch1!=ch2 || ch1=='\0' || len<=0 )
  68 return(ch1-ch2);
  69     }
  70 return( 0 );
  71 }
  72
  73 long u_strnmatch(const unichar_t *str1, const unichar_t *str2, int len) {
  74     long ch1, ch2;
  75     for (;--len>=0;) {
  76         ch1 = *str1++; ch2 = *str2++ ;
  77         ch1 = tolower(ch1);
  78         ch2 = tolower(ch2);
  79         if ( ch1!=ch2 || ch1=='\0' || len<=0 )
  80 return(ch1-ch2);
  81     }
  82 return( 0 );
  83 }
  84
  85 long u_strcmp(const unichar_t *str1,const unichar_t *str2) {
  86     long ch1, ch2;
  87     for (;;) {
  88         ch1 = *str1++; ch2 = *str2++ ;
  89         if ( ch1!=ch2 || ch1=='\0' )
  90 return(ch1-ch2);
  91     }
  92 }
  93
  94 long u_strncmp(const unichar_t *str1,const unichar_t *str2,int n) {
  95     long ch1, ch2;
  96     while ( --n>=0 ) {
  97         ch1 = *str1++; ch2 = *str2++ ;
  98         if ( ch1!=ch2 || ch1=='\0' )
  99 return(ch1-ch2);
 100     }
 101 return( 0 );
 102 }
 103
 104 long u_strmatch(const unichar_t *str1, const unichar_t *str2) {
 105     long ch1, ch2;
 106     for (;;) {
 107         ch1 = *str1++; ch2 = *str2++ ;
 108         ch1 = tolower(ch1);
 109         ch2 = tolower(ch2);
 110         if ( ch1!=ch2 || ch1=='\0' )
 111 return(ch1-ch2);
 112     }
 113 }
 114
 115 void cu_strcpy(char *to, const unichar_t *from) {
 116     register unichar_t ch;
 117     while ( (ch = *from++) != '\0' )
 118         *(to++) = ch;
 119     *to = 0;
 120 }
 121
 122 void uc_strcpy(unichar_t *to, const char *from) {
 123     register unichar_t ch;
 124     while ( (ch = *(unsigned char *) from++) != '\0' )
 125         *(to++) = ch;
 126     *to = 0;
 127 }
 128
 129 void u_strcpy(unichar_t *to, const unichar_t *from) {
 130     register unichar_t ch;
 131     while ( (ch = *from++) != '\0' )
 132         *(to++) = ch;
 133     *to = 0;
 134 }
 135
 136 void u_strncpy(register unichar_t *to, const unichar_t *from, int len) {
 137     register unichar_t ch;
 138     while ( (ch = *from++) != '\0' && --len>=0 )
 139         *(to++) = ch;
 140     *to = 0;
 141 }
 142
 143 void cu_strncpy(register char *to, const unichar_t *from, int len) {
 144     register unichar_t ch;
 145     while ( (ch = *from++) != '\0' && --len>=0 )
 146         *(to++) = ch;
 147     *to = 0;
 148 }
 149
 150 void uc_strncpy(register unichar_t *to, const char *from, int len) {
 151     register unichar_t ch;
 152     while ( (ch = *(unsigned char *) from++) != '\0' && --len>=0 )
 153         *(to++) = ch;
 154     *to = 0;
 155 }
 156
 157 void uc_strcat(unichar_t *to, const char *from) {
 158     uc_strcpy(to+u_strlen(to),from);
 159 }
 160
 161 void uc_strncat(unichar_t *to, const char *from,int len) {
 162     uc_strncpy(to+u_strlen(to),from,len);
 163 }
 164
 165 void cu_strcat(char *to, const unichar_t *from) {
 166     cu_strcpy(to+strlen(to),from);
 167 }
 168
 169 void cu_strncat(char *to, const unichar_t *from, int len) {
 170     cu_strncpy(to+strlen(to),from,len);
 171 }
 172
 173 void u_strcat(unichar_t *to, const unichar_t *from) {
 174     u_strcpy(to+u_strlen(to),from);
 175 }
 176
 177 void u_strncat(unichar_t *to, const unichar_t *from, int len) {
 178     u_strncpy(to+u_strlen(to),from,len);
 179 }
 180
 181 int  u_strlen(register const unichar_t *str) {
 182     register int len = 0;
 183
 184     while ( *str++!='\0' )
 185         ++len;
 186 return( len );
 187 }
 188
 189 unichar_t *u_strchr(const unichar_t *str ,unichar_t ch) {
 190     register unichar_t test;
 191
 192     while ( (test=*(str++))!='\0' )
 193         if ( test==ch )
 194 return( (unichar_t *) str-1 );
 195
 196 return( NULL );
 197 }
 198
 199 unichar_t *u_strrchr(const unichar_t *str ,unichar_t ch) {
 200     register unichar_t test, *last = NULL;
 201
 202     while ( (test=*(str++))!='\0' )
 203         if ( test==ch )
 204             last = (unichar_t *) str-1;
 205
 206 return( last );
 207 }
 208
 209 unichar_t *uc_strstr(const unichar_t *longer, const char *substr) {
 210     long ch1, ch2;
 211     const unichar_t *lpt, *str1; const char *str2;
 212
 213     for ( lpt=longer; *lpt!='\0'; ++lpt ) {
 214         str1 = lpt; str2 = substr;
 215         for (;;) {
 216             ch1 = *str1++; ch2 = *(unsigned char *) str2++ ;
 217             if ( ch2=='\0' )
 218 return((unichar_t *) lpt);
 219             if ( ch1!=ch2 )
 220         break;
 221         }
 222     }
 223 return( NULL );
 224 }
 225
 226 unichar_t *u_strstr(const unichar_t *longer, const unichar_t *substr) {
 227     long ch1, ch2;
 228     const unichar_t *lpt, *str1, *str2;
 229
 230     for ( lpt=longer; *lpt!='\0'; ++lpt ) {
 231         str1 = lpt; str2 = substr;
 232         for (;;) {
 233             ch1 = *str1++; ch2 = *str2++ ;
 234             if ( ch2=='\0' )
 235 return((unichar_t *) lpt);
 236             if ( ch1!=ch2 )
 237         break;
 238         }
 239     }
 240 return( NULL );
 241 }
 242
 243 unichar_t *uc_strstrmatch(const unichar_t *longer, const char *substr) {
 244     long ch1, ch2;
 245     const unichar_t *lpt, *str1; const unsigned char *str2;
 246
 247     for ( lpt=longer; *lpt!='\0'; ++lpt ) {
 248         str1 = lpt; str2 = (unsigned char *) substr;
 249         for (;;) {
 250             ch1 = *str1++; ch2 = *str2++ ;
 251             ch1 = tolower(ch1);
 252             ch2 = tolower(ch2);
 253             if ( ch2=='\0' )
 254 return((unichar_t *) lpt);
 255             if ( ch1!=ch2 )
 256         break;
 257         }
 258     }
 259 return( NULL );
 260 }
 261
 262 unichar_t *u_strstrmatch(const unichar_t *longer, const unichar_t *substr) {
 263     long ch1, ch2;
 264     const unichar_t *lpt, *str1, *str2;
 265
 266     for ( lpt=longer; *lpt!='\0'; ++lpt ) {
 267         str1 = lpt; str2 = substr;
 268         for (;;) {
 269             ch1 = *str1++; ch2 = *str2++ ;
 270             ch1 = tolower(ch1);
 271             ch2 = tolower(ch2);
 272             if ( ch2=='\0' )
 273 return((unichar_t *) lpt);
 274             if ( ch1!=ch2 )
 275         break;
 276         }
 277     }
 278 return( NULL );
 279 }
 280
 281 unichar_t *u_copyn(const unichar_t *pt, long n) {
 282     unichar_t *res;
 283 #ifdef MEMORY_MASK
 284     if ( n*sizeof(unichar_t)>=MEMORY_MASK )
 285         n = MEMORY_MASK/sizeof(unichar_t)-1;
 286 #endif
 287     res = (unichar_t *) xmalloc((n+1)*sizeof(unichar_t));
 288     memcpy(res,pt,n*sizeof(unichar_t));
 289     res[n]='\0';
 290 return(res);
 291 }
 292
 293 unichar_t *u_copynallocm(const unichar_t *pt, long n, long m) {
 294     unichar_t *res;
 295 #ifdef MEMORY_MASK
 296     if ( n*sizeof(unichar_t)>=MEMORY_MASK )
 297         n = MEMORY_MASK/sizeof(unichar_t)-1;
 298 #endif
 299     res = xmalloc((m+1)*sizeof(unichar_t));
 300     memcpy(res,pt,n*sizeof(unichar_t));
 301     res[n]='\0';
 302 return(res);
 303 }
 304
 305 unichar_t *u_copy(const unichar_t *pt) {
 306     if(pt)
 307 return u_copyn(pt,u_strlen(pt));
 308
 309 return((unichar_t *)0);
 310 }
 311
 312 unichar_t *u_concat(const unichar_t *s1, const unichar_t *s2) {
 313     long len1, len2;
 314     unichar_t *pt;
 315
 316     if ( s1==NULL )
 317 return( u_copy( s2 ));
 318     else if ( s2==NULL )
 319 return( u_copy( s1 ));
 320     len1 = u_strlen(s1); len2 = u_strlen(s2);
 321     pt = (unichar_t *) xmalloc((len1+len2+1)*sizeof(unichar_t));
 322     u_strcpy(pt,s1);
 323     u_strcpy(pt+len1,s2);
 324 return( pt );
 325 }
 326
 327 unichar_t *uc_copyn(const char *pt,int len) {
 328     unichar_t *res, *rpt;
 329
 330     if(!pt)
 331 return((unichar_t *)0);
 332
 333 #ifdef MEMORY_MASK
 334     if ( (len+1)*sizeof(unichar_t)>=MEMORY_MASK )
 335         len = MEMORY_MASK/sizeof(unichar_t)-1;
 336 #endif
 337     res = (unichar_t *) xmalloc((len+1)*sizeof(unichar_t));
 338     for ( rpt=res; --len>=0 ; *rpt++ = *(unsigned char *) pt++ );
 339     *rpt = '\0';
 340 return(res);
 341 }
 342
 343 unichar_t *uc_copy(const char *pt) {
 344     unichar_t *res, *rpt;
 345     int n;
 346
 347     if(!pt)
 348 return((unichar_t *)0);
 349
 350     n = strlen(pt);
 351 #ifdef MEMORY_MASK
 352     if ( (n+1)*sizeof(unichar_t)>=MEMORY_MASK )
 353         n = MEMORY_MASK/sizeof(unichar_t)-1;
 354 #endif
 355     res = (unichar_t *) xmalloc((n+1)*sizeof(unichar_t));
 356     for ( rpt=res; --n>=0 ; *rpt++ = *(unsigned char *) pt++ );
 357     *rpt = '\0';
 358 return(res);
 359 }
 360
 361 char *cu_copyn(const unichar_t *pt,int len) {
 362     char *res, *rpt;
 363
 364     if(!pt)
 365 return(NULL);
 366
 367 #ifdef MEMORY_MASK
 368     if ( (len+1)>=MEMORY_MASK )
 369         len = MEMORY_MASK-1;
 370 #endif
 371     res = (char *) xmalloc(len+1);
 372     for ( rpt=res; --len>=0 ; *rpt++ = *pt++ );
 373     *rpt = '\0';
 374 return(res);
 375 }
 376
 377 char *cu_copy(const unichar_t *pt) {
 378     char *res, *rpt;
 379     int n;
 380
 381     if(!pt)
 382 return((char *)0);
 383
 384     n = u_strlen(pt);
 385 #ifdef MEMORY_MASK
 386     if ( (n+1)>=MEMORY_MASK )
 387         n = MEMORY_MASK/sizeof(unichar_t)-1;
 388 #endif
 389     res = (char *) xmalloc(n+1);
 390     for ( rpt=res; --n>=0 ; *rpt++ = *pt++ );
 391     *rpt = '\0';
 392 return(res);
 393 }
 394
 395 double u_strtod(const unichar_t *str, unichar_t **ptr) {
 396     char buf[60], *pt, *ret;
 397     const unichar_t *upt;
 398     double val;
 399     extern double strtod();             /* Please don't delete this, not all of us have good ansi headers */
 400
 401     for ( upt=str, pt=buf; *upt<128 && *upt!='\0' && pt-buf<sizeof(buf)-1; )
 402         *pt++ = *upt++;
 403     *pt = '\0';
 404     val = strtod(buf,&ret);
 405     if ( ptr!=NULL ) {
 406         if ( pt==ret )
 407             *ptr = (unichar_t *) upt;
 408         else
 409             *ptr = (unichar_t *) (str + (ret-buf));
 410     }
 411 return( val );
 412 }
 413
 414 long u_strtol(const unichar_t *str, unichar_t **ptr, int base) {
 415     char buf[60], *pt, *ret;
 416     const unichar_t *upt;
 417     long val;
 418     extern long strtol();               /* Please don't delete this, not all of us have good ansi headers */
 419
 420     for ( upt=str, pt=buf; *upt<128 && *upt!='\0' && pt<buf+sizeof(buf)-1; )
 421         *pt++ = *upt++;
 422     *pt = '\0';
 423     val = strtol(buf,&ret,base);
 424     if ( ptr!=NULL ) {
 425         if ( pt==ret )
 426             *ptr = (unichar_t *) upt;
 427         else
 428             *ptr = (unichar_t *) (str + (ret-buf));
 429     }
 430 return( val );
 431 }
 432
 433 unsigned long u_strtoul(const unichar_t *str, unichar_t **ptr, int base) {
 434     char buf[60], *pt, *ret;
 435     const unichar_t *upt;
 436     unsigned long val;
 437
 438     for ( upt=str, pt=buf; *upt<128 && *upt!='\0' && pt<buf+sizeof(buf)-1; )
 439         *pt++ = *upt++;
 440     *pt = '\0';
 441     val = strtoul(buf,&ret,base);
 442     if ( ptr!=NULL ) {
 443         if ( pt==ret )
 444             *ptr = (unichar_t *) upt;
 445         else
 446             *ptr = (unichar_t *) (str + (ret-buf));
 447     }
 448 return( val );
 449 }
 450
 451 unichar_t *cu_strstartmatch(const char *key,const unichar_t *str) {
 452     if ( key && str ) {
 453         while( *key ) {
 454             if(tolower(*key) != tolower(*str))
 455 return 0;
 456             key++;
 457             str++;
 458         }
 459     }
 460 return (unichar_t *)str;
 461 }
 462
 463 unichar_t *u_strstartmatch(const unichar_t *initial, const unichar_t *full) {
 464     int ch1, ch2;
 465     for (;;) {
 466         ch1 = *initial++; ch2 = *full++ ;
 467         if ( ch1=='\0' )
 468 return( (unichar_t *) full );
 469         ch1 = tolower(ch1);
 470         ch2 = tolower(ch2);
 471         if ( ch1!=ch2 || ch1=='\0' )
 472 return(NULL);
 473     }
 474 }
 475
 476 char *u_to_c(const unichar_t *ubuf) {
 477     static char buf[400];
 478     cu_strncpy(buf,ubuf,sizeof(buf));
 479 return( buf );
 480 }
 481
 482 unichar_t *c_to_u(const char *buf) {
 483     static unichar_t ubuf[400];
 484     uc_strncpy(ubuf,buf,sizeof(ubuf));
 485 return( ubuf );
 486 }
 487
 488 unichar_t *utf82u_strncpy(unichar_t *ubuf,const char *utf8buf,int len) {
 489     unichar_t *upt=ubuf, *uend=ubuf+len-1;
 490     const uint8 *pt = (const uint8 *) utf8buf, *end = pt+strlen(utf8buf);
 491     int w, w2;
 492
 493     while ( pt<end && *pt!='\0' && upt<uend ) {
 494         if ( *pt<=127 )
 495             *upt = *pt++;
 496         else if ( *pt<=0xdf ) {
 497             *upt = ((*pt&0x1f)<<6) | (pt[1]&0x3f);
 498             pt += 2;
 499         } else if ( *pt<=0xef ) {
 500             *upt = ((*pt&0xf)<<12) | ((pt[1]&0x3f)<<6) | (pt[2]&0x3f);
 501             pt += 3;
 502         } else {
 503             w = ( ((*pt&0x7)<<2) | ((pt[1]&0x30)>>4) )-1;
 504             w = (w<<6) | ((pt[1]&0xf)<<2) | ((pt[2]&0x30)>>4);
 505             w2 = ((pt[2]&0xf)<<6) | (pt[3]&0x3f);
 506             *upt = w*0x400 + w2 + 0x10000;
 507             pt += 4;
 508         }
 509         ++upt;
 510     }
 511     *upt = '\0';
 512 return( ubuf );
 513 }
 514
 515 unichar_t *utf82u_strcpy(unichar_t *ubuf,const char *utf8buf) {
 516 return( utf82u_strncpy(ubuf,utf8buf,strlen(utf8buf)+1));
 517 }
 518
 519 unichar_t *utf82u_copyn(const char *utf8buf,int len) {
 520     unichar_t *ubuf = (unichar_t *) xmalloc((len+1)*sizeof(unichar_t));
 521 return( utf82u_strncpy(ubuf,utf8buf,len+1));
 522 }
 523
 524 unichar_t *utf82u_copy(const char *utf8buf) {
 525     int len;
 526     unichar_t *ubuf;
 527
 528     if ( utf8buf==NULL )
 529 return( NULL );
 530
 531     len = strlen(utf8buf);
 532     ubuf = (unichar_t *) xmalloc((len+1)*sizeof(unichar_t));
 533 return( utf82u_strncpy(ubuf,utf8buf,len+1));
 534 }
 535
 536 void utf82u_strcat(unichar_t *to,const char *from) {
 537     utf82u_strcpy(to+u_strlen(to),from);
 538 }
 539
 540 char *u2utf8_strcpy(char *utf8buf,const unichar_t *ubuf) {
 541     char *pt = utf8buf;
 542
 543     while ( *ubuf ) {
 544         if ( *ubuf<0x80 )
 545             *pt++ = *ubuf;
 546         else if ( *ubuf<0x800 ) {
 547             *pt++ = 0xc0 | (*ubuf>>6);
 548             *pt++ = 0x80 | (*ubuf&0x3f);
 549         } else if ( *ubuf < 0x10000 ) {
 550             *pt++ = 0xe0 | (*ubuf>>12);
 551             *pt++ = 0x80 | ((*ubuf>>6)&0x3f);
 552             *pt++ = 0x80 | (*ubuf&0x3f);
 553         } else {
 554             uint32 val = *ubuf-0x10000;
 555             int u = ((val&0xf0000)>>16)+1, z=(val&0x0f000)>>12, y=(val&0x00fc0)>>6, x=val&0x0003f;
 556             *pt++ = 0xf0 | (u>>2);
 557             *pt++ = 0x80 | ((u&3)<<4) | z;
 558             *pt++ = 0x80 | y;
 559             *pt++ = 0x80 | x;
 560         }
 561         ++ubuf;
 562     }
 563     *pt = '\0';
 564 return( utf8buf );
 565 }
 566
 567 char *utf8_strchr(const char *str, int search) {
 568     int ch;
 569     const char *old = str;
 570
 571     while ( (ch = utf8_ildb(&str))!=0 ) {
 572         if ( ch==search )
 573 return( (char *) old );
 574         old = str;
 575     }
 576 return( NULL );
 577 }
 578
 579 char *latin1_2_utf8_strcpy(char *utf8buf,const char *lbuf) {
 580     char *pt = utf8buf;
 581     const unsigned char *lpt = (const unsigned char *) lbuf;
 582
 583     while ( *lpt ) {
 584         if ( *lpt<0x80 )
 585             *pt++ = *lpt;
 586         else {
 587             *pt++ = 0xc0 | (*lpt>>6);
 588             *pt++ = 0x80 | (*lpt&0x3f);
 589         }
 590         ++lpt;
 591     }
 592     *pt = '\0';
 593 return( utf8buf );
 594 }
 595
 596 char *latin1_2_utf8_copy(const char *lbuf) {
 597     int len;
 598     char *utf8buf;
 599
 600     if ( lbuf==NULL )
 601 return( NULL );
 602
 603     len = strlen(lbuf);
 604     utf8buf = (char *) xmalloc(2*len+1);
 605 return( latin1_2_utf8_strcpy(utf8buf,lbuf));
 606 }
 607
 608 char *utf8_2_latin1_copy(const char *utf8buf) {
 609     int len;
 610     int ch;
 611     char *lbuf, *pt; const char *upt;
 612
 613     if ( utf8buf==NULL )
 614 return( NULL );
 615
 616     len = strlen(utf8buf);
 617     pt = lbuf = (char *) xmalloc(len+1);
 618     for ( upt=utf8buf; (ch=utf8_ildb(&upt))!='\0'; )
 619         if ( ch>=0xff )
 620             *pt++ = '?';
 621         else
 622             *pt++ = ch;
 623     *pt = '\0';
 624 return( lbuf );
 625 }
 626
 627 char *u2utf8_copy(const unichar_t *ubuf) {
 628     int len;
 629     char *utf8buf;
 630
 631     if ( ubuf==NULL )
 632 return( NULL );
 633
 634     len = u_strlen(ubuf);
 635     utf8buf = (char *) xmalloc((len+1)*4);
 636 return( u2utf8_strcpy(utf8buf,ubuf));
 637 }
 638
 639 char *u2utf8_copyn(const unichar_t *ubuf,int len) {
 640     int i;
 641     char *utf8buf, *pt;
 642
 643     if ( ubuf==NULL )
 644 return( NULL );
 645
 646     utf8buf = pt = (char *) xmalloc((len+1)*4);
 647     for ( i=0; i<len && *ubuf!='\0'; ++i )
 648         pt = utf8_idpb(pt, *ubuf++);
 649     *pt = '\0';
 650 return( utf8buf );
 651 }
 652
 653 int32 utf8_ildb(const char **_text) {
 654     int32 val= -1;
 655     int ch;
 656     const uint8 *text = (const uint8 *) *_text;
 657     /* Increment and load character */
 658
 659     if ( (ch = *text++)<0x80 ) {
 660         val = ch;
 661     } else if ( ch<=0xbf ) {
 662         /* error */
 663     } else if ( ch<=0xdf ) {
 664         if ( *text>=0x80 && *text<0xc0 )
 665             val = ((ch&0x1f)<<6) | (*text++&0x3f);
 666     } else if ( ch<=0xef ) {
 667         if ( *text>=0x80 && *text<0xc0 && text[1]>=0x80 && text[1]<0xc0 ) {
 668             val = ((ch&0xf)<<12) | ((text[0]&0x3f)<<6) | (text[1]&0x3f);
 669             text += 2;
 670         }
 671     } else {
 672         int w = ( ((ch&0x7)<<2) | ((text[0]&0x30)>>4) )-1, w2;
 673         w = (w<<6) | ((text[0]&0xf)<<2) | ((text[1]&0x30)>>4);
 674         w2 = ((text[1]&0xf)<<6) | (text[2]&0x3f);
 675         val = w*0x400 + w2 + 0x10000;
 676         if ( *text<0x80 || text[1]<0x80 || text[2]<0x80 ||
 677                 *text>=0xc0 || text[1]>=0xc0 || text[2]>=0xc0 )
 678             val = -1;
 679         else
 680             text += 3;
 681     }
 682     *_text = (const char *) text;
 683 return( val );
 684 }
 685
 686 char *utf8_idpb(char *utf8_text,uint32 ch) {
 687     /* Increment and deposit character */
 688     if ( ch>=17*65536 )
 689 return( utf8_text );
 690
 691     if ( ch<=127 )
 692         *utf8_text++ = ch;
 693     else if ( ch<=0x7ff ) {
 694         *utf8_text++ = 0xc0 | (ch>>6);
 695         *utf8_text++ = 0x80 | (ch&0x3f);
 696     } else if ( ch<=0xffff ) {
 697         *utf8_text++ = 0xe0 | (ch>>12);
 698         *utf8_text++ = 0x80 | ((ch>>6)&0x3f);
 699         *utf8_text++ = 0x80 | (ch&0x3f);
 700     } else {
 701         uint32 val = ch-0x10000;
 702         int u = ((val&0xf0000)>>16)+1, z=(val&0x0f000)>>12, y=(val&0x00fc0)>>6, x=val&0x0003f;
 703         *utf8_text++ = 0xf0 | (u>>2);
 704         *utf8_text++ = 0x80 | ((u&3)<<4) | z;
 705         *utf8_text++ = 0x80 | y;
 706         *utf8_text++ = 0x80 | x;
 707     }
 708 return( utf8_text );
 709 }
 710
 711
 712 char *utf8_ib(char *utf8_text) {
 713     int ch;
 714
 715     /* Increment character */
 716     if ( (ch = *utf8_text)=='\0' )
 717 return( utf8_text );
 718     else if ( ch<=127 )
 719 return( utf8_text+1 );
 720     else if ( ch<0xe0 )
 721 return( utf8_text+2 );
 722     else if ( ch<0xf0 )
 723 return( utf8_text+3 );
 724     else
 725 return( utf8_text+4 );
 726 }
 727
 728 int utf8_valid(const char *str) {
 729     /* Is this a valid utf8 string? */
 730     int ch;
 731
 732     while ( (ch=utf8_ildb(&str))!='\0' )
 733         if ( ch==-1 )
 734 return( false );
 735
 736 return( true );
 737 }
 738
 739 void utf8_truncatevalid(char *str) {
 740     /* There are certain cases where we have a fixed amount of space to display */
 741     /*  something, and if it doesn't fit in that, then we truncate it. But... */
 742     /*  that can leave us with a half completed utf8 byte sequence. So truncate*/
 743     /*  again, right before the start of the bad sequence */
 744     int ch;
 745     char *old;
 746
 747     old = str;
 748     while ( (ch=utf8_ildb((const char **) &str))!='\0' ) {
 749         if ( ch==-1 ) {
 750             *old = '\0';
 751 return;
 752         }
 753         old = str;
 754     }
 755 }
 756
 757 char *utf8_db(char *utf8_text) {
 758     /* Decrement utf8 pointer */
 759     unsigned char *pt = (unsigned char *) utf8_text;
 760
 761     --pt;
 762     if ( *pt>=0xc0 )
 763         /* This should never happen. The pointer was looking at an intermediate */
 764         /*  character. However, if it does happen then we are now properly */
 765         /*  positioned at the start of a new char */;
 766     else if ( *pt>=0x80 ) {
 767         --pt;
 768         if ( *pt>=0xc0 )
 769             /* Done */;
 770         else if ( *pt>=0x80 ) {
 771             --pt;
 772             if ( *pt>=0xc0 )
 773                 /* Done */;
 774             else if ( *pt>=0x80 )
 775                 --pt;
 776         }
 777     }
 778 return( (char *) pt );
 779 }
 780
 781 int utf8_strlen(const char *utf8_str) {
 782     /* how many characters in the string NOT bytes */
 783     int len = 0;
 784
 785     while ( utf8_ildb(&utf8_str)>0 )
 786         ++len;
 787 return( len );
 788 }
 789
 790 int utf82u_strlen(const char *utf8_str) {
 791     /* how many shorts needed to represent it in UCS2 */
 792     int ch;
 793     int len = 0;
 794
 795     while ( (ch = utf8_ildb(&utf8_str))>0 )
 796         if ( ch>0x10000 )
 797             len += 2;
 798         else
 799             ++len;
 800 return( len );
 801 }
 802
 803 void utf8_strncpy(register char *to, const char *from, int len) {
 804     /* copy n characters NOT bytes */
 805     const char *old = from;
 806     while ( len && *old ) {
 807         utf8_ildb(&old);
 808         len--;
 809     }
 810     strncpy(to, from, old-from);
 811     to[old-from] = 0;
 812 }
 813
 814 #include <chardata.h>
 815 char *StripToASCII(const char *utf8_str) {
 816     /* Remove any non-ascii characters: Special case, convert the copyright symbol to (c) */
 817     char *newcr, *pt, *end;
 818     int len, ch;
 819     const unichar_t *alt;
 820
 821     len = strlen(utf8_str);
 822     pt = newcr = (char *) xmalloc(len+1);
 823     end = pt+len;
 824     while ( (ch= utf8_ildb(&utf8_str))!='\0' ) {
 825         if ( pt>=end ) {
 826             int off = pt-newcr;
 827             newcr = (char *) xrealloc(newcr,(off+10)+1);
 828             pt = newcr+off;
 829             end = pt+10;
 830         }
 831         if ( (ch>=' ' && ch<'\177' ) || ch=='\n' || ch=='\t' )
 832             *pt++ = ch;
 833         else if ( ch=='\r' && *utf8_str!='\n' )
 834             *pt++ = '\n';
 835         else if ( ch==0xa9 /* Copyright sign */ ) {
 836             char *str = "(c)";
 837             if ( pt+strlen(str)>=end ) {
 838                 int off = pt-newcr;
 839                 newcr = (char *) xrealloc(newcr,(off+10+strlen(str))+1);
 840                 pt = newcr+off;
 841                 end = pt+10;
 842             }
 843             while ( *str )
 844                 *pt++ = *str++;
 845         } else if ( unicode_alternates[ch>>8]!=NULL &&
 846                 (alt = unicode_alternates[ch>>8][ch&0xff])!=NULL ) {
 847             while ( *alt!='\0' ) {
 848                 if ( pt>=end ) {
 849                     int off = pt-newcr;
 850                     newcr = (char *) xrealloc(newcr,(off+10)+1);
 851                     pt = newcr+off;
 852                     end = pt+10;
 853                 }
 854                 if ( *alt>=' ' && *alt<'\177' )
 855                     *pt++ = *alt;
 856                 else if ( *alt==0x300 )
 857                     *pt++ = '`';
 858                 else if ( *alt==0x301 )
 859                     *pt++ = '\'';
 860                 else if ( *alt==0x302 )
 861                     *pt++ = '^';
 862                 else if ( *alt==0x303 )
 863                     *pt++ = '~';
 864                 else if ( *alt==0x308 )
 865                     *pt++ = ':';
 866                 ++alt;
 867             }
 868         }
 869     }
 870     *pt = '\0';
 871 return( newcr );
 872 }
 873
 874 int AllAscii(const char *txt) {
 875     for ( ; *txt!='\0'; ++txt ) {
 876         if ( *txt=='\t' || *txt=='\n' || *txt=='\r' )
 877             /* All right */;
 878         else if ( *txt<' ' || *txt>='\177' )
 879 return( false );
 880     }
 881 return( true );
 882 }
 883
 884 int uAllAscii(const unichar_t *txt) {
 885     for ( ; *txt!='\0'; ++txt ) {
 886         if ( *txt=='\t' || *txt=='\n' || *txt=='\r' )
 887             /* All right */;
 888         else if ( *txt<' ' || *txt>='\177' )
 889 return( false );
 890     }
 891 return( true );
 892 }
 893
 894 char* chomp( char* line ) {
 895     if( !line )
 896         return line;
 897     if ( line[strlen(line)-1]=='\n' )
 898         line[strlen(line)-1] = '\0';
 899     if ( line[strlen(line)-1]=='\r' )
 900         line[strlen(line)-1] = '\0';
 901     return line;
 902 }
 903
 904 char *copytolower(const char *input)
 905 {
 906     char* ret = xstrdup(input);
 907     char* p = ret;
 908     for( ; *p; ++p ) {
 909         *p = tolower(*p);
 910     }
 911     return ret;
 912 }
 913
 914
 915 int endswith(const char *haystack,const char *needle) {
 916     int haylen = strlen( haystack );
 917     int nedlen = strlen( needle );
 918     char* p;
 919     if( haylen < nedlen )
 920         return 0;
 921     p = strstr( haystack + haylen - nedlen, needle );
 922     return p == ( haystack + haylen - nedlen );
 923 }
 924
 925 int endswithi(const char *haystackZ,const char *needleZ) {
 926     char* haystack = copytolower(haystackZ);
 927     char* needle   = copytolower(needleZ);
 928     int ret = endswith( haystack, needle );
 929     free( haystack );
 930     free( needle );
 931     return ret;
 932 }
 933
 934 int endswithi_partialExtension( const char *haystackZ,const char *needleZ) {
 935     int nedlen = strlen(needleZ);
 936     char* haystack;
 937     char* needle;
 938     int ret;
 939     int i;
 940     if( nedlen == 0 ) {
 941         return 0;
 942     }
 943     haystack = copytolower(haystackZ);
 944     needle   = copytolower(needleZ);
 945     ret = 0;
 946     i = nedlen-1;
 947     ret |= endswith( haystack, needle );
 948     for( ; i>=0 && !ret ; --i ) {
 949         needle[i] = '\0';
 950         ret |= endswith( haystack, needle );
 951     }
 952     free( haystack );
 953     free( needle );
 954     return ret;
 955 }
 956
 957 int u_endswith(const unichar_t *haystack,const unichar_t *needle) {
 958     int haylen = u_strlen( haystack );
 959     int nedlen = u_strlen( needle );
 960     unichar_t* p;
 961     if( haylen < nedlen )
 962         return 0;
 963     p = u_strstr( haystack + haylen - nedlen, needle );
 964     return p == ( haystack + haylen - nedlen );
 965 }