src/src/Data/String/analyze.cpp

   1
   2 /******************************************************************************
   3 * MODULE     : analyze.cpp
   4 * DESCRIPTION: Properties of characters and strings
   5 * COPYRIGHT  : (C) 1999  Joris van der Hoeven
   6 *******************************************************************************
   7 * This software falls under the GNU general public license version 3 or later.
   8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
   9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
  10 ******************************************************************************/
  11
  12 #include "analyze.hpp"
  13 #include "merge_sort.hpp"
  14 #include "converter.hpp"
  15 #include "Scheme/object.hpp"
  16
  17 /******************************************************************************
   18 * Tests for characters
  19 ******************************************************************************/
  20
  21 bool
  22 is_alpha (register char c) {
  23   return ((c>='a') && (c<='z')) || ((c>='A') && (c<='Z'));
  24 }
  25
  26 bool
  27 is_iso_alpha (register char c) {
  28   int i= ((int) ((unsigned char) c));
  29   return
  30     ((c>='a') && (c<='z')) ||
  31     ((c>='A') && (c<='Z')) ||
  32     ((i >= 128) && (i != 159) && (i != 189) && (i != 190) && (i != 191));
  33 }
  34
  35 bool
  36 is_locase (register char c) {
  37   int code= (int) ((unsigned char) c);
  38   return
  39     ((c>='a') && (c<='z')) ||
  40     ((code >= 160) && (code < 189)) ||
  41     (code >= 224);
  42 }
  43
  44 bool
  45 is_upcase (register char c) {
  46   int code= (int) ((unsigned char) c);
  47   return
  48     ((c>='A') && (c<='Z')) ||
  49     ((code >= 128) && (code < 159)) ||
  50     ((code >= 192) && (code < 224));
  51 }
  52
  53 bool
  54 is_digit (register char c) {
  55   return (c>='0') && (c<='9');
  56 }
  57
  58 bool
  59 is_numeric (register char c) {
  60   return ((c>='0') && (c<='9')) || (c=='.');
  61 }
  62
  63 bool
  64 is_punctuation (register char c) {
  65   return
  66     (c=='.') || (c==',') || (c==':') || (c=='\'') || (c=='`') ||
  67     (c==';') || (c=='!') || (c=='?');
  68 }
  69
  70 bool
  71 is_space (register char c) {
  72   return (c == ' ') || (c == '\11') || (c == '\12') || (c == '\15');\
  73 }
  74
  75 /******************************************************************************
  76 * Tests for strings
  77 ******************************************************************************/
  78
  79 bool
  80 is_alpha (string s) {
  81   int i;
  82   if (N(s)==0) return false;
  83   for (i=0; i<N(s); i++)
  84     if (!is_alpha (s[i])) return false;
  85   return true;
  86 }
  87
  88 bool
  89 is_locase_alpha (string s) {
  90   int i;
  91   if (N(s)==0) return false;
  92   for (i=0; i<N(s); i++)
  93     if (s[i]<'a' || s[i]>'z') return false;
  94   return true;
  95 }
  96
  97 bool
  98 is_iso_alpha (string s) {
  99   int i;
 100   if (N(s)==0) return false;
 101   for (i=0; i<N(s); i++)
 102     if (!is_iso_alpha (s[i])) return false;
 103   return true;
 104 }
 105
 106 bool
 107 is_numeric (string s) {
 108   int i;
 109   if (N(s)==0) return false;
 110   for (i=0; i<N(s); i++)
 111     if (!is_numeric (s[i])) return false;
 112   return true;
 113 }
 114
 115 /******************************************************************************
 116 * Changing cases
 117 ******************************************************************************/
 118
 119 char
 120 upcase (char c) {
 121   if (is_locase (c))
 122     return (char) (((int) ((unsigned char) c)) - 32);
 123   else return c;
 124 }
 125
 126 char
 127 locase (char c) {
 128   if (is_upcase (c))
 129     return (char) (((int) ((unsigned char) c)) + 32);
 130   else return c;
 131 }
 132
 133 string
 134 upcase_first (string s) {
 135   if ((N(s)==0) || (!is_locase (s[0]))) return s;
 136   return string ((char) (((int) ((unsigned char) s[0]))-32)) * s (1, N(s));
 137 }
 138
 139 string
 140 locase_first (string s) {
 141   if ((N(s)==0) || (!is_upcase (s[0]))) return s;
 142   return string ((char) (((int) ((unsigned char) s[0]))+32)) * s (1, N(s));
 143 }
 144
 145 string
 146 upcase_all (string s) {
 147   int i;
 148   string r (N(s));
 149   for (i=0; i<N(s); i++)
 150     if (!is_locase (s[i])) r[i]= s[i];
 151     else r[i]= (char) (((int) ((unsigned char) s[i]))-32);
 152   return r;
 153 }
 154
 155 string
 156 locase_all (string s) {
 157   int i;
 158   string r (N(s));
 159   for (i=0; i<N(s); i++)
 160     if (!is_upcase (s[i])) r[i]= s[i];
 161     else r[i]= (char) (((int) ((unsigned char) s[i]))+32);
 162   return r;
 163 }
 164
 165 /******************************************************************************
 166 * Inserting or removing a character into a string as a set of characters
 167 ******************************************************************************/
 168
 169 string
 170 string_union (string s1, string s2) {
 171   return string_minus (s1, s2) * s2;
 172 }
 173
 174 string
 175 string_minus (string s1, string s2) {
 176   string r;
 177   int i1, n1= N(s1), i2, n2= N(s2);
 178   for (i1=0; i1<n1; i1++) {
 179     for (i2=0; i2<n2; i2++)
 180       if (s1[i1] == s2[i2]) break;
 181     if (i2==n2) r << s1[i1];
 182   }
 183   return r;
 184 }
 185
 186 /******************************************************************************
  187 * Spanish and German in relation with ispell
 188 ******************************************************************************/
 189
 190 string
 191 ispanish_to_spanish (string s) {
 192   int i, n= N(s);
 193   string r;
 194   for (i=0; i<n; i++)
 195     if ((s[i] == '\'') && ((i+1)<n)) {
 196       switch (s[i+1]) {
 197       case 'A': r << 'Á'; break;
 198       case 'E': r << 'É'; break;
 199       case 'I': r << 'Í'; break;
 200       case 'N': r << 'Ñ'; break;
 201       case 'O': r << 'Ó'; break;
 202       case 'U': r << 'Ú'; break;
 203       case 'Y': r << 'Ý'; break;
 204       case 'a': r << 'á'; break;
 205       case 'e': r << 'é'; break;
 206       case 'i': r << 'í'; break;
 207       case 'n': r << 'ñ'; break;
 208       case 'o': r << 'ó'; break;
 209       case 'u': r << 'ú'; break;
 210       case 'y': r << 'ý'; break;
 211       default : r << '\'' << s[i+1];
 212       }
 213       i++;
 214     }
 215     else r << s[i];
 216   return r;
 217 }
 218
 219 string
 220 spanish_to_ispanish (string s) {
 221   int i, n= N(s);
 222   string r;
 223   for (i=0; i<n; i++)
 224     switch (s[i]) {
 225     case 'Á': r << "'A"; break;
 226     case 'É': r << "'E"; break;
 227     case 'Í': r << "'I"; break;
 228     case 'Ñ': r << "'N"; break;
 229     case 'Ó': r << "'O"; break;
 230     case 'Ú': r << "'U"; break;
 231     case 'Ý': r << "'Y"; break;
 232     case 'á': r << "'a"; break;
 233     case 'é': r << "'e"; break;
 234     case 'í': r << "'i"; break;
 235     case 'ñ': r << "'n"; break;
 236     case 'ó': r << "'o"; break;
 237     case 'ú': r << "'u"; break;
 238     case 'ý': r << "'y"; break;
 239     default : r << s[i];
 240     }
 241   return r;
 242 }
 243
 244 string
 245 igerman_to_german (string s) {
 246   int i, n= N(s);
 247   string r;
 248   for (i=0; i<n; i++)
 249     if (s[i] == 'ß') r << 'ÿ';
 250     else r << s[i];
 251   return r;
 252 }
 253
 254 string
 255 german_to_igerman (string s) {
 256   int i, n= N(s);
 257   string r;
 258   for (i=0; i<n; i++)
 259     if (s[i] == 'ÿ') r << 'ß';
 260     else r << s[i];
 261   return r;
 262 }
 263
 264 /******************************************************************************
 265 * Iso latin 2 encoding for polish and czech
 266 ******************************************************************************/
 267
  // Byte translation tables used by il2_to_cork (char) and cork_to_il2 (char)
  // below: entry [b-128] is the replacement for input byte b >= 128.
  // NOTE(review): the entries are raw 8-bit bytes, so these literals are only
  // meaningful in the file's original single-byte encoding; presumably each
  // table is meant to hold 128 entries -- verify against the original bytes
  // before re-saving this file in another encoding.
  268 static string il2_to_cork_string=
  269   "€�‚ƒ„…†‡ˆ‰Š‹Œ�Ž��‘’“”•–—˜™š›œ�žŸ �\bŠ ‰‘Ÿ\x04’“”™\x7fš› ¡\fª\x01©±\a\v²³´¹\x05º»�ÁÂ€Äˆ‚ÇƒÉ†Ë…ÍÎ„Ð‹ŒÓÔŽÖ.�—Ú–ÜÝ•ÿ¯áâ ä¨¢ç£é¦ë¥íî¤ž«¬óô®ö/°·ú¶üýµ ";
  270 static string cork_to_il2_string=
  271   "Ã¡ÆÈÏÌÊGÅ¥£ÑÒ ÕÀØ¦©ª«ÞÛÙY¬®¯IIð§ã±æèïìêgåµ³ñò õàø¶¹º»þûùy¼¾¿i!?LAÁÂAÄAAÇEÉEËIÍÎIÐNOÓÔOÖOOUÚUÜÝ Saáâaäaaçeéeëiíîiðnoóôoöoouúuüý ß";
 272
 273 static char
 274 il2_to_cork (char c) {
 275   int i= (int) ((unsigned char) c);
 276   if (i<128) return c;
 277   return il2_to_cork_string [i-128];
 278 }
 279
 280 static char
 281 cork_to_il2 (char c) {
 282   int i= (int) ((unsigned char) c);
 283   if (i<128) return c;
 284   return cork_to_il2_string [i-128];
 285 }
 286
 287 string
 288 il2_to_cork (string s) {
 289   int i, n= N(s);
 290   string r (n);
 291   for (i=0; i<n; i++)
 292     r[i]= il2_to_cork (s[i]);
 293   return r;
 294 }
 295
 296 string
 297 cork_to_il2 (string s) {
 298   int i, n= N(s);
 299   string r (n);
 300   for (i=0; i<n; i++)
 301     r[i]= cork_to_il2 (s[i]);
 302   return r;
 303 }
 304
 305 /******************************************************************************
 306 * Koi8 encoding for russian
 307 ******************************************************************************/
 308
  // Byte translation tables used by koi8_to_iso (char, bool) and
  // iso_to_koi8 (char, bool) below: entry [b-192] is the replacement for
  // input byte b >= 192.
  // NOTE(review): the entries are raw 8-bit bytes, so these literals are only
  // meaningful in the file's original single-byte encoding; presumably each
  // table holds 64 entries -- verify before re-saving in another encoding.
  309 static string koi8_to_iso_string=
  310   "áâ÷çäåöúéêëìíîïðòóôõæèãþûýÿùøüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝßÙØÜÀÑ";
  311 static string iso_to_koi8_string=
  312   "þàáöäåôãõèéêëìíîïÿðñòóæâüûçøýù÷úÞÀÁÖÄÅÔÃÕÈÉÊËÌÍÎÏßÐÑÒÓÆÂÜÛÇØÝÙ×Ú";
 313
 314 static char
 315 koi8_to_iso (char c, bool ukrainian) {
 316   int i= (int) ((unsigned char) c);
 317   if (i==156) return '³';
 318   if (i==188) return '£';
 319   if (ukrainian)
 320   {
 321      switch(c)
 322      {
 323          case 'I':return '¶';
 324          case 'ˆ':return '·';
 325          case '™':return '´';
 326          case '€':return '½';
 327          case 'i':return '¦';
 328          case '¨':return '§';
 329          case '¹':return '¤';
 330          case ' ':return '';
 331      }
 332   }
 333   if (i<192) return c;
 334   return koi8_to_iso_string [i-192];
 335 }
 336
 337 static char
 338 iso_to_koi8 (char c, bool ukrainian) {
 339   int i= (int) ((unsigned char) c);
 340   if (c=='³') return (char) 156;
 341   if (c=='£') return (char) 188;
 342   if (ukrainian)
 343   {
 344      switch(c)
 345      {
 346          case '¶':return 'I';
 347          case '·':return 'ˆ';
 348          case '´':return '™';
 349          case '½':return '€';
 350          case '¦':return 'i';
 351          case '§':return '¨';
 352          case '¤':return '¹';
 353          case '':return ' ';
 354      }
 355   }
 356   if (i<192) return c;
 357   return iso_to_koi8_string [i-192];
 358 }
 359
 360 string
 361 koi8_to_iso (string s) {
 362   int i, n= N(s);
 363   string r (n);
 364   for (i=0; i<n; i++)
 365     r[i]= koi8_to_iso (s[i], false);
 366   return r;
 367 }
 368
 369 string
 370 iso_to_koi8 (string s) {
 371   int i, n= N(s);
 372   string r (n);
 373   for (i=0; i<n; i++)
 374     r[i]= iso_to_koi8 (s[i], false);
 375   return r;
 376 }
 377
 378 string
 379 koi8uk_to_iso (string s) {
 380   int i, n= N(s);
 381   string r (n);
 382   for (i=0; i<n; i++)
 383     r[i]= koi8_to_iso (s[i], true);
 384   return r;
 385 }
 386
 387 string
 388 iso_to_koi8uk (string s) {
 389   int i, n= N(s);
 390   string r (n);
 391   for (i=0; i<n; i++)
 392     r[i]= iso_to_koi8 (s[i], true);
 393   return r;
 394 }
 395
 396 /******************************************************************************
 397 * Convert between TeXmacs and XML strings
 398 ******************************************************************************/
 399
 400 static bool
 401 is_xml_name (char c) {
 402   return
 403     is_alpha (c) || is_numeric (c) ||
 404     (c == '.') || (c == '-') || (c == ':');
 405 }
 406
 407 string
 408 tm_to_xml_name (string s) {
 409   string r;
 410   int i, n= N(s);
 411   for (i=0; i<n; i++)
 412     if (is_xml_name (s[i])) r << s[i];
 413     else r << "_" << as_string ((int) ((unsigned char) s[i])) << "_";
 414   return r;
 415 }
 416
 417 string
 418 xml_name_to_tm (string s) {
 419   string r;
 420   int i, n= N(s);
 421   for (i=0; i<n; i++)
 422     if (s[i] != '_') r << s[i];
 423     else {
 424       int start= ++i;
 425       while ((i<n) && (s[i]!='_')) i++;
 426       r << (char) ((unsigned char) as_int (s (start, i)));
 427     }
 428   return r;
 429 }
 430
 431 string
 432 old_tm_to_xml_cdata (string s) {
 433   string r;
 434   int i, n= N(s);
 435   for (i=0; i<n; i++)
 436     if (s[i] == '&') r << "&amp;";
 437     else if (s[i] == '>') r << "&gt;";
 438     else if (s[i] != '<') r << s[i];
 439     else {
 440       int start= ++i;
 441       while ((i<n) && (s[i]!='>')) i++;
 442       r << "&" << tm_to_xml_name (s (start, i)) << ";";
 443     }
 444   return r;
 445 }
 446
 447 object
 448 tm_to_xml_cdata (string s) {
 449   array<object> a;
 450   a << symbol_object ("!concat");
 451   string r;
 452   int i, n= N(s);
 453   for (i=0; i<n; i++)
 454     if (s[i] == '&') r << "&amp;";
 455     else if (s[i] == '>') r << "&gt;";
 456     else if (s[i] == '\\') r << "\\";
 457     else if (s[i] != '<') r << cork_to_utf8 (s (i, i+1));
 458     else {
 459       int start= i++;
 460       while ((i<n) && (s[i]!='>')) i++;
 461       string ss= s (start, i+1);
 462       string rr= cork_to_utf8 (ss);
 463       string qq= utf8_to_cork (rr);
 464       if (rr != ss && qq == ss) r << rr;
 465       else {
 466         if (r != "") a << object (r);
 467         a << cons (symbol_object ("tm-sym"),
 468                    cons (ss (1, N(ss)-1),
 469                          null_object ()));
 470         r= "";
 471       }
 472     }
 473   if (r != "") a << object (r);
 474   if (N(a) == 1) return object ("");
 475   else if (N(a) == 2) return a[1];
 476   else return call ("list", a);
 477 }
 478
 479 string
 480 old_xml_cdata_to_tm (string s) {
 481   string r;
 482   int i, n= N(s);
 483   for (i=0; i<n; i++)
 484     if (s[i] == '<') r << "<less>";
 485     else if (s[i] == '>') r << "<gtr>";
 486     else if (s[i] != '&') r << s[i];
 487     else {
 488       int start= ++i;
 489       while ((i<n) && (s[i]!=';')) i++;
 490       string x= "<" * xml_name_to_tm (s (start, i)) * ">";
 491       if (x == "<amp>") r << "&";
 492       else r << x;
 493     }
 494   return r;
 495 }
 496
 497 string
 498 xml_unspace (string s, bool first, bool last) {
 499   string r;
 500   int i= 0, n= N(s);
 501   if (first) while ((i<n) && is_space (s[i])) i++;
 502   while (i<n)
 503     if (!is_space (s[i])) r << s[i++];
 504     else {
 505       while ((i<n) && is_space (s[i])) i++;
 506       if ((i<n) || (!last)) r << ' ';
 507     }
 508   return r;
 509 }
 510
 511 bool
 512 contains_unicode_char (string s) {
 513   int i= 0, n= N(s);
 514   while (i+1<n) {
 515     if (s[i] == '<' && s[i+1] == '#') return true;
 516     tm_char_forwards (s, i);
 517   }
 518   return false;
 519 }
 520
 521 /******************************************************************************
 522 * Roman and alpha numbers
 523 ******************************************************************************/
 524
 525 static string ones[10]= {
 526   "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" };
 527 static string tens[10]= {
 528   "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" };
 529 static string hundreds[10]= {
 530   "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" };
 531
 532 string
 533 roman_nr (int nr) {
 534   if (nr<0) return "-" * roman_nr (nr);
 535   if (nr==0) return "o";
 536   if (nr>1000) return "m" * roman_nr (nr-1000);
 537   if (nr==1000) return "m";
 538   if (nr==999) return "im";
 539   if (nr==499) return "id";
 540   if ((nr%100)==99) return hundreds[nr/100] * "ic";
 541   if ((nr%100)==49) return hundreds[nr/100] * "il";
 542   return hundreds[nr/100] * tens[(nr%100)/10] * ones[nr%10];
 543 }
 544
 545 string
 546 Roman_nr (int nr) {
 547   return upcase_all (roman_nr (nr));
 548 }
 549
 550 string
 551 alpha_nr (int nr) {
 552   if (nr<0) return "-" * alpha_nr (nr);
 553   if (nr==0) return "0";
 554   if (nr<=26) return string ((char) (((int) 'a')+ nr-1));
 555   return alpha_nr ((nr-1)/26) * alpha_nr (((nr-1)%26)+1);
 556 }
 557
 558 string
 559 Alpha_nr (int nr) {
 560   return upcase_all (alpha_nr (nr));
 561 }
 562
 563 string
 564 fnsymbol_nr (int nr) {
 565   string sym, r;
 566   int i, m= (nr-1)%3, n= ((nr-1)/3)+1;
 567   switch (m) {
 568   case 0: sym= "<ast>"; break;
 569   case 1: sym= "<dag>"; break;
 570   case 2: sym= "<ddag>"; break;
 571   }
 572   for (i=0; i<n; i++) r << sym;
 573   return r;
 574 }
 575
 576 /******************************************************************************
 577 * Conversions to and from hexadecimal
 578 ******************************************************************************/
 579
 580 static const char* hex_string= "0123456789ABCDEF";
 581
 582 string
 583 as_hexadecimal (int i) {
 584   if (i<0) return "-" * as_hexadecimal (-i);
 585   if (i<16) return hex_string [i & 15];
 586   return as_hexadecimal (i >> 4) * hex_string [i & 15];
 587 }
 588
 589 string
 590 as_hexadecimal (pointer ptr) {
 591   intptr_t i= (intptr_t) ptr;
 592   if (i<0) return "-" * as_hexadecimal (-i);
 593   if (i<16) return hex_string [i & 15];
 594   return as_hexadecimal (i >> 4) * hex_string [i & 15];
 595 }
 596
 597 string
 598 as_hexadecimal (int i, int len) {
 599   if (len==1) return hex_string [i & 15];
 600   else return as_hexadecimal (i >> 4, len-1) * hex_string [i & 15];
 601 }
 602
 603 int
 604 from_hexadecimal (string s) {
 605   int i, n= N(s), res= 0;
 606   if ((n>0) && (s[0]=='-'))
 607     return -from_hexadecimal (s (1, n));
 608   for (i=0; i<n; i++) {
 609     res= res << 4;
 610     if ((s[i] >= '0') && (s[i] <= '9')) res += (int) (s[i] - '0');
 611     if ((s[i] >= 'A') && (s[i] <= 'F')) res += (int) (s[i] + 10 - 'A');
 612     if ((s[i] >= 'a') && (s[i] <= 'f')) res += (int) (s[i] + 10 - 'a');
 613   }
 614   return res;
 615 }
 616
 617 /******************************************************************************
 618 * Routines for the TeXmacs encoding
 619 ******************************************************************************/
 620
 621 string
 622 tm_encode (string s) {
 623   // verbatim to TeXmacs encoding
 624   register int i;
 625   string r;
 626   for (i=0; i<N(s); i++) {
 627     if (s[i]=='<') r << "<less>";
 628     else if (s[i]=='>') r << "<gtr>";
 629     else r << s[i];
 630   }
 631   return r;
 632 }
 633
 634 string
 635 tm_decode (string s) {
 636   // TeXmacs encoding to verbatim
 637   register int i;
 638   string r;
 639   for (i=0; i<N(s); i++) {
 640     if (s[i]=='<') {
 641       register int j;
 642       for (j=i+1; j<N(s); j++)
 643         if (s[j]=='>') break;
 644       if (j<N(s)) j++;
 645       if (s(i,j) == "<less>") r << "<";
 646       else if (s(i,j) == "<gtr>") r << ">";
 647       i=j-1;
 648       if (s[i]!='>') return r;
 649     }
 650     else if (s[i]!='>') r << s[i];
 651   }
 652   return r;
 653 }
 654
 655 string
 656 tm_var_encode (string s) {
 657   register int i, n= N(s);
 658   string r;
 659   for (i=0; i<n; i++) {
 660     if (s[i]=='<') {
 661       if (i+1 < n && s[i+1] == '#') {
 662         while (i<n && s[i] != '>') r << s[i++];
 663         if (i<n) r << s[i];
 664       }
 665       else r << "<less>";
 666     }
 667     else if (s[i]=='>') r << "<gtr>";
 668     else r << s[i];
 669   }
 670   return r;
 671 }
 672
 673 string
 674 tm_correct (string s) {
 675   register int i;
 676   string r;
 677   for (i=0; i<N(s); i++) {
 678     if (s[i]=='<') {
 679       register bool flag= true;
 680       register int j, k;
 681       for (j=i+1; j<N(s); j++)
 682         if (s[j]=='>') break;
 683       if (j==N(s)) return r;
 684       for (k=i+1; k<j; k++)
 685         if (s[k]=='<') flag= false;
 686       if (flag) r << s(i,j+1);
 687       i=j;
 688     }
 689     else if (s[i]!='>') r << s[i];
 690   }
 691   return r;
 692 }
 693
 694 void
 695 tm_char_forwards (string s, int& pos) {
 696   ASSERT (pos >= 0 && pos <= N(s), "out of range");
 697   int n= N(s);
 698   if (pos == n);
 699   else if (s[pos] != '<') pos++;
 700   else {
 701     while (pos<n && s[pos] != '>') pos++;
 702     if (pos<n) pos++;
 703   }
 704 }
 705
 706 void
 707 tm_char_backwards (string s, int& pos) {
 708   ASSERT (pos >= 0 && pos <= N(s), "out of range");
 709   if (pos == 0);
 710   else if (s[pos-1] != '>') pos--;
 711   else {
 712     while (pos>0 && s[pos-1] != '<') pos--;
 713     if (pos>0) pos--;
 714   }
 715 }
 716
 717 /******************************************************************************
 718 * Quoting
 719 ******************************************************************************/
 720
 721 string
 722 scm_quote (string s) {
 723   // R5RS compliant external string representation.
 724   int i, n= N(s);
 725   string r;
 726   r << '"';
 727   for (i=0; i<n; i++)
 728     switch (s[i]) {
 729     case '\"':
 730     case '\\':
 731       r << '\\' << s[i];
 732       break;
 733     default:
 734       r << s[i];
 735     }
 736   r << '"';
 737   return r;
 738 }
 739
 740 string
 741 scm_unquote (string s) {
 742   if ((N(s)>=2) && (s[0]=='\"') && (s[N(s)-1]=='\"')) {
 743     int i, n= N(s);
 744     string r;
 745     for (i=1; i<n-1; i++)
 746       if (s[i] == '\\' && (s[i+1] == '\"' || s[i+1] == '\\')) r << s[++i];
 747       else r << s[i];
 748     return r;
 749   }
 750   else return s;
 751 }
 752
 753 string
 754 raw_quote (string s) {
 755   // Mark the label of a STRING tree as representing a string and not a symbol.
 756   return "\"" * s * "\"";
 757 }
 758
 759 string
 760 raw_unquote (string s) {
 761   // Get the string value of a STRING tree label representing a string.
 762   if ((N(s)>=2) && (s[0]=='\"') && (s[N(s)-1]=='\"'))
 763     return s (1, N(s)-1);
 764   else return s;
 765 }
 766
 767 /******************************************************************************
 768 * Handling escape characters
 769 ******************************************************************************/
 770
 771 string
 772 escape_sh (string s) {
 773 #if defined (__MINGW__) || defined (__MINGW32__) || defined (OS_WIN32)
 774   return raw_quote (s);
 775 #else
 776   int i, n= N(s);
 777   string r;
 778   for (i=0; i<n; i++)
 779     switch (s[i]) {
 780     case '?':
 781     case '&':
 782     case '$':
 783     case '`':
 784     case '\"':
 785     case '\\':
 786     case ' ':
 787       r << '\\' << s[i];
 788       break;
 789     default:
 790       r << s[i];
 791     }
 792   return r;
 793 #endif
 794 }
 795
 796 string
 797 escape_generic (string s) {
 798   int i, n= N(s);
 799   string r;
 800   for (i=0; i<n; i++) {
 801     if ((s[i] == '\2') || (s[i] == '\5') || (s[i] == '\33')) r << '\33';
 802     r << s[i];
 803   }
 804   return r;
 805 }
 806
 807 string
 808 escape_verbatim (string s) {
 809   int i, n= N(s);
 810   string r;
 811   for (i=0; i<n; i++) {
 812     unsigned char c= (unsigned char) s[i];
 813     if ((c == '\n') || (c == '\t')) r << ' ';
 814     else if (((int) c) >= 32) r << s[i];
 815   }
 816   return r;
 817 }
 818
 819 string
 820 escape_spaces (string s) {
 821   int i, n= N(s);
 822   string r;
 823   for (i=0; i<n; i++) {
 824     unsigned char c= (unsigned char) s[i];
 825     if (c == ' ') r << '\\';
 826     r << c;
 827   }
 828   return r;
 829 }
 830
 831 string
 832 dos_to_better (string s) {
 833   int i, n= N(s);
 834   string r;
 835   for (i=0; i<n; i++)
 836     if (s[i] == '\015');
 837     else r << s[i];
 838   return r;
 839 }
 840
 841 /******************************************************************************
 842 * Reading input from a string
 843 ******************************************************************************/
 844
 845 bool
 846 test (string s, int i, const char* test) {
 847   int n= N(s), j=0;
 848   while (test[j]!='\0') {
 849     if (i>=n) return false;
 850     if (s[i]!=test[j]) return false;
 851     i++; j++;
 852   }
 853   return true;
 854 }
 855
 856 bool
 857 test (string s, int i, string test) {
 858   int n= N(s), m= N(test), j=0;
 859   while (j<m) {
 860     if (i>=n) return false;
 861     if (s[i]!=test[j]) return false;
 862     i++; j++;
 863   }
 864   return true;
 865 }
 866
 867 bool
 868 starts (string s, const char* what) {
 869   return test (s, 0, what);
 870 }
 871
 872 bool
 873 starts (string s, const string what) {
 874   return test (s, 0, what);
 875 }
 876
 877 bool
 878 ends (string s, const char* what) {
 879   string r (what);
 880   if (N(r) > N(s)) return false;
 881   return s (N(s)-N(r), N(s)) == r;
 882 }
 883
 884 bool
 885 ends (string s, const string r) {
 886   if (N(r) > N(s)) return false;
 887   return s (N(s)-N(r), N(s)) == r;
 888 }
 889
 890 bool
 891 read (string s, int& i, const char* test) {
 892   int n= N(s), j=0, k=i;
 893   while (test[j]!='\0') {
 894     if (k>=n) return false;
 895     if (s[k]!=test[j]) return false;
 896     j++; k++;
 897   }
 898   i=k;
 899   return true;
 900 }
 901
 902 bool
 903 read (string s, int& i, string test) {
 904   int n= N(s), m= N(test), j=0, k=i;
 905   while (j<m) {
 906     if (k>=n) return false;
 907     if (s[k]!=test[j]) return false;
 908     j++; k++;
 909   }
 910   i=k;
 911   return true;
 912 }
 913
 914 bool
 915 read_line (string s, int& i, string& result) {
 916   int start= i;
 917   for (; i<N(s); i++) {
 918     if (s[i]=='\n') {
 919       result= s(start,i++);
 920       return true;
 921     }
 922   }
 923   result= s(start,i);
 924   return false;
 925 }
 926
 927 bool
 928 read_int (string s, int& i, int& result) {
 929   int n= N(s), start= i;
 930   result= 0;
 931   if (i==n) return false;
 932   if (s[i]=='-') {
 933     if (i+1==n) return false;
 934     if (!is_digit (s[i+1])) return false;
 935     i++;
 936   }
 937   else if (!is_digit (s[i])) return false;
 938   while ((i<n) && is_digit (s[i])) i++;
 939   result= as_int (s(start,i));
 940   return true;
 941 }
 942
 943 bool
 944 read_double (string s, int& i, double& result) {
 945   int n= N(s), start= i;
 946   result= 0.0;
 947   if (i==n) return false;
 948   if (s[i]=='-') {
 949     if (i+1==n) return false;
 950     if (!is_numeric (s[i+1])) return false;
 951     i++;
 952   }
 953   else if (!is_numeric (s[i])) return false;
 954   while ((i<n) && is_digit (s[i])) i++;
 955   if ((i<n) && (s[i]=='.')) i++;
 956   while ((i<n) && is_digit (s[i])) i++;
 957   if ((i<n) && ((s[i]=='e') || (s[i]=='E'))) {
 958     i++;
 959     if ((i<n) && (s[i]=='-')) i++;
 960     if ((i==n) || (!is_digit (s[i]))) { i=start; return false; }
 961     while ((i<n) && is_digit (s[i])) i++;
 962   }
 963   result= as_double (s(start,i));
 964   return true;
 965 }
 966
 967 void
 968 skip_spaces (string s, int& i) {
 969   int n=N(s);
 970   while ((i<n) && ((s[i]==' ') || (s[i]=='\t'))) i++;
 971 }
 972
 973 void
 974 skip_line (string s, int& i) {
 975   int n=N(s);
 976   while ((i<n) && (s[i]!='\n')) i++;
 977   if (i<n) i++;
 978 }
 979
 980 void
 981 skip_symbol (string s, int& i) {
 982   int n=N(s);
 983   if (i<n) {
 984     if (s[i]=='<') {
 985       for (i++; i<n; i++)
 986         if (s[i-1]=='>') break;
 987     }
 988     else i++;
 989   }
 990 }
 991
 992 /******************************************************************************
 993 * Parsing binary data
 994 ******************************************************************************/
 995
 996 void
 997 parse (string s, int& pos, QI& ret) {
 998   ret= (QI) s[pos++];
 999 }
1000
1001 void
1002 parse (string s, int& pos, QN& ret) {
1003   ret= (QN) s[pos++];
1004 }
1005
1006 void
1007 parse (string s, int& pos, HI& ret) {
1008   QI c1= (QI) s[pos++];
1009   QN c2= (QN) s[pos++];
1010   ret= (((HI) c1)<<8)+ c2;
1011 }
1012
1013 void
1014 parse (string s, int& pos, HN& ret) {
1015   QN c1= (QN) s[pos++];
1016   QN c2= (QN) s[pos++];
1017   ret= (((HN) c1)<<8)+ c2;
1018 }
1019
1020 void
1021 parse (string s, int& pos, SI& ret) {
1022   QI c1= (QI) s[pos++];
1023   QN c2= (QN) s[pos++];
1024   QN c3= (QN) s[pos++];
1025   QN c4= (QN) s[pos++];
1026   ret= (((((((SI) c1)<<8)+ ((SI) c2))<<8)+ ((SI) c3))<<8)+ c4;
1027 }
1028
1029 void
1030 parse (string s, int& pos, SI*& a, int len) {
1031   int i;
1032   a= tm_new_array<int> (len);
1033   for (i=0; i<len; i++) parse (s, pos, a[i]);
1034 }
1035
1036 /******************************************************************************
1037 * Searching, replacing and pattern matching
1038 ******************************************************************************/
1039
1040 int
1041 search_forwards (string s, int pos, string in) {
1042   int k= N(s), n= N(in);
1043   if (k == 0) return pos;
1044   char c= s[0];
1045   while (pos+k <= n) {
1046     if (in[pos] == c && test (in, pos, s)) return pos;
1047     pos++;
1048   }
1049   return -1;
1050 }
1051
1052 int
1053 search_forwards (string s, string in) {
1054   return search_forwards (s, 0, in);
1055 }
1056
1057 int
1058 search_backwards (string s, int pos, string in) {
1059   while (pos >= 0) {
1060     if (test (in, pos, s)) return pos;
1061     pos--;
1062   }
1063   return -1;
1064 }
1065
1066 int
1067 search_backwards (string s, string in) {
1068   return search_backwards (s, N(in)-N(s), in);
1069 }
1070
1071 int
1072 count_occurrences (string s, string in) {
1073   int count= 0;
1074   int i=0, next, n= N(s);
1075   while (i<n) {
1076     next= search_forwards (s, i, in);
1077     if (next == -1) break;
1078     count++;
1079     i= next+1;
1080   }
1081   return count;
1082 }
1083
1084 string
1085 replace (string s, string what, string by) {
1086   int i, n= N(s);
1087   string r;
1088   for (i=0; i<n; )
1089     if (test (s, i, what)) {
1090       r << by;
1091       i += N(what);
1092     }
1093     else {
1094       r << s[i];
1095       i++;
1096     }
1097   return r;
1098 }
1099
1100 static bool
1101 match_wildcard (string s, int spos, string w, int wpos) {
1102   if (wpos == N(w)) return spos == N(s);
1103   if (w[wpos] != '*')
1104     return (spos < N(s)) && (s[spos] == w[wpos]) &&
1105       match_wildcard (s, spos+1, w, wpos+1);
1106   while ((wpos<N(w)) && (w[wpos]=='*')) wpos++;
1107   while (spos <= N(s)) {
1108     if (match_wildcard (s, spos, w, wpos)) return true;
1109     spos++;
1110   }
1111   return false;
1112 }
1113
1114 bool
1115 match_wildcard (string s, string w) {
1116   return match_wildcard (s, 0, w, 0);
1117 }
1118
1119 /******************************************************************************
1120 * Computations with completions
1121 ******************************************************************************/
1122
1123 array<string>
1124 as_completions (hashset<string> h) {
1125   tree t= (tree) h;
1126   int i, n= N(t);
1127   array<string> a (n);
1128   for (i=0; i<n; i++) a[i]= t[i]->label;
1129   merge_sort (a);
1130   return a;
1131 }
1132
1133 /*
1134 static void
1135 close_completions (hashset<string>& h) {
1136   array<string> a= as_completions (h);
1137   int i, j, n= N(a);
1138   for (i=1; i<n; i++) {
1139     for (j=0; j < min (N(a[i-1]), N(a[i])); j++)
1140       if (a[i-1][j] != a[i][j]) break;
1141     if (j < min (N(a[i-1]), N(a[i])))
1142       h->insert (a[i](0,j));
1143   }
1144 }
1145
1146 array<string>
1147 close_completions (array<string> a) {
1148   int i, n= N(a);
1149   hashset<string> h;
1150   for (i=0; i<n; i++) h->insert (a[i]);
1151   close_completions (h);
1152   return as_completions (h);
1153 }
1154 */
1155
1156 array<string>
1157 close_completions (array<string> a) {
1158   if (N(a) == 0) return a;
1159   merge_sort (a);
1160   int i, j, n= N(a), l= N(a[0]);
1161   for (i=1; i<n; i++) {
1162     for (j=0; j<l && j<N(a[i]); j++)
1163       if (a[i-1][j] != a[i][j]) break;
1164     l= j;
1165   }
1166   array<string> r;
1167   r << a[0] (0, l);
1168   for (i=0; i<n; i++)
1169     if (a[i] != r[N(r)-1])
1170       r << a[i];
1171   return r;
1172 }
1173
1174 array<string>
1175 strip_completions (array<string> a, string prefix) {
1176   int i, n= N(a);
1177   array<string> b;
1178   for (i=0; i<n; i++)
1179     if (starts (a[i], prefix))
1180       b << a[i] (N(prefix), N(a[i]));
1181   return b;
1182 }