usr/src/cmd/awk_xpg4/awk1.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
  28  */
  29
  30 /*
  31  * awk -- mainline, yylex, etc.
  32  *
  33  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
  34  */
  35
  36 #include "awk.h"
  37 #include "y.tab.h"
  38 #include <stdarg.h>
  39 #include <unistd.h>
  40 #include <locale.h>
  41 #include <search.h>
  42
  43 static char     *progfiles[NPFILE];     /* Programme files named by -f, read by the lexer */
  44 static char     **progfilep = &progfiles[0]; /* Next free slot in progfiles[] (one past the last file) */
  45 static wchar_t  *progptr;               /* In-memory programme text (command line programme, or string being escaped) */
  46 static int      proglen;                /* Number of characters remaining at progptr */
  47 static wchar_t  context[NCONTEXT];      /* Circular buffer of context */
  48 static wchar_t  *conptr = &context[0];  /* Current position in context[] */
  49 static FILE     *progfp;                /* Stdio stream for programme */
  50 static char     *filename;              /* Name of the programme source; "[command line]" when no -f is given */
  51 #ifdef  DEBUG
  52 static int      dflag;                  /* Set by -d; makes yylex() print each token value */
  53 #endif
  54
     /* NOTE(review): neither macro is referenced in this part of the file; presumably a signature for compiled awk programmes -- confirm. */
  55 #define AWK_EXEC_MAGIC  "<MKS AWKC>"
  56 #define LEN_EXEC_MAGIC  10
  57
     /* Message passed to awkerr() by yylex() when brackets do not balance. */
  58 static char     unbal[] = "unbalanced E char";
  59
  60 static void     awkarginit(int c, char **av);
  61 static int      lexid(wint_t c);
  62 static int      lexnumber(wint_t c);
  63 static int      lexstring(wint_t endc);
  64 static int      lexregexp(wint_t endc);
  65
  66 static void     awkvarinit(void);
  67 static wint_t   lexgetc(void);
  68 static void     lexungetc(wint_t c);
  69 static size_t   lexescape(wint_t endc, int regx, int cmd_line_operand);
  70 static void     awkierr(int perr, char *fmt, va_list ap) __NORETURN;
  71 static int      usage(void);
  72 void            strescape(wchar_t *str);
  73 static const char       *toprint(wint_t);
  74 char *_cmdname;                         /* argv[0], recorded by main() */
  75 static wchar_t *mbconvert(char *str);
  76
  77 extern int      isclvar(wchar_t *arg);
  78
  79 /*
  80  * mainline for awk
  81  */
  82 int
  83 main(int argc, char *argv[])
  84 {
  85         wchar_t *ap;
  86         char *cmd;
  87
  88         cmd = argv[0];
  89         _cmdname = cmd;
  90
  91         linebuf = emalloc(NLINE * sizeof (wchar_t));
  92
  93         /*
  94          * At this point only messaging should be internationalized.
  95          * numbers are still scanned as in the Posix locale.
  96          */
  97         (void) setlocale(LC_ALL, "");
  98         (void) setlocale(LC_NUMERIC, "C");
  99 #if !defined(TEXT_DOMAIN)
 100 #define TEXT_DOMAIN     "SYS_TEST"
 101 #endif
 102         (void) textdomain(TEXT_DOMAIN);
 103
 104         awkvarinit();
 105         /* running = 1; */
 106         while (argc > 1 && *argv[1] == '-') {
 107                 void *save_ptr = NULL;
 108                 ap = mbstowcsdup(&argv[1][1]);
 109                 if (ap == NULL)
 110                         break;
 111                 if (*ap == '\0') {
 112                         free(ap);
 113                         break;
 114                 }
 115                 save_ptr = (void *) ap;
 116                 ++argv;
 117                 --argc;
 118                 if (*ap == '-' && ap[1] == '\0')
 119                         break;
 120                 for (; *ap != '\0'; ++ap) {
 121                         switch (*ap) {
 122 #ifdef DEBUG
 123                         case 'd':
 124                                 dflag = 1;
 125                                 continue;
 126
 127 #endif
 128                         case 'f':
 129                                 if (argc < 2) {
 130                                         (void) fprintf(stderr,
 131                                 gettext("Missing script file\n"));
 132                                         return (1);
 133                                 }
 134                                 *progfilep++ = argv[1];
 135                                 --argc;
 136                                 ++argv;
 137                                 continue;
 138
 139                         case 'F':
 140                                 if (ap[1] == '\0') {
 141                                         if (argc < 2) {
 142                                                 (void) fprintf(stderr,
 143                                 gettext("Missing field separator\n"));
 144                                                 return (1);
 145                                         }
 146                                         ap = mbstowcsdup(argv[1]);
 147                                         --argc;
 148                                         ++argv;
 149                                 } else
 150                                         ++ap;
 151                                 strescape(ap);
 152                                 strassign(varFS, linebuf, FALLOC,
 153                                     wcslen(linebuf));
 154                                 break;
 155
 156                         case 'v': {
 157                                 wchar_t *vp;
 158                                 wchar_t *arg;
 159
 160                                 if (argc < 2) {
 161                                         (void) fprintf(stderr,
 162                 gettext("Missing variable assignment\n"));
 163                                         return (1);
 164                                 }
 165                                 arg = mbconvert(argv[1]);
 166                                 /*
 167                                  * Ensure the variable expression
 168                                  * is valid (correct form).
 169                                  */
 170                                 if (((vp = wcschr(arg, '=')) != NULL) &&
 171                                     isclvar(arg)) {
 172                                         *vp = '\0';
 173                                         strescape(vp+1);
 174                                         strassign(vlook(arg), linebuf,
 175                                             FALLOC|FSENSE,
 176                                             wcslen(linebuf));
 177                                         *vp = '=';
 178                                 } else {
 179                                         (void) fprintf(stderr, gettext(
 180                                             "Invalid form for variable "
 181                                             "assignment: %S\n"), arg);
 182                                         return (1);
 183                                 }
 184                                 --argc;
 185                                 ++argv;
 186                                 continue;
 187                         }
 188
 189                         default:
 190                                 (void) fprintf(stderr,
 191                                 gettext("Unknown option \"-%S\"\n"), ap);
 192                                 return (usage());
 193                         }
 194                         break;
 195                 }
 196                 if (save_ptr)
 197                         free(save_ptr);
 198         }
 199         if (progfilep == &progfiles[0]) {
 200                 if (argc < 2)
 201                         return (usage());
 202                 filename = "[command line]";    /* BUG: NEEDS TRANSLATION */
 203                 progptr = mbstowcsdup(argv[1]);
 204                 proglen = wcslen(progptr);
 205                 --argc;
 206                 ++argv;
 207         }
 208
 209         argv[0] = cmd;
 210
 211         awkarginit(argc, argv);
 212
 213         /* running = 0; */
 214         (void) yyparse();
 215
 216         lineno = 0;
 217         /*
 218          * Ok, done parsing, so now activate the rest of the nls stuff, set
 219          * the radix character.
 220          */
 221         (void) setlocale(LC_ALL, "");
 222         radixpoint = *localeconv()->decimal_point;
 223         awk();
 224         /* NOTREACHED */
 225         return (0);
 226 }
 227
 228 /*
 229  * Do initial setup of buffers, etc.
 230  * This must be called before most processing
 231  * and especially before lexical analysis.
 232  * Variables initialised here will be overruled by command
 233  * line parameter initialisation.
 234  */
 235 static void
 236 awkvarinit()
 237 {
 238         NODE *np;
 239
 240         (void) setvbuf(stderr, NULL, _IONBF, 0);
 241
 242         if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
 243                 (void) fprintf(stderr,
 244         gettext("not enough available file descriptors"));
 245                 exit(1);
 246         }
 247         ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
 248 #ifdef A_ZERO_POINTERS
 249         (void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
 250 #else
 251         {
 252                 /* initialize file descriptor table */
 253                 OFILE *fp;
 254                 for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
 255                         fp->f_fp = FNULL;
 256                                         fp->f_mode = 0;
 257                                         fp->f_name = (char *)0;
 258                 }
 259         }
 260 #endif
 261         constant = intnode((INT)0);
 262
 263         const0 = intnode((INT)0);
 264         const1 = intnode((INT)1);
 265         constundef = emptynode(CONSTANT, 0);
 266         constundef->n_flags = FSTRING|FVINT;
 267         constundef->n_string = _null;
 268         constundef->n_strlen = 0;
 269         inc_oper = emptynode(ADD, 0);
 270         inc_oper->n_right = const1;
 271         asn_oper = emptynode(ADD, 0);
 272         field0 = node(FIELD, const0, NNULL);
 273
 274         {
 275                 RESFUNC near*rp;
 276
 277                 for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
 278                         np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
 279                 }
 280         }
 281         {
 282                 RESERVED near*rp;
 283
 284                 for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
 285                         switch (rp->r_type) {
 286                         case SVAR:
 287                         case VAR:
 288                                 running = 1;
 289                                 np = vlook(rp->r_name);
 290                                 if (rp->r_type == SVAR)
 291                                         np->n_flags |= FSPECIAL;
 292                                 if (rp->r_svalue != NULL)
 293                                         strassign(np, rp->r_svalue, FSTATIC,
 294                                             (size_t)rp->r_ivalue);
 295                                 else {
 296                                         constant->n_int = rp->r_ivalue;
 297                                         (void) assign(np, constant);
 298                                 }
 299                                 running = 0;
 300                                 break;
 301
 302                         case KEYWORD:
 303                                 kinstall(rp->r_name, (int)rp->r_ivalue);
 304                                 break;
 305                         }
 306                 }
 307         }
 308
 309         varNR = vlook(s_NR);
 310         varFNR = vlook(s_FNR);
 311         varNF = vlook(s_NF);
 312         varOFMT = vlook(s_OFMT);
 313         varCONVFMT = vlook(s_CONVFMT);
 314         varOFS = vlook(s_OFS);
 315         varORS = vlook(s_ORS);
 316         varRS = vlook(s_RS);
 317         varFS = vlook(s_FS);
 318         varARGC = vlook(s_ARGC);
 319         varSUBSEP = vlook(s_SUBSEP);
 320         varENVIRON = vlook(s_ENVIRON);
 321         varFILENAME = vlook(s_FILENAME);
 322         varSYMTAB = vlook(s_SYMTAB);
 323         incNR = node(ASG, varNR, node(ADD, varNR, const1));
 324         incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
 325         clrFNR = node(ASG, varFNR, const0);
 326 }
 327
 328 /*
 329  * Initialise awk ARGC, ARGV variables.
 330  */
 331 static void
 332 awkarginit(int ac, char **av)
 333 {
 334         int i;
 335         wchar_t *cp;
 336
 337         ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
 338         running = 1;
 339         constant->n_int = ac;
 340         (void) assign(varARGC, constant);
 341         for (i = 0; i < ac; ++i) {
 342                 cp = mbstowcsdup(av[i]);
 343                 constant->n_int = i;
 344                 strassign(exprreduce(ARGVsubi), cp,
 345                     FSTATIC|FSENSE, wcslen(cp));
 346         }
 347         running = 0;
 348 }
 349
 350 /*
 351  * Clean up when done parsing a function.
 352  * All formal parameters, because of a deal (funparm) in
 353  * yylex, get put into the symbol table in front of any
 354  * global variable of the same name.  When the entire
 355  * function is parsed, remove these formal dummy nodes
 356  * from the symbol table but retain the nodes because
 357  * the generated tree points at them.
 358  */
 359 void
 360 uexit(NODE *np)
 361 {
 362         NODE *formal;
 363
 364         while ((formal = getlist(&np)) != NNULL)
 365                 delsymtab(formal, 0);
 366 }
 367
  368 /*
       * The lexical analyzer.
       *
       * Returns the next token of the programme.  The token comes from one
       * of three places, tried in this order:
       *   1. a token saved by a previous call (savetoken);
       *   2. a regular expression, when redelim holds its delimiter
       *      (read by lexregexp(), returned straight away through "out");
       *   3. the input characters delivered by lexgetc().
       *
       * Tokens from sources 1 and 3 then pass through a second switch that
       * uses catterm to insert CONCAT tokens between adjacent terms, records
       * the token in lexlast, and finally maps single characters to their
       * symbolic token names through ctosym[].
       *
       * Unbalanced (), {} or [] and non-printable characters are reported
       * with awkerr().
       */
  371 int
  372 yylex()
  373 {
  374         wint_t c, c1;
  375         int i;
              /* Token held back to be returned by the next call; 0 if none. */
  376         static int savetoken = 0;
              /* Set to 2 when a CONSTANT follows '$'; counted down once per token. */
  377         static int wasfield;
              /* Set when DEFFUNC is returned; cleared by the next VAR. */
  378         static int isfuncdef;
              /* Current nesting depth of {}, () and []. */
  379         static int nbrace, nparen, nbracket;
              /* Single characters and the token each is returned as. */
  380         static struct ctosymstruct {
  381                 wint_t c, sym;
  382         } ctosym[] = {
  383                 { '|', BAR },           { '^', CARAT },
  384                 { '~', TILDE },         { '<', LANGLE },
  385                 { '>', RANGLE },        { '+', PLUSC },
  386                 { '-', HYPHEN },        { '*', STAR },
  387                 { '/', SLASH },         { '%', PERCENT },
  388                 { '!', EXCLAMATION },   { '$', DOLLAR },
  389                 { '[', LSQUARE },       { ']', RSQUARE },
  390                 { '(', LPAREN },        { ')', RPAREN },
  391                 { ';', SEMI },          { '{', LBRACE },
  392                 { '}', RBRACE },        {   0, 0 }
  393         };
  394
  395         if (savetoken) {
                      /* Deliver the token held back by the previous call. */
  396                 c = savetoken;
  397                 savetoken = 0;
  398         } else if (redelim != '\0') {
                      /*
                       * A regular expression is pending.  Read it now and
                       * hold its delimiter back for the next call.
                       */
  399                 c = redelim;
  400                 redelim = 0;
  401                 catterm = 0;
  402                 savetoken = c;
  403                 c = lexlast = lexregexp(c);
  404                 goto out;
  405         } else while ((c = lexgetc()) != WEOF) {
                      /*
                       * "continue" skips the current character and reads
                       * another; the "break" at the bottom of the loop leaves
                       * it with the token in c.
                       */
  406                 if (iswalpha(c) || c == '_') {
  407                         c = lexid(c);
  408                 } else if (iswdigit(c) || c == '.') {
  409                         c = lexnumber(c);
  410                 } else if (isWblank(c)) {
  411                         continue;
  412                 } else switch (c) {
  413 #if DOS || OS2
  414                 case 032:               /* ^Z */
  415                         continue;
  416 #endif
  417
  418                 case '"':
  419                         c = lexstring(c);
  420                         break;
  421
  422                 case '#':
                              /* Comment: skip to end of line, keep the newline. */
  423                         while ((c = lexgetc()) != '\n' && c != WEOF)
  424                                 ;
  425                         lexungetc(c);
  426                         continue;
  427
  428                 case '+':
  429                         if ((c1 = lexgetc()) == '+')
  430                                 c = INC;
  431                         else if (c1 == '=')
  432                                 c = AADD;
  433                         else
  434                                 lexungetc(c1);
  435                         break;
  436
  437                 case '-':
  438                         if ((c1 = lexgetc()) == '-')
  439                                 c = DEC;
  440                         else if (c1 == '=')
  441                                 c = ASUB;
  442                         else
  443                                 lexungetc(c1);
  444                         break;
  445
  446                 case '*':
                              /* "*=", "**=", "**" or plain "*". */
  447                         if ((c1 = lexgetc()) == '=')
  448                                 c = AMUL;
  449                         else if (c1 == '*') {
  450                                 if ((c1 = lexgetc()) == '=')
  451                                         c = AEXP;
  452                                 else {
  453                                         c = EXP;
  454                                         lexungetc(c1);
  455                                 }
  456                         } else
  457                                 lexungetc(c1);
  458                         break;
  459
  460                 case '^':
  461                         if ((c1 = lexgetc()) == '=') {
  462                                 c = AEXP;
  463                         } else {
  464                                 c = EXP;
  465                                 lexungetc(c1);
  466                         }
  467                         break;
  468
  469                 case '/':
                              /*
                               * "/=" is the divide-assign operator only when
                               * the previous token is not one of those listed
                               * here; otherwise the '=' is pushed back
                               * (presumably because a regular expression
                               * starting with '=' may follow those tokens).
                               */
  470                         if ((c1 = lexgetc()) == '=' &&
  471                             lexlast != RE && lexlast != NRE &&
  472                             lexlast != ';' && lexlast != '\n' &&
  473                             lexlast != ',' && lexlast != '(')
  474                                 c = ADIV;
  475                         else
  476                                 lexungetc(c1);
  477                         break;
  478
  479                 case '%':
  480                         if ((c1 = lexgetc()) == '=')
  481                                 c = AREM;
  482                         else
  483                                 lexungetc(c1);
  484                         break;
  485
  486                 case '&':
  487                         if ((c1 = lexgetc()) == '&')
  488                                 c = AND;
  489                         else
  490                                 lexungetc(c1);
  491                         break;
  492
  493                 case '|':
                              /* A single '|' is PIPE only while inprint is set. */
  494                         if ((c1 = lexgetc()) == '|')
  495                                 c = OR;
  496                         else {
  497                                 lexungetc(c1);
  498                                 if (inprint)
  499                                         c = PIPE;
  500                         }
  501                         break;
  502
  503                 case '>':
                              /*
                               * A single '>' is WRITE (output redirection)
                               * only while inprint is set and no parenthesis
                               * is open.
                               */
  504                         if ((c1 = lexgetc()) == '=')
  505                                 c = GE;
  506                         else if (c1 == '>')
  507                                 c = APPEND;
  508                         else {
  509                                 lexungetc(c1);
  510                                 if (nparen == 0 && inprint)
  511                                         c = WRITE;
  512                         }
  513                         break;
  514
  515                 case '<':
  516                         if ((c1 = lexgetc()) == '=')
  517                                 c = LE;
  518                         else
  519                                 lexungetc(c1);
  520                         break;
  521
  522                 case '!':
  523                         if ((c1 = lexgetc()) == '=')
  524                                 c = NE;
  525                         else if (c1 == '~')
  526                                 c = NRE;
  527                         else
  528                                 lexungetc(c1);
  529                         break;
  530
  531                 case '=':
  532                         if ((c1 = lexgetc()) == '=')
  533                                 c = EQ;
  534                         else {
  535                                 lexungetc(c1);
  536                                 c = ASG;
  537                         }
  538                         break;
  539
  540                 case '\n':
                              /*
                               * A newline is dropped after the tokens listed
                               * below and otherwise returned as ';'.
                               */
  541                         switch (lexlast) {
  542                         case ')':
  543                                 if (catterm || inprint) {
  544                                         c = ';';
  545                                         break;
  546                                 }
  547                         /* FALLTHROUGH */
  548                         case AND:
  549                         case OR:
  550                         case COMMA:
  551                         case '{':
  552                         case ELSE:
  553                         case ';':
  554                         case DO:
  555                                 continue;
  556
  557                         case '}':
                                      /* Dropped only while a brace is still open. */
  558                                 if (nbrace != 0)
  559                                         continue;
  560                                 /* FALLTHROUGH */
  561
  562                         default:
  563                                 c = ';';
  564                                 break;
  565                         }
  566                         break;
  567
                      /*
                       * NOTE(review): this switch is entered with a raw input
                       * character; identifiers, and so keywords, are handled
                       * by lexid() in the branch above and never get here.
                       * Confirm whether this case can be reached as intended.
                       */
  568                 case ELSE:
  569                         if (lexlast != ';') {
  570                                 savetoken = ELSE;
  571                                 c = ';';
  572                         }
  573                         break;
  574
  575                 case '(':
  576                         ++nparen;
  577                         break;
  578
  579                 case ')':
  580                         if (--nparen < 0)
  581                                 awkerr(unbal, "()");
  582                         break;
  583
  584                 case '{':
  585                         nbrace++;
  586                         break;
  587
  588                 case '}':
  589                         if (--nbrace < 0) {
  590                                 char brk[3];
  591
  592                                 brk[0] = '{';
  593                                 brk[1] = '}';
  594                                 brk[2] = '\0';
  595                                 awkerr(unbal, brk);
  596                         }
                              /*
                               * Make sure a ';' precedes the '}': return ';'
                               * now and the '}' on the next call.
                               */
  597                         if (lexlast != ';') {
  598                                 savetoken = c;
  599                                 c = ';';
  600                         }
  601                         break;
  602
  603                 case '[':
  604                         ++nbracket;
  605                         break;
  606
  607                 case ']':
  608                         if (--nbracket < 0) {
  609                                 char brk[3];
  610
  611                                 brk[0] = '[';
  612                                 brk[1] = ']';
  613                                 brk[2] = '\0';
  614                                 awkerr(unbal, brk);
  615                         }
  616                         break;
  617
  618                 case '\\':
                              /* Backslash-newline: line continuation. */
  619                         if ((c1 = lexgetc()) == '\n')
  620                                 continue;
  621                         lexungetc(c1);
  622                         break;
  623
  624                 case ',':
  625                         c = COMMA;
  626                         break;
  627
  628                 case '?':
  629                         c = QUEST;
  630                         break;
  631
  632                 case ':':
  633                         c = COLON;
  634                         break;
  635
  636                 default:
  637                         if (!iswprint(c))
  638                                 awkerr(
  639                                     gettext("invalid character \"%s\""),
  640                                     toprint(c));
  641                         break;
  642                 }
  643                 break;
  644         }
  645
              /*
               * Concatenation has no operator character, so it is detected
               * here.  catterm is non-zero when the previous token could end
               * a term.  If the current token can start one, CONCAT is
               * returned now and the current token is held back in savetoken
               * for the next call.
               */
  646         switch (c) {
  647         case ']':
  648                 ++catterm;
  649                 break;
  650
  651         case VAR:
  652                 if (catterm) {
  653                         savetoken = c;
  654                         c = CONCAT;
  655                         catterm = 0;
  656                 } else if (!isfuncdef) {
                              /*
                               * Peek at the next character: the name counts
                               * as a term unless a '(' follows directly.
                               */
  657                         if ((c1 = lexgetc()) != '(')
  658                                 ++catterm;
  659                         lexungetc(c1);
  660                 }
  661                 isfuncdef = 0;
  662                 break;
  663
  664         case PARM:
  665         case CONSTANT:
  666                 if (catterm) {
  667                         savetoken = c;
  668                         c = CONCAT;
  669                         catterm = 0;
  670                 } else {
  671                         if (lexlast == '$')
  672                                 wasfield = 2;
  673                         ++catterm;
  674                 }
  675                 break;
  676
  677         case INC:
  678         case DEC:
                      /*
                       * ++ or -- is treated as the start of a new term only
                       * when it follows a CONSTANT that was not a field
                       * number.
                       */
  679                 if (!catterm || lexlast != CONSTANT || wasfield)
  680                         break;
  681
  682         /* FALLTHROUGH */
  683         case UFUNC:
  684         case FUNC:
  685         case GETLINE:
  686         case '!':
  687         case '$':
  688         case '(':
  689                 if (catterm) {
  690                         savetoken = c;
  691                         c = CONCAT;
  692                         catterm = 0;
  693                 }
  694                 break;
  695
  696         case '}':
                      /* Closing the outermost brace: queue a ';' after it. */
  697                 if (nbrace == 0)
  698                         savetoken = ';';
  699         /* FALLTHROUGH */
  700         case ';':
  701                 inprint = 0;
  702         /* FALLTHROUGH */
  703         default:
  704                 if (c == DEFFUNC)
  705                         isfuncdef = 1;
  706                 catterm = 0;
  707         }
              /* lexlast keeps the token as it is before the ctosym[] mapping. */
  708         lexlast = c;
  709         if (wasfield)
  710                 wasfield--;
  711         /*
  712          * Map character constants to symbolic names.
  713          */
  714         for (i = 0; ctosym[i].c != 0; i++)
  715                 if (c == ctosym[i].c) {
  716                         c = ctosym[i].sym;
  717                         break;
  718                 }
  719 out:
  720 #ifdef DEBUG
              /* With -d, print the numeric value of every token returned. */
  721         if (dflag)
  722                 (void) printf("%d\n", (int)c);
  723 #endif
  724         return ((int)c);
  725 }
 726
 727 /*
 728  * Read a number for the lexical analyzer.
 729  * Input is the first character of the number.
 730  * Return value is the lexical type.
 731  */
 732 static int
 733 lexnumber(wint_t c)
 734 {
 735         wchar_t *cp;
 736         int dotfound = 0;
 737         int efound = 0;
 738         INT number;
 739
 740         cp = linebuf;
 741         do {
 742                 if (iswdigit(c))
 743                         ;
 744                 else if (c == '.') {
 745                         if (dotfound++)
 746                                 break;
 747                 } else if (c == 'e' || c == 'E') {
 748                         if ((c = lexgetc()) != '-' && c != '+') {
 749                                 lexungetc(c);
 750                                 c = 'e';
 751                         } else
 752                                 *cp++ = 'e';
 753                         if (efound++)
 754                                 break;
 755                 } else
 756                         break;
 757                 *cp++ = c;
 758         } while ((c = lexgetc()) != WEOF);
 759         *cp = '\0';
 760         if (dotfound && cp == linebuf+1)
 761                 return (DOT);
 762         lexungetc(c);
 763         errno = 0;
 764         if (!dotfound && !efound &&
 765             ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
 766                 yylval.node = intnode(number);
 767         else
 768                 yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
 769         return (CONSTANT);
 770 }
 771
 772 /*
 773  * Read an identifier.
 774  * Input is first character of identifier.
 775  * Return VAR.
 776  */
 777 static int
 778 lexid(wint_t c)
 779 {
 780         wchar_t *cp;
 781         size_t i;
 782         NODE *np;
 783
 784         cp = linebuf;
 785         do {
 786                 *cp++ = c;
 787                 c = lexgetc();
 788         } while (iswalpha(c) || iswdigit(c) || c == '_');
 789         *cp = '\0';
 790         lexungetc(c);
 791         yylval.node = np = vlook(linebuf);
 792
 793         switch (np->n_type) {
 794         case KEYWORD:
 795                 switch (np->n_keywtype) {
 796                 case PRINT:
 797                 case PRINTF:
 798                         ++inprint;
 799                         /* FALLTHROUGH */
 800                 default:
 801                         return ((int)np->n_keywtype);
 802                 }
 803                 /* NOTREACHED */
 804
 805         case ARRAY:
 806         case VAR:
 807                 /*
 808                  * If reading the argument list, create a dummy node
 809                  * for the duration of that function. These variables
 810                  * can be removed from the symbol table at function end
 811                  * but they must still exist because the execution tree
 812                  * knows about them.
 813                  */
 814                 if (funparm) {
 815 do_funparm:
 816                         np = emptynode(PARM, i = (cp-linebuf));
 817                         np->n_flags = FSTRING;
 818                         np->n_string = _null;
 819                         np->n_strlen = 0;
 820                         (void) memcpy(np->n_name, linebuf,
 821                             (i+1) * sizeof (wchar_t));
 822                         addsymtab(np);
 823                         yylval.node = np;
 824                 } else if (np == varNF || (np == varFS &&
 825                     (!doing_begin || begin_getline))) {
 826                         /*
 827                          * If the user program references NF or sets
 828                          * FS either outside of a begin block or
 829                          * in a begin block after a getline then the
 830                          * input line will be split immediately upon read
 831                          * rather than when a field is first referenced.
 832                          */
 833                         needsplit = 1;
 834                 } else if (np == varENVIRON)
 835                         needenviron = 1;
 836         /* FALLTHROUGH */
 837         case PARM:
 838                 return (VAR);
 839
 840         case UFUNC:
 841                 /*
 842                  * It is ok to redefine functions as parameters
 843                  */
 844                 if (funparm) goto do_funparm;
 845         /* FALLTHROUGH */
 846         case FUNC:
 847         case GETLINE:
 848                 /*
 849                  * When a getline is encountered, clear the 'doing_begin' flag.
 850                  * This will force the 'needsplit' flag to be set, even inside
 851                  * a begin block, if FS is altered. (See VAR case above)
 852                  */
 853                 if (doing_begin)
 854                         begin_getline = 1;
 855                 return (np->n_type);
 856         }
 857         /* NOTREACHED */
 858         return (0);
 859 }
 860
 861 /*
 862  * Read a string for the lexical analyzer.
 863  * `endc' terminates the string.
 864  */
 865 static int
 866 lexstring(wint_t endc)
 867 {
 868         size_t length = lexescape(endc, 0, 0);
 869
 870         yylval.node = stringnode(linebuf, FALLOC, length);
 871         return (CONSTANT);
 872 }
 873
 874 /*
 875  * Read a regular expression.
 876  */
 877 static int
 878 lexregexp(wint_t endc)
 879 {
 880         (void) lexescape(endc, 1, 0);
 881         yylval.node = renode(linebuf);
 882         return (URE);
 883 }
 884
 885 /*
 886  * Process a string, converting the escape characters as required by
 887  * 1003.2. The processed string ends up in the global linebuf[]. This
 888  * routine also changes the value of 'progfd' - the program file
 889  * descriptor, so it should be used with some care. It is presently used to
 890  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
 891  */
 892 void
 893 strescape(wchar_t *str)
 894 {
 895         progptr = str;
 896         proglen = wcslen(str) + 1;      /* Include \0 */
 897         (void) lexescape('\0', 0, 1);
 898         progptr = NULL;
 899 }
 900
 901 /*
 902  * Read a string or regular expression, terminated by ``endc'',
 903  * for lexical analyzer, processing escape sequences.
 904  * Return string length.
 905  */
 906 static size_t
 907 lexescape(wint_t endc, int regx, int cmd_line_operand)
 908 {
 909         static char nlre[256];
 910         static char nlstr[256];
 911         static char eofre[256];
 912         static char eofstr[256];
 913         int first_time = 1;
 914         wint_t c;
 915         wchar_t *cp;
 916         int n, max;
 917
 918         if (first_time == 1) {
 919                 (void) strcpy(nlre, gettext("Newline in regular expression\n"));
 920                 (void) strcpy(nlstr, gettext("Newline in string\n"));
 921                 (void) strcpy(eofre, gettext("EOF in regular expression\n"));
 922                 (void) strcpy(eofstr, gettext("EOF in string\n"));
 923                 first_time = 0;
 924         }
 925
 926         cp = linebuf;
 927         while ((c = lexgetc()) != endc) {
 928                 if (c == '\n')
 929                         awkerr(regx ? nlre : nlstr);
 930                 if (c == '\\') {
 931                         switch (c = lexgetc(), c) {
 932                         case '\\':
 933                                 if (regx)
 934                                         *cp++ = '\\';
 935                                 break;
 936
 937                         case '/':
 938                                 c = '/';
 939                                 break;
 940
 941                         case 'n':
 942                                 c = '\n';
 943                                 break;
 944
 945                         case 'b':
 946                                 c = '\b';
 947                                 break;
 948
 949                         case 't':
 950                                 c = '\t';
 951                                 break;
 952
 953                         case 'r':
 954                                 c = '\r';
 955                                 break;
 956
 957                         case 'f':
 958                                 c = '\f';
 959                                 break;
 960
 961                         case 'v':
 962                                 c = '\v';
 963                                 break;
 964
 965                         case 'a':
 966                                 c = (char)0x07;
 967                                 break;
 968
 969                         case 'x':
 970                                 n = 0;
 971                                 while (iswxdigit(c = lexgetc())) {
 972                                         if (iswdigit(c))
 973                                                 c -= '0';
 974                                         else if (iswupper(c))
 975                                                 c -= 'A'-10;
 976                                         else
 977                                                 c -= 'a'-10;
 978                                         n = (n<<4) + c;
 979                                 }
 980                                 lexungetc(c);
 981                                 c = n;
 982                                 break;
 983
 984                         case '0':
 985                         case '1':
 986                         case '2':
 987                         case '3':
 988                         case '4':
 989                         case '5':
 990                         case '6':
 991                         case '7':
 992 #if 0
 993 /*
 994  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
 995  * requires processing of the octal escapes both in strings and
 996  * regular expressions. The following code is disabled instead of
 997  * removed as back-referencing may be reintroduced in a future draft
 998  * of the standard.
 999  */
1000                                 /*
1001                                  * For regular expressions, we disallow
1002                                  * \ooo to mean octal character, in favour
1003                                  * of back referencing.
1004                                  */
1005                                 if (regx) {
1006                                         *cp++ = '\\';
1007                                         break;
1008                                 }
1009 #endif
1010                                 max = 3;
1011                                 n = 0;
1012                                 do {
1013                                         n = (n<<3) + c-'0';
1014                                         if ((c = lexgetc()) > '7' || c < '0')
1015                                                 break;
1016                                 } while (--max);
1017                                 lexungetc(c);
1018                                 /*
1019                                  * an octal escape sequence must have at least
1020                                  * 2 digits after the backslash, otherwise
1021                                  * it gets passed straight thru for possible
1022                                  * use in backreferencing.
1023                                  */
1024                                 if (max == 3) {
1025                                         *cp++ = '\\';
1026                                         n += '0';
1027                                 }
1028                                 c = n;
1029                                 break;
1030
1031                         case '\n':
1032                                 continue;
1033
1034                         default:
1035                                 if (c != endc || cmd_line_operand) {
1036                                         *cp++ = '\\';
1037                                         if (c == endc)
1038                                                 lexungetc(c);
1039                                 }
1040                         }
1041                 }
1042                 if (c == WEOF)
1043                         awkerr(regx ? eofre : eofstr);
1044                 *cp++ = c;
1045         }
1046         *cp = '\0';
1047         return (cp - linebuf);
1048 }
1049
1050 /*
1051  * Build a regular expression NODE.
1052  * Argument is the string holding the expression.
1053  */
1054 NODE *
1055 renode(wchar_t *s)
1056 {
1057         NODE *np;
1058         int n;
1059
1060         np = emptynode(RE, 0);
1061         np->n_left = np->n_right = NNULL;
1062         if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
1063                 int m;
1064                 char *p;
1065
1066                 m = REGWERROR(n, np->n_regexp, NULL, 0);
1067                 p = (char *)emalloc(m);
1068                 REGWERROR(n, np->n_regexp, p, m);
1069                 awkerr("/%S/: %s", s, p);
1070         }
1071         return (np);
1072 }
1073 /*
1074  * Get a character for the lexical analyser routine.
1075  */
1076 static wint_t
1077 lexgetc()
1078 {
1079         wint_t c;
1080         static char **files = &progfiles[0];
1081
1082         if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
1083                 ;
1084         else {
1085                 if (progptr != NULL) {
1086                         if (proglen-- <= 0)
1087                                 c = WEOF;
1088                         else
1089                                 c = *progptr++;
1090                 } else {
1091                         if (progfp != FNULL) {
1092                                 if (progfp != stdin)
1093                                         (void) fclose(progfp);
1094                                 else
1095                                         clearerr(progfp);
1096                                 progfp = FNULL;
1097                         }
1098                         if (files < progfilep) {
1099                                 filename = *files++;
1100                                 lineno = 1;
1101                                 if (filename[0] == '-' && filename[1] == '\0')
1102                                         progfp = stdin;
1103                                 else if ((progfp = fopen(filename, r))
1104                                     == FNULL) {
1105                                         (void) fprintf(stderr,
1106                                 gettext("script file \"%s\""), filename);
1107                                         exit(1);
1108                                 }
1109                                 c = fgetwc(progfp);
1110                         }
1111                 }
1112         }
1113         if (c == '\n')
1114                 ++lineno;
1115         if (conptr >= &context[NCONTEXT])
1116                 conptr = &context[0];
1117         if (c != WEOF)
1118                 *conptr++ = c;
1119         return (c);
1120 }
1121
1122 /*
1123  * Return a character for lexical analyser.
1124  * Only one returned character is (not enforced) legitimite.
1125  */
1126 static void
1127 lexungetc(wint_t c)
1128 {
1129         if (c == '\n')
1130                 --lineno;
1131         if (c != WEOF) {
1132                 if (conptr == &context[0])
1133                         conptr = &context[NCONTEXT];
1134                 *--conptr = '\0';
1135         }
1136         if (progfp != FNULL) {
1137                 (void) ungetwc(c, progfp);
1138                 return;
1139         }
1140         if (c == WEOF)
1141                 return;
1142         *--progptr = c;
1143         proglen++;
1144 }
1145
1146 /*
1147  * Syntax errors during parsing.
1148  */
1149 void
1150 yyerror(char *s, ...)
1151 {
1152         if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
1153                 if (lexlast == KEYWORD)
1154                         awkerr(gettext("inadmissible use of reserved keyword"));
1155                 else
1156                         awkerr(gettext("attempt to redefine builtin function"));
1157         awkerr(s);
1158 }
1159
/*
 * General error routine for awk: formats `fmt' and its arguments
 * via awkierr() without an errno-specific suffix.
 */
/* ARGSUSED */
void
awkerr(char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        awkierr(0, fmt, ap);
        va_end(ap);
}
1173
/*
 * Same as awkerr(), but asks awkierr() to append the text
 * describing the current errno to the message.
 */
/* ARGSUSED */
void
awkperr(char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        awkierr(1, fmt, ap);
        va_end(ap);
}
1188
1189 /*
1190  * Common internal routine for awkerr, awkperr
1191  */
1192 static void
1193 awkierr(int perr, char *fmt, va_list ap)
1194 {
1195         static char sep1[] = "\n>>>\t";
1196         static char sep2[] = "\t<<<";
1197         int saveerr = errno;
1198
1199         (void) fprintf(stderr, "%s: ", _cmdname);
1200         if (running) {
1201                 (void) fprintf(stderr, gettext("line %u ("),
1202                     curnode == NNULL ? 0 : curnode->n_lineno);
1203                 if (phase == 0)
1204                         (void) fprintf(stderr, "NR=%lld): ",
1205                             (INT)exprint(varNR));
1206                 else
1207                         (void) fprintf(stderr, "%s): ",
1208                             phase == BEGIN ? s_BEGIN : s_END);
1209         } else if (lineno != 0) {
1210                 (void) fprintf(stderr, gettext("file \"%s\": "), filename);
1211                 (void) fprintf(stderr, gettext("line %u: "), lineno);
1212         }
1213         (void) vfprintf(stderr, gettext(fmt), ap);
1214         if (perr == 1)
1215                 (void) fprintf(stderr, ": %s", strerror(saveerr));
1216         if (perr != 2 && !running) {
1217                 wchar_t *cp;
1218                 int n;
1219                 int c;
1220
1221                 (void) fprintf(stderr, gettext("  Context is:%s"), sep1);
1222                 cp = conptr;
1223                 n = NCONTEXT;
1224                 do {
1225                         if (cp >= &context[NCONTEXT])
1226                                 cp = &context[0];
1227                         if ((c = *cp++) != '\0')
1228                                 (void) fputs(c == '\n' ? sep1 : toprint(c),
1229                                     stderr);
1230                 } while (--n != 0);
1231                 (void) fputs(sep2, stderr);
1232         }
1233         (void) fprintf(stderr, "\n");
1234         exit(1);
1235 }
1236
1237 wchar_t *
1238 emalloc(unsigned n)
1239 {
1240         wchar_t *cp;
1241
1242         if ((cp = malloc(n)) == NULL)
1243                 awkerr(nomem);
1244         return (cp);
1245 }
1246
1247 wchar_t *
1248 erealloc(wchar_t *p, unsigned n)
1249 {
1250         wchar_t *cp;
1251
1252         if ((cp = realloc(p, n)) == NULL)
1253                 awkerr(nomem);
1254         return (cp);
1255 }
1256
1257
/*
 * Print the awk usage synopsis on stderr.
 * Returns 2.
 */
static int
usage()
{
        const int status = 2;

        (void) fprintf(stderr, gettext(
"Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
"       awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n"));

        return (status);
}
1269
1270
/*
 * Convert the multibyte string `str' to a wide string.
 * The result is held in one static slot: each call frees the buffer
 * returned by the previous call, so the caller must not free it and
 * must not expect it to survive the next call.
 */
static wchar_t *
mbconvert(char *str)
{
        static wchar_t *last = NULL;

        free(last);             /* free(NULL) is a no-op */
        last = mbstowcsdup(str);
        return (last);
}
1280
/*
 * Convert the wide string `str' to a multibyte string.
 * The result is held in one static slot: each call frees the buffer
 * returned by the previous call, so the caller must not free it and
 * must not expect it to survive the next call.
 */
char *
mbunconvert(wchar_t *str)
{
        static char *last = NULL;

        free(last);             /* free(NULL) is a no-op */
        last = wcstombsdup(str);
        return (last);
}
1290
1291 /*
1292  * Solaris port - following functions are typical MKS functions written
1293  * to work for Solaris.
1294  */
1295
/*
 * Duplicate the multibyte string `s' as a newly allocated wide string.
 *
 * Returns the new string, which the caller must free(), or NULL when
 * memory cannot be obtained or `s' holds an invalid multibyte
 * sequence.  On a conversion failure the buffer is released and errno
 * is left as set by mbstowcs().
 */
wchar_t *
mbstowcsdup(char *s)
{
        size_t n;
        wchar_t *w;

        /* one wide character per byte is the worst case, plus the \0 */
        n = strlen(s) + 1;
        if ((w = malloc(n * sizeof (wchar_t))) == NULL)
                return (NULL);

        if (mbstowcs(w, s, n) == (size_t)-1) {
                int saverr = errno;

                free(w);
                errno = saverr;
                return (NULL);
        }
        return (w);
}
1311
/*
 * Duplicate the wide string `w' as a newly allocated multibyte string.
 *
 * Returns the new string, which the caller must free(), or NULL when
 * memory cannot be obtained or `w' holds a character that cannot be
 * converted.  On a conversion failure errno is left as set by
 * wcstombs().
 */
char *
wcstombsdup(wchar_t *w)
{
        size_t cap, used;
        char *mb, *fit;

        /* Fetch memory for worst case string length */
        cap = (wcslen(w) + 1) * MB_CUR_MAX;
        if ((mb = malloc(cap)) == NULL)
                return (NULL);

        /* Convert the string */
        if ((used = wcstombs(mb, w, cap)) == (size_t)-1) {
                int saverr = errno;

                free(mb);
                errno = saverr;
                return (NULL);
        }

        /*
         * Shrink the string down.  If the shrink fails the original
         * buffer is still valid, so hand that back instead of losing it.
         */
        if ((fit = realloc(mb, used + 1)) != NULL)
                mb = fit;
        return (mb);
}
1340
/*
 * The upe_ctrls[] table contains the printable 'control-sequences' for the
 * character values 0..31 and 127.  The first entry is for value 127, thus the
 * entries for the remaining character values are from 1..32.
 */
static const char *const upe_ctrls[] =
{
        "^?",
        "^@",  "^A",  "^B",  "^C",  "^D",  "^E",  "^F",  "^G",
        "^H",  "^I",  "^J",  "^K",  "^L",  "^M",  "^N",  "^O",
        "^P",  "^Q",  "^R",  "^S",  "^T",  "^U",  "^V",  "^W",
        "^X",  "^Y",  "^Z",  "^[",  "^\\", "^]",  "^^",  "^_"
};


/*
 * Return a printable string corresponding to the given character value.
 *
 *   - a printable character is returned as its own multibyte sequence;
 *   - 127 and the values 0..31 are returned as the "^X" strings of
 *     upe_ctrls[] (table 5-101 in the UPE);
 *   - any other character is returned as one "\ooo" octal escape per
 *     byte of its multibyte form;
 *   - a character that cannot be converted at all is returned as
 *     "\" followed by its value in hexadecimal.
 *
 * The result points at static storage (or at a table entry) and is
 * overwritten by the next call.
 */
static const char *
toprint(wchar_t c)
{
        int n, len;
        unsigned char *ptr;
        static char mbch[MB_LEN_MAX+1];
        static char buf[5 * MB_LEN_MAX + 1];

        if ((n = wctomb(mbch, c)) == -1) {
                /* Should never happen */
                (void) snprintf(buf, sizeof (buf), "\\%x", (unsigned int)c);
                return (buf);
        }
        mbch[n] = '\0';

        if (iswprint(c))
                return (mbch);
        if (c == 127)
                return (upe_ctrls[0]);
        /* the unsigned compare also rejects negative values */
        if ((unsigned long)c < 32) {
                /* Print as in Table 5-101 in the UPE */
                return (upe_ctrls[c+1]);
        }

        /* Print as an octal escape sequence */
        for (len = 0, ptr = (unsigned char *)mbch; 0 < n; --n, ++ptr)
                len += snprintf(buf+len, sizeof (buf) - len, "\\%03o", *ptr);
        return (buf);
}
1390
/*
 * Translate a byte offset into the multibyte form of `astring' to the
 * corresponding wide-character index.
 *
 * Characters are converted one at a time with wctomb() and their byte
 * lengths summed until `off' bytes are covered; a character that
 * cannot be converted is counted as one byte.  The walk never goes
 * past the terminating L'\0', so an offset beyond the end yields the
 * index of the terminator.
 */
static int
wcoff(const wchar_t *astring, const int off)
{
        const wchar_t *s = astring;
        int c = 0;
        char mb[MB_LEN_MAX];

        /*
         * wctomb() reports 1 byte, not 0, for L'\0', so the end of the
         * string has to be tested explicitly.
         */
        while (c < off && *s != L'\0') {
                int n = wctomb(mb, *s);

                if (n == -1)
                        n = 1;
                c += n;
                s++;
        }

        return ((int)(s - astring));
}
1410
1411 #define NREGHASH        64
1412 #define NREGHOLD        1024    /* max number unused entries */
1413
1414 static int      nregunref;
1415
1416 struct reghashq {
1417         struct qelem hq;
1418         struct regcache *regcachep;
1419 };
1420
1421 struct regcache {
1422         struct qelem    lq;
1423         wchar_t *pattern;
1424         regex_t re;
1425         int     refcnt;
1426         struct reghashq hash;
1427 };
1428
1429 static struct qelem reghash[NREGHASH], reglink;
1430
1431 /*
1432  * Generate a hash value of the given wchar string.
1433  * The hashing method is similar to what Java does for strings.
1434  */
1435 static uint_t
1436 regtxthash(const wchar_t *str)
1437 {
1438         int k = 0;
1439
1440         while (*str != L'\0')
1441                 k = (31 * k) + *str++;
1442
1443         k += ~(k << 9);
1444         k ^=  (k >> 14);
1445         k +=  (k << 4);
1446         k ^=  (k >> 10);
1447
1448         return (k % NREGHASH);
1449 }
1450
1451 int
1452 int_regwcomp(REGEXP *r, const wchar_t *pattern)
1453 {
1454         regex_t re;
1455         char *mbpattern;
1456         int ret;
1457         uint_t key;
1458         struct qelem *qp;
1459         struct regcache *rcp;
1460
1461         key = regtxthash(pattern);
1462         for (qp = reghash[key].q_forw; qp != NULL; qp = qp->q_forw) {
1463                 rcp = ((struct reghashq *)qp)->regcachep;
1464                 if (*rcp->pattern == *pattern &&
1465                     wcscmp(rcp->pattern, pattern) == 0)
1466                         break;
1467         }
1468         if (qp != NULL) {
1469                 /* update link. put this one at the beginning */
1470                 if (rcp != (struct regcache *)reglink.q_forw) {
1471                         remque(&rcp->lq);
1472                         insque(&rcp->lq, &reglink);
1473                 }
1474                 if (rcp->refcnt == 0)
1475                         nregunref--;    /* no longer unref'ed */
1476                 rcp->refcnt++;
1477                 *(struct regcache **)r = rcp;
1478                 return (REG_OK);
1479         }
1480
1481         if ((mbpattern = wcstombsdup((wchar_t *)pattern)) == NULL)
1482                 return (REG_ESPACE);
1483
1484         ret = regcomp(&re, mbpattern, REG_EXTENDED);
1485
1486         free(mbpattern);
1487
1488         if (ret != REG_OK)
1489                 return (ret);
1490
1491         if ((rcp = malloc(sizeof (struct regcache))) == NULL)
1492                 return (REG_ESPACE);
1493         rcp->re = re;
1494         if ((rcp->pattern = wsdup(pattern)) == NULL) {
1495                 regfree(&re);
1496                 free(rcp);
1497                 return (REG_ESPACE);
1498         }
1499         rcp->refcnt = 1;
1500         insque(&rcp->lq, &reglink);
1501         insque(&rcp->hash.hq, &reghash[key]);
1502         rcp->hash.regcachep = rcp;
1503
1504         *(struct regcache **)r = rcp;
1505         return (ret);
1506 }
1507
1508 void
1509 int_regwfree(REGEXP r)
1510 {
1511         int     cnt;
1512         struct qelem *qp, *nqp;
1513         struct regcache *rcp;
1514
1515         rcp = (struct regcache *)r;
1516
1517         if (--rcp->refcnt != 0)
1518                 return;
1519
1520         /* this cache has no reference */
1521         if (++nregunref < NREGHOLD)
1522                 return;
1523
1524         /*
1525          * We've got too much unref'ed regex. Free half of least
1526          * used regex.
1527          */
1528         cnt = 0;
1529         for (qp = reglink.q_forw; qp != NULL; qp = nqp) {
1530                 nqp = qp->q_forw;
1531                 rcp = (struct regcache *)qp;
1532                 if (rcp->refcnt != 0)
1533                         continue;
1534
1535                 /* free half of them */
1536                 if (++cnt < (NREGHOLD / 2))
1537                         continue;
1538
1539                 /* detach and free */
1540                 remque(&rcp->lq);
1541                 remque(&rcp->hash.hq);
1542
1543                 /* free up */
1544                 free(rcp->pattern);
1545                 regfree(&rcp->re);
1546                 free(rcp);
1547
1548                 nregunref--;
1549         }
1550 }
1551
1552 size_t
1553 int_regwerror(int errcode, REGEXP r, char *errbuf, size_t bufsiz)
1554 {
1555         struct regcache *rcp;
1556
1557         rcp = (struct regcache *)r;
1558         return (regerror(errcode, &rcp->re, errbuf, bufsiz));
1559 }
1560
1561 int
1562 int_regwexec(REGEXP r,          /* compiled RE */
1563     const wchar_t *astring,     /* subject string */
1564     size_t nsub,                /* number of subexpressions */
1565     int_regwmatch_t *sub,       /* subexpression pointers */
1566     int flags)
1567 {
1568         char *mbs;
1569         regmatch_t *mbsub = NULL;
1570         int i;
1571         struct regcache *rcp;
1572
1573         if ((mbs = wcstombsdup((wchar_t *)astring)) == NULL)
1574                 return (REG_ESPACE);
1575
1576         if (nsub > 0 && sub) {
1577                 if ((mbsub = malloc(nsub * sizeof (regmatch_t))) == NULL)
1578                         return (REG_ESPACE);
1579         }
1580
1581         rcp = (struct regcache *)r;
1582
1583         i = regexec(&rcp->re, mbs, nsub, mbsub, flags);
1584
1585         /* Now, adjust the pointers/counts in sub */
1586         if (i == REG_OK && nsub > 0 && mbsub) {
1587                 int j, k;
1588
1589                 for (j = 0; j < nsub; j++) {
1590                         regmatch_t *ms = &mbsub[j];
1591                         int_regwmatch_t *ws = &sub[j];
1592
1593                         if ((k = ms->rm_so) >= 0) {
1594                                 ws->rm_so = wcoff(astring, k);
1595                                 ws->rm_sp = astring + ws->rm_so;
1596                         }
1597                         if ((k = ms->rm_eo) >= 0) {
1598                                 ws->rm_eo = wcoff(astring, k);
1599                                 ws->rm_ep = astring + ws->rm_eo;
1600                         }
1601                 }
1602         }
1603
1604         free(mbs);
1605         if (mbsub)
1606                 free(mbsub);
1607         return (i);
1608 }
1609
1610 int
1611 int_regwdosuba(REGEXP rp,       /* compiled RE: Pattern */
1612     const wchar_t *rpl,         /* replacement string: /rpl/ */
1613     const wchar_t *src,         /* source string */
1614     wchar_t **dstp,             /* destination string */
1615     int len,                    /* destination length */
1616     int *globp)         /* IN: occurence, 0 for all; OUT: substitutions */
1617 {
1618         wchar_t *dst, *odst;
1619         const wchar_t *ip, *xp;
1620         wchar_t *op;
1621         int i;
1622         wchar_t c;
1623         int glob, iglob = *globp, oglob = 0;
1624 #define NSUB    10
1625         int_regwmatch_t rm[NSUB], *rmp;
1626         int flags;
1627         wchar_t *end;
1628         int regerr;
1629
1630 /* handle overflow of dst. we need "i" more bytes */
1631 #ifdef OVERFLOW
1632 #undef OVERFLOW
1633 #define OVERFLOW(i) { \
1634                 int pos = op - dst; \
1635                 dst = (wchar_t *)realloc(odst = dst, \
1636                         (len += len + i) * sizeof (wchar_t)); \
1637                 if (dst == NULL) \
1638                         goto nospace; \
1639                 op = dst + pos; \
1640                 end = dst + len; \
1641         }
1642 #endif
1643
1644         *dstp = dst = (wchar_t *)malloc(len * sizeof (wchar_t));
1645         if (dst == NULL)
1646                 return (REG_ESPACE);
1647
1648         if (rp == NULL || rpl == NULL || src == NULL || dst ==  NULL)
1649                 return (REG_EFATAL);
1650
1651         glob = 0;       /* match count */
1652         ip = src;       /* source position */
1653         op = dst;       /* destination position */
1654         end = dst + len;
1655
1656         flags = 0;
1657         while ((regerr = int_regwexec(rp, ip, NSUB, rm, flags)) == REG_OK) {
1658                 /* Copy text preceding match */
1659                 if (op + (i = rm[0].rm_sp - ip) >= end)
1660                         OVERFLOW(i)
1661                 while (i--)
1662                         *op++ = *ip++;
1663
1664                 if (iglob == 0 || ++glob == iglob) {
1665                         oglob++;
1666                         xp = rpl;               /* do substitute */
1667                 } else
1668                         xp = L"&";              /* preserve text */
1669
1670                 /* Perform replacement of matched substing */
1671                 while ((c = *xp++) != '\0') {
1672                         rmp = NULL;
1673                         if (c == '&')
1674                                 rmp = &rm[0];
1675                         else if (c == '\\') {
1676                                 if ('0' <= *xp && *xp <= '9')
1677                                         rmp = &rm[*xp++ - '0'];
1678                                 else if (*xp != '\0')
1679                                         c = *xp++;
1680                         }
1681
1682                         if (rmp ==  NULL) {     /* Ordinary character. */
1683                                 *op++ = c;
1684                                 if (op >= end)
1685                                         OVERFLOW(1)
1686                         } else if (rmp->rm_sp != NULL && rmp->rm_ep != NULL) {
1687                                 ip = rmp->rm_sp;
1688                                 if (op + (i = rmp->rm_ep - rmp->rm_sp) >= end)
1689                                         OVERFLOW(i)
1690                                 while (i--)
1691                                         *op++ = *ip++;
1692                         }
1693                 }
1694
1695                 ip = rm[0].rm_ep;
1696                 if (*ip == '\0')        /* If at end break */
1697                         break;
1698                 else if (rm[0].rm_sp == rm[0].rm_ep) {
1699                         /* If empty match copy next char */
1700                         *op++ = *ip++;
1701                         if (op >= end)
1702                                 OVERFLOW(1)
1703                 }
1704                 flags = REG_NOTBOL;
1705         }
1706
1707         if (regerr != REG_OK && regerr != REG_NOMATCH)
1708                 return (regerr);
1709
1710         /* Copy rest of text */
1711         if (op + (i =  wcslen(ip)) >= end)
1712                 OVERFLOW(i)
1713         while (i--)
1714                 *op++ = *ip++;
1715         *op++ = '\0';
1716
1717         if ((*dstp = dst = (wchar_t *)realloc(odst = dst,
1718             sizeof (wchar_t) * (size_t)(op - dst))) == NULL) {
1719 nospace:
1720                 free(odst);
1721                 return (REG_ESPACE);
1722         }
1723
1724         *globp = oglob;
1725
1726         return ((oglob == 0) ? REG_NOMATCH : REG_OK);
1727 }