3065 some functions in the tcp module can be static
[unleashed.git] / usr / src / cmd / man / src / util / nsgmls.src / lib / parseMode.cxx
blob9920b0220da0546b3cdcf1c487397dcd296f162c
1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident "%Z%%M% %I% %E% SMI"
5 #include "splib.h"
6 #include "Parser.h"
7 #include "ParserMessages.h"
8 #include "MessageArg.h"
9 #include "TokenMessageArg.h"
10 #include "ModeInfo.h"
11 #include "Partition.h"
12 #include "SrInfo.h"
13 #include "Vector.h"
14 #include "ISetIter.h"
15 #include "token.h"
16 #include "TrieBuilder.h"
17 #include "macros.h"
19 #ifdef SP_NAMESPACE
20 namespace SP_NAMESPACE {
21 #endif
// Bit flags recorded for each entry of the mode table in this file.
// They are OR-ed together and tested by the compile*Modes() functions to
// decide which recognizer modes get compiled in which phase.
enum {
  modeUsedInSd = 01,        // selected by compileSdModes()
  modeUsedInProlog = 02,    // selected by compilePrologModes()
  modeUsedInInstance = 04,  // selected by compileInstanceModes()
  modeUsesSr = 010          // mode recognizes short references
};
30 static struct {
31 Mode mode;
32 unsigned flags;
33 } modeTable[] = {
34 { grpMode, modeUsedInProlog|modeUsedInInstance },
35 { alitMode, modeUsedInProlog|modeUsedInInstance },
36 { alitaMode, modeUsedInProlog|modeUsedInInstance },
37 { aliteMode, modeUsedInProlog|modeUsedInInstance },
38 { talitMode, modeUsedInProlog|modeUsedInInstance },
39 { talitaMode, modeUsedInProlog|modeUsedInInstance },
40 { taliteMode, modeUsedInProlog|modeUsedInInstance },
41 { mdMode, modeUsedInProlog|modeUsedInInstance },
42 { mdMinusMode, modeUsedInProlog },
43 { mdPeroMode, modeUsedInProlog },
44 { sdMode, modeUsedInSd },
45 { comMode, modeUsedInProlog|modeUsedInInstance },
46 { sdcomMode, modeUsedInSd },
47 { piMode, modeUsedInProlog|modeUsedInInstance },
48 { refMode, modeUsedInProlog|modeUsedInInstance|modeUsedInSd },
49 { imsMode, modeUsedInProlog|modeUsedInInstance },
50 { cmsMode, modeUsedInProlog|modeUsedInInstance },
51 { rcmsMode, modeUsedInProlog|modeUsedInInstance },
52 { proMode, modeUsedInProlog },
53 { dsMode, modeUsedInProlog },
54 { dsiMode, modeUsedInProlog },
55 { plitMode, modeUsedInProlog },
56 { plitaMode, modeUsedInProlog },
57 { pliteMode, modeUsedInProlog },
58 { sdplitMode, modeUsedInSd },
59 { sdplitaMode, modeUsedInSd },
60 { grpsufMode, modeUsedInProlog },
61 { mlitMode, modeUsedInProlog|modeUsedInSd },
62 { mlitaMode, modeUsedInProlog|modeUsedInSd },
63 { asMode, modeUsedInProlog },
64 { slitMode, modeUsedInProlog },
65 { slitaMode, modeUsedInProlog },
66 { sdslitMode, modeUsedInSd },
67 { sdslitaMode, modeUsedInSd },
68 { cconMode, modeUsedInInstance },
69 { rcconMode, modeUsedInInstance },
70 { cconnetMode, modeUsedInInstance },
71 { rcconnetMode, modeUsedInInstance },
72 { rcconeMode, modeUsedInInstance },
73 { tagMode, modeUsedInInstance },
74 { econMode, modeUsedInInstance|modeUsesSr },
75 { mconMode, modeUsedInInstance|modeUsesSr },
76 { econnetMode, modeUsedInInstance|modeUsesSr },
77 { mconnetMode, modeUsedInInstance|modeUsesSr },
80 void Parser::compileSdModes()
82 Mode modes[nModes];
83 int n = 0;
84 for (size_t i = 0; i < SIZEOF(modeTable); i++)
85 if (modeTable[i].flags & modeUsedInSd)
86 modes[n++] = modeTable[i].mode;
87 compileModes(modes, n, 0);
90 void Parser::compilePrologModes()
92 Boolean scopeInstance = sd().scopeInstance();
93 Boolean haveSr = syntax().hasShortrefs();
94 Mode modes[nModes];
95 int n = 0;
96 for (size_t i = 0; i < SIZEOF(modeTable); i++) {
97 if (scopeInstance) {
98 if (modeTable[i].flags & modeUsedInProlog)
99 modes[n++] = modeTable[i].mode;
101 else if (haveSr) {
102 if ((modeTable[i].flags & (modeUsedInInstance|modeUsedInProlog))
103 && !(modeTable[i].flags & modeUsesSr))
104 modes[n++] = modeTable[i].mode;
106 else {
107 if (modeTable[i].flags & (modeUsedInInstance|modeUsedInProlog))
108 modes[n++] = modeTable[i].mode;
111 compileModes(modes, n, 0);
114 void Parser::compileInstanceModes()
116 Boolean scopeInstance = sd().scopeInstance();
117 compileNormalMap();
118 if (!scopeInstance && !syntax().hasShortrefs())
119 return;
120 Mode modes[nModes];
121 int n = 0;
122 for (size_t i = 0; i < SIZEOF(modeTable); i++) {
123 if (scopeInstance) {
124 if (modeTable[i].flags & modeUsedInInstance)
125 modes[n++] = modeTable[i].mode;
127 else {
128 if (modeTable[i].flags & modeUsesSr)
129 modes[n++] = modeTable[i].mode;
132 compileModes(modes, n, &currentDtd());
135 void Parser::compileModes(const Mode *modes,
136 int n,
137 const Dtd *dtd)
139 PackedBoolean sets[Syntax::nSet];
140 PackedBoolean delims[Syntax::nDelimGeneral];
141 PackedBoolean functions[3];
142 int i;
143 Boolean includesShortref = 0;
144 for (i = 0; i < Syntax::nSet; i++)
145 sets[i] = 0;
146 for (i = 0; i < Syntax::nDelimGeneral; i++)
147 delims[i] = 0;
148 for (i = 0; i < 3; i++)
149 functions[i] = 0;
151 for (i = 0; i < n; i++) {
152 ModeInfo iter(modes[i], sd());
153 TokenInfo ti;
154 while (iter.nextToken(&ti)) {
155 switch (ti.type) {
156 case TokenInfo::delimType:
157 delims[ti.delim1] = 1;
158 break;
159 case TokenInfo::delimDelimType:
160 delims[ti.delim1] = 1;
161 delims[ti.delim2] = 1;
162 break;
163 case TokenInfo::delimSetType:
164 delims[ti.delim1] = 1;
165 // fall through
166 case TokenInfo::setType:
167 sets[ti.set] = 1;
168 break;
169 case TokenInfo::functionType:
170 functions[ti.function] = 1;
171 break;
174 if (!includesShortref && iter.includesShortref())
175 includesShortref = 1;
178 ISet<Char> chars;
180 for (i = 0; i < 3; i++)
181 if (functions[i])
182 chars.add(syntax().standardFunction(i));
183 for (i = 0; i < Syntax::nDelimGeneral; i++)
184 if (delims[i]) {
185 const StringC &str = syntax().delimGeneral(i);
186 for (size_t j = 0; j < str.size(); j++)
187 chars.add(str[j]);
189 if (includesShortref && dtd) {
190 size_t n = dtd->nShortref();
191 for (size_t i = 0; i < n; i++) {
192 const StringC &delim = dtd->shortref(i);
193 size_t len = delim.size();
194 for (size_t j = 0; j < len; j++)
195 if (delim[j] == sd().execToInternal('B'))
196 sets[Syntax::blank] = 1;
197 else
198 chars.add(delim[j]);
202 const ISet<Char> *csets[Syntax::nSet];
203 int usedSets = 0;
204 for (i = 0; i < Syntax::nSet; i++)
205 if (sets[i])
206 csets[usedSets++] = syntax().charSet(i);
208 Partition partition(chars, csets, usedSets, *syntax().generalSubstTable());
210 String<EquivCode> setCodes[Syntax::nSet];
212 int nCodes = 0;
213 for (i = 0; i < Syntax::nSet; i++)
214 if (sets[i])
215 setCodes[i] = partition.setCodes(nCodes++);
217 String<EquivCode> delimCodes[Syntax::nDelimGeneral];
218 for (i = 0; i < Syntax::nDelimGeneral; i++)
219 if (delims[i]) {
220 StringC str = syntax().delimGeneral(i);
221 for (size_t j = 0; j < str.size(); j++)
222 delimCodes[i] += partition.charCode(str[j]);
225 String<EquivCode> functionCode[3];
226 for (i = 0; i < 3; i++)
227 if (functions[i])
228 functionCode[i] += partition.charCode(syntax().standardFunction(i));
230 Vector<SrInfo> srInfo;
232 int nShortref;
233 if (!includesShortref || !dtd)
234 nShortref = 0;
235 else {
236 nShortref = dtd->nShortref();
237 srInfo.resize(nShortref);
239 for (i = 0; i < nShortref; i++) {
240 const StringC delim = dtd->shortref(i);
241 SrInfo *p = &srInfo[i];
242 size_t j;
243 for (j = 0; j < delim.size(); j++) {
244 if (delim[j] == sd().execToInternal('B'))
245 break;
246 p->chars += partition.charCode(delim[j]);
248 if (j < delim.size()) {
249 p->bSequenceLength = 1;
250 for (++j; j < delim.size(); j++) {
251 if (delim[j] != sd().execToInternal('B'))
252 break;
253 p->bSequenceLength += 1;
255 for (; j < delim.size(); j++)
256 p->chars2 += partition.charCode(delim[j]);
258 else
259 p->bSequenceLength = 0;
263 String<EquivCode> dataDelimCodes;
264 if (options().warnDataDelim) {
265 ModeInfo iter(mconMode, sd());
266 TokenInfo ti;
267 while (iter.nextToken(&ti)) {
268 switch (ti.type) {
269 case TokenInfo::delimType:
270 case TokenInfo::delimDelimType:
271 case TokenInfo::delimSetType:
273 if (ti.token == tokenMscMdc)
274 break;
275 const StringC &delim = syntax().delimGeneral(ti.delim1);
276 if (!delim.size())
277 break;
278 EquivCode c = partition.charCode(delim[0]);
279 for (size_t i = 0; ; i++) {
280 if (i >= dataDelimCodes.size()) {
281 dataDelimCodes += c;
282 break;
284 if (dataDelimCodes[i] == c)
285 break;
288 break;
289 default:
290 break;
295 const String<EquivCode> emptyString;
296 Boolean multicode = syntax().multicode();
297 for (i = 0; i < n; i++) {
298 TrieBuilder tb(partition.maxCode() + 1);
299 TrieBuilder::TokenVector ambiguities;
300 Vector<Token> suppressTokens;
301 if (multicode) {
302 suppressTokens.assign(partition.maxCode() + 1, 0);
303 suppressTokens[partition.eECode()] = tokenEe;
305 tb.recognizeEE(partition.eECode(), tokenEe);
306 ModeInfo iter(modes[i], sd());
307 TokenInfo ti;
308 // We try to handle the possibility that some delimiters may be empty;
309 // this might happen when compiling recognizers for the SGML declaration.
310 while (iter.nextToken(&ti)) {
311 switch (ti.type) {
312 case TokenInfo::delimType:
313 if (delimCodes[ti.delim1].size() > 0)
314 tb.recognize(delimCodes[ti.delim1], ti.token,
315 ti.priority, ambiguities);
316 break;
317 case TokenInfo::delimDelimType:
319 String<EquivCode> str(delimCodes[ti.delim1]);
320 if (str.size() > 0 && delimCodes[ti.delim2].size() > 0) {
321 str += delimCodes[ti.delim2];
322 tb.recognize(str, ti.token, ti.priority, ambiguities);
325 break;
326 case TokenInfo::delimSetType:
327 if (delimCodes[ti.delim1].size() > 0)
328 tb.recognize(delimCodes[ti.delim1], setCodes[ti.set],
329 ti.token, ti.priority, ambiguities);
330 break;
331 case TokenInfo::setType:
332 tb.recognize(emptyString, setCodes[ti.set], ti.token,
333 ti.priority, ambiguities);
334 if (multicode) {
335 const String<EquivCode> &equivCodes = setCodes[ti.set];
336 for (size_t j = 0; j < equivCodes.size(); j++)
337 suppressTokens[equivCodes[j]] = ti.token;
339 break;
340 case TokenInfo::functionType:
341 tb.recognize(functionCode[ti.function], ti.token,
342 ti.priority, ambiguities);
343 if (multicode)
344 suppressTokens[functionCode[ti.function][0]] = ti.token;
345 break;
348 if (iter.includesShortref()) {
349 for (int j = 0; j < nShortref; j++) {
350 const SrInfo *p = &srInfo[j];
351 if (p->bSequenceLength > 0)
352 tb.recognizeB(p->chars, p->bSequenceLength,
353 syntax().quantity(Syntax::qBSEQLEN),
354 setCodes[Syntax::blank],
355 p->chars2, tokenFirstShortref + j,
356 ambiguities);
357 else
358 tb.recognize(p->chars, tokenFirstShortref + j,
359 Priority::delim, ambiguities);
362 if (options().warnDataDelim) {
363 switch (modes[i]) {
364 default:
365 if (!iter.includesShortref())
366 break;
367 // fall through
368 case alitMode:
369 case alitaMode:
370 case aliteMode:
371 case talitMode:
372 case talitaMode:
373 case taliteMode:
374 for (size_t j = 0; j < dataDelimCodes.size(); j++) {
375 String<EquivCode> code;
376 code += dataDelimCodes[j];
377 tb.recognize(code, tokenCharDelim, Priority::dataDelim, ambiguities);
379 break;
380 case plitMode:
381 case plitaMode:
382 case pliteMode:
384 String<EquivCode> code;
385 code += partition.charCode(syntax().delimGeneral(Syntax::dPERO)[0]);
386 tb.recognize(code, tokenCharDelim, Priority::dataDelim, ambiguities);
388 break;
391 setRecognizer(modes[i],
392 (multicode
393 ? new Recognizer(tb.extractTrie(), partition.map(),
394 suppressTokens)
395 : new Recognizer(tb.extractTrie(), partition.map())));
396 // FIXME give more information
397 for (size_t j = 0; j < ambiguities.size(); j += 2)
398 message(ParserMessages::lexicalAmbiguity,
399 TokenMessageArg(ambiguities[j], modes[i], syntaxPointer(),
400 sdPointer()),
401 TokenMessageArg(ambiguities[j + 1], modes[i], syntaxPointer(),
402 sdPointer()));
406 void Parser::compileNormalMap()
408 XcharMap<PackedBoolean> map(0);
409 ISetIter<Char> sgmlCharIter(*syntax().charSet(Syntax::sgmlChar));
410 Char min, max;
411 while (sgmlCharIter.next(min, max))
412 map.setRange(min, max, 1);
413 ModeInfo iter(mconnetMode, sd());
414 TokenInfo ti;
415 while (iter.nextToken(&ti)) {
416 switch (ti.type) {
417 case TokenInfo::delimType:
418 case TokenInfo::delimDelimType:
419 case TokenInfo::delimSetType:
421 const StringC &delim = syntax().delimGeneral(ti.delim1);
422 if (!delim.size())
423 break;
424 Char c = delim[0];
425 map.setChar(c, 0);
426 StringC str(syntax().generalSubstTable()->inverse(c));
427 for (size_t i = 0; i < str.size(); i++)
428 map.setChar(str[i], 0);
430 break;
431 case TokenInfo::setType:
432 if (ti.token != tokenChar) {
433 ISetIter<Char> setIter(*syntax().charSet(ti.set));
434 Char min, max;
435 while (setIter.next(min, max))
436 map.setRange(min, max, 0);
438 break;
439 case TokenInfo::functionType:
440 if (ti.token != tokenChar)
441 map.setChar(syntax().standardFunction(ti.function), 0);
442 break;
445 int nShortref = currentDtd().nShortref();
446 for (int i = 0; i < nShortref; i++) {
447 Char c = currentDtd().shortref(i)[0];
448 if (c == sd().execToInternal('B')) {
449 ISetIter<Char> setIter(*syntax().charSet(Syntax::blank));
450 Char min, max;
451 while (setIter.next(min, max))
452 map.setRange(min, max, 0);
454 else {
455 map.setChar(c, 0);
456 StringC str(syntax().generalSubstTable()->inverse(c));
457 for (size_t j = 0; j < str.size(); j++)
458 map.setChar(str[j], 0);
461 setNormalMap(map);
464 void Parser::addNeededShortrefs(Dtd &dtd, const Syntax &syntax)
466 if (!syntax.hasShortrefs())
467 return;
468 PackedBoolean delimRelevant[Syntax::nDelimGeneral];
469 size_t i;
470 for (i = 0; i < Syntax::nDelimGeneral; i++)
471 delimRelevant[i] = 0;
472 ModeInfo iter(mconnetMode, sd());
473 TokenInfo ti;
474 while (iter.nextToken(&ti)) {
475 switch (ti.type) {
476 case TokenInfo::delimType:
477 case TokenInfo::delimDelimType:
478 case TokenInfo::delimSetType:
479 delimRelevant[ti.delim1] = 1;
480 break;
481 default:
482 break;
486 // PIO and NET are the only delimiters that are recognized in con
487 // mode without context. If a short reference delimiter is
488 // identical to one of these delimiters, then we'll have an
489 // ambiguity. We make such a short reference delimiter needed
490 // to ensure that this ambiguity is reported.
491 if (syntax.isValidShortref(syntax.delimGeneral(Syntax::dPIO)))
492 dtd.addNeededShortref(syntax.delimGeneral(Syntax::dPIO));
493 if (syntax.isValidShortref(syntax.delimGeneral(Syntax::dNET)))
494 dtd.addNeededShortref(syntax.delimGeneral(Syntax::dNET));
496 size_t nShortrefComplex = syntax.nDelimShortrefComplex();
498 // A short reference delimiter is needed if it is used or if it can
499 // contains some other shorter delimiter that is either a relevant general
500 // delimiter or a shortref delimiter that is used.
502 for (i = 0; i < nShortrefComplex; i++) {
503 size_t j;
504 for (j = 0; j < Syntax::nDelimGeneral; j++)
505 if (delimRelevant[j]
506 && shortrefCanPreemptDelim(syntax.delimShortrefComplex(i),
507 syntax.delimGeneral(j),
509 syntax)) {
510 dtd.addNeededShortref(syntax.delimShortrefComplex(i));
511 break;
513 for (j = 0; j < dtd.nShortref(); j++)
514 if (shortrefCanPreemptDelim(syntax.delimShortrefComplex(i),
515 dtd.shortref(j),
517 syntax)) {
518 dtd.addNeededShortref(syntax.delimShortrefComplex(i));
519 break;
525 Boolean Parser::shortrefCanPreemptDelim(const StringC &sr,
526 const StringC &d,
527 Boolean dIsSr,
528 const Syntax &syntax)
530 Char letterB = sd().execToInternal('B');
531 for (size_t i = 0; i < sr.size(); i++) {
532 size_t j = 0;
533 size_t k = i;
534 for (;;) {
535 if (j == d.size())
536 return 1;
537 if (k >= sr.size())
538 break;
539 if (sr[k] == letterB) {
540 if (dIsSr && d[j] == letterB) {
541 j++;
542 k++;
544 else if (syntax.isB(d[j])) {
545 j++;
546 k++;
547 if (k == sr.size() || sr[k] != letterB) {
548 // it was the last B in the sequence
549 while (j < d.size() && syntax.isB(d[j]))
550 j++;
553 else
554 break;
556 else if (dIsSr && d[j] == letterB) {
557 if (syntax.isB(sr[k])) {
558 ++j;
559 ++k;
560 if (j < d.size() && d[j] != letterB) {
561 while (k < sr.size() && syntax.isB(sr[k]))
562 k++;
565 else
566 break;
568 else if (d[j] == sr[k]) {
569 j++;
570 k++;
572 else
573 break;
576 return 0;
579 #ifdef SP_NAMESPACE
581 #endif