1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident "%Z%%M% %I% %E% SMI"
7 #include "ParserMessages.h"
8 #include "MessageArg.h"
9 #include "TokenMessageArg.h"
11 #include "Partition.h"
16 #include "TrieBuilder.h"
20 namespace SP_NAMESPACE
{
25 modeUsedInProlog
= 02,
26 modeUsedInInstance
= 04,
34 { grpMode
, modeUsedInProlog
|modeUsedInInstance
},
35 { alitMode
, modeUsedInProlog
|modeUsedInInstance
},
36 { alitaMode
, modeUsedInProlog
|modeUsedInInstance
},
37 { aliteMode
, modeUsedInProlog
|modeUsedInInstance
},
38 { talitMode
, modeUsedInProlog
|modeUsedInInstance
},
39 { talitaMode
, modeUsedInProlog
|modeUsedInInstance
},
40 { taliteMode
, modeUsedInProlog
|modeUsedInInstance
},
41 { mdMode
, modeUsedInProlog
|modeUsedInInstance
},
42 { mdMinusMode
, modeUsedInProlog
},
43 { mdPeroMode
, modeUsedInProlog
},
44 { sdMode
, modeUsedInSd
},
45 { comMode
, modeUsedInProlog
|modeUsedInInstance
},
46 { sdcomMode
, modeUsedInSd
},
47 { piMode
, modeUsedInProlog
|modeUsedInInstance
},
48 { refMode
, modeUsedInProlog
|modeUsedInInstance
|modeUsedInSd
},
49 { imsMode
, modeUsedInProlog
|modeUsedInInstance
},
50 { cmsMode
, modeUsedInProlog
|modeUsedInInstance
},
51 { rcmsMode
, modeUsedInProlog
|modeUsedInInstance
},
52 { proMode
, modeUsedInProlog
},
53 { dsMode
, modeUsedInProlog
},
54 { dsiMode
, modeUsedInProlog
},
55 { plitMode
, modeUsedInProlog
},
56 { plitaMode
, modeUsedInProlog
},
57 { pliteMode
, modeUsedInProlog
},
58 { sdplitMode
, modeUsedInSd
},
59 { sdplitaMode
, modeUsedInSd
},
60 { grpsufMode
, modeUsedInProlog
},
61 { mlitMode
, modeUsedInProlog
|modeUsedInSd
},
62 { mlitaMode
, modeUsedInProlog
|modeUsedInSd
},
63 { asMode
, modeUsedInProlog
},
64 { slitMode
, modeUsedInProlog
},
65 { slitaMode
, modeUsedInProlog
},
66 { sdslitMode
, modeUsedInSd
},
67 { sdslitaMode
, modeUsedInSd
},
68 { cconMode
, modeUsedInInstance
},
69 { rcconMode
, modeUsedInInstance
},
70 { cconnetMode
, modeUsedInInstance
},
71 { rcconnetMode
, modeUsedInInstance
},
72 { rcconeMode
, modeUsedInInstance
},
73 { tagMode
, modeUsedInInstance
},
74 { econMode
, modeUsedInInstance
|modeUsesSr
},
75 { mconMode
, modeUsedInInstance
|modeUsesSr
},
76 { econnetMode
, modeUsedInInstance
|modeUsesSr
},
77 { mconnetMode
, modeUsedInInstance
|modeUsesSr
},
80 void Parser::compileSdModes()
84 for (size_t i
= 0; i
< SIZEOF(modeTable
); i
++)
85 if (modeTable
[i
].flags
& modeUsedInSd
)
86 modes
[n
++] = modeTable
[i
].mode
;
87 compileModes(modes
, n
, 0);
90 void Parser::compilePrologModes()
92 Boolean scopeInstance
= sd().scopeInstance();
93 Boolean haveSr
= syntax().hasShortrefs();
96 for (size_t i
= 0; i
< SIZEOF(modeTable
); i
++) {
98 if (modeTable
[i
].flags
& modeUsedInProlog
)
99 modes
[n
++] = modeTable
[i
].mode
;
102 if ((modeTable
[i
].flags
& (modeUsedInInstance
|modeUsedInProlog
))
103 && !(modeTable
[i
].flags
& modeUsesSr
))
104 modes
[n
++] = modeTable
[i
].mode
;
107 if (modeTable
[i
].flags
& (modeUsedInInstance
|modeUsedInProlog
))
108 modes
[n
++] = modeTable
[i
].mode
;
111 compileModes(modes
, n
, 0);
114 void Parser::compileInstanceModes()
116 Boolean scopeInstance
= sd().scopeInstance();
118 if (!scopeInstance
&& !syntax().hasShortrefs())
122 for (size_t i
= 0; i
< SIZEOF(modeTable
); i
++) {
124 if (modeTable
[i
].flags
& modeUsedInInstance
)
125 modes
[n
++] = modeTable
[i
].mode
;
128 if (modeTable
[i
].flags
& modeUsesSr
)
129 modes
[n
++] = modeTable
[i
].mode
;
132 compileModes(modes
, n
, &currentDtd());
135 void Parser::compileModes(const Mode
*modes
,
139 PackedBoolean sets
[Syntax::nSet
];
140 PackedBoolean delims
[Syntax::nDelimGeneral
];
141 PackedBoolean functions
[3];
143 Boolean includesShortref
= 0;
144 for (i
= 0; i
< Syntax::nSet
; i
++)
146 for (i
= 0; i
< Syntax::nDelimGeneral
; i
++)
148 for (i
= 0; i
< 3; i
++)
151 for (i
= 0; i
< n
; i
++) {
152 ModeInfo
iter(modes
[i
], sd());
154 while (iter
.nextToken(&ti
)) {
156 case TokenInfo::delimType
:
157 delims
[ti
.delim1
] = 1;
159 case TokenInfo::delimDelimType
:
160 delims
[ti
.delim1
] = 1;
161 delims
[ti
.delim2
] = 1;
163 case TokenInfo::delimSetType
:
164 delims
[ti
.delim1
] = 1;
166 case TokenInfo::setType
:
169 case TokenInfo::functionType
:
170 functions
[ti
.function
] = 1;
174 if (!includesShortref
&& iter
.includesShortref())
175 includesShortref
= 1;
180 for (i
= 0; i
< 3; i
++)
182 chars
.add(syntax().standardFunction(i
));
183 for (i
= 0; i
< Syntax::nDelimGeneral
; i
++)
185 const StringC
&str
= syntax().delimGeneral(i
);
186 for (size_t j
= 0; j
< str
.size(); j
++)
189 if (includesShortref
&& dtd
) {
190 size_t n
= dtd
->nShortref();
191 for (size_t i
= 0; i
< n
; i
++) {
192 const StringC
&delim
= dtd
->shortref(i
);
193 size_t len
= delim
.size();
194 for (size_t j
= 0; j
< len
; j
++)
195 if (delim
[j
] == sd().execToInternal('B'))
196 sets
[Syntax::blank
] = 1;
202 const ISet
<Char
> *csets
[Syntax::nSet
];
204 for (i
= 0; i
< Syntax::nSet
; i
++)
206 csets
[usedSets
++] = syntax().charSet(i
);
208 Partition
partition(chars
, csets
, usedSets
, *syntax().generalSubstTable());
210 String
<EquivCode
> setCodes
[Syntax::nSet
];
213 for (i
= 0; i
< Syntax::nSet
; i
++)
215 setCodes
[i
] = partition
.setCodes(nCodes
++);
217 String
<EquivCode
> delimCodes
[Syntax::nDelimGeneral
];
218 for (i
= 0; i
< Syntax::nDelimGeneral
; i
++)
220 StringC str
= syntax().delimGeneral(i
);
221 for (size_t j
= 0; j
< str
.size(); j
++)
222 delimCodes
[i
] += partition
.charCode(str
[j
]);
225 String
<EquivCode
> functionCode
[3];
226 for (i
= 0; i
< 3; i
++)
228 functionCode
[i
] += partition
.charCode(syntax().standardFunction(i
));
230 Vector
<SrInfo
> srInfo
;
233 if (!includesShortref
|| !dtd
)
236 nShortref
= dtd
->nShortref();
237 srInfo
.resize(nShortref
);
239 for (i
= 0; i
< nShortref
; i
++) {
240 const StringC delim
= dtd
->shortref(i
);
241 SrInfo
*p
= &srInfo
[i
];
243 for (j
= 0; j
< delim
.size(); j
++) {
244 if (delim
[j
] == sd().execToInternal('B'))
246 p
->chars
+= partition
.charCode(delim
[j
]);
248 if (j
< delim
.size()) {
249 p
->bSequenceLength
= 1;
250 for (++j
; j
< delim
.size(); j
++) {
251 if (delim
[j
] != sd().execToInternal('B'))
253 p
->bSequenceLength
+= 1;
255 for (; j
< delim
.size(); j
++)
256 p
->chars2
+= partition
.charCode(delim
[j
]);
259 p
->bSequenceLength
= 0;
263 String
<EquivCode
> dataDelimCodes
;
264 if (options().warnDataDelim
) {
265 ModeInfo
iter(mconMode
, sd());
267 while (iter
.nextToken(&ti
)) {
269 case TokenInfo::delimType
:
270 case TokenInfo::delimDelimType
:
271 case TokenInfo::delimSetType
:
273 if (ti
.token
== tokenMscMdc
)
275 const StringC
&delim
= syntax().delimGeneral(ti
.delim1
);
278 EquivCode c
= partition
.charCode(delim
[0]);
279 for (size_t i
= 0; ; i
++) {
280 if (i
>= dataDelimCodes
.size()) {
284 if (dataDelimCodes
[i
] == c
)
295 const String
<EquivCode
> emptyString
;
296 Boolean multicode
= syntax().multicode();
297 for (i
= 0; i
< n
; i
++) {
298 TrieBuilder
tb(partition
.maxCode() + 1);
299 TrieBuilder::TokenVector ambiguities
;
300 Vector
<Token
> suppressTokens
;
302 suppressTokens
.assign(partition
.maxCode() + 1, 0);
303 suppressTokens
[partition
.eECode()] = tokenEe
;
305 tb
.recognizeEE(partition
.eECode(), tokenEe
);
306 ModeInfo
iter(modes
[i
], sd());
308 // We try to handle the possibility that some delimiters may be empty;
309 // this might happen when compiling recognizers for the SGML declaration.
310 while (iter
.nextToken(&ti
)) {
312 case TokenInfo::delimType
:
313 if (delimCodes
[ti
.delim1
].size() > 0)
314 tb
.recognize(delimCodes
[ti
.delim1
], ti
.token
,
315 ti
.priority
, ambiguities
);
317 case TokenInfo::delimDelimType
:
319 String
<EquivCode
> str(delimCodes
[ti
.delim1
]);
320 if (str
.size() > 0 && delimCodes
[ti
.delim2
].size() > 0) {
321 str
+= delimCodes
[ti
.delim2
];
322 tb
.recognize(str
, ti
.token
, ti
.priority
, ambiguities
);
326 case TokenInfo::delimSetType
:
327 if (delimCodes
[ti
.delim1
].size() > 0)
328 tb
.recognize(delimCodes
[ti
.delim1
], setCodes
[ti
.set
],
329 ti
.token
, ti
.priority
, ambiguities
);
331 case TokenInfo::setType
:
332 tb
.recognize(emptyString
, setCodes
[ti
.set
], ti
.token
,
333 ti
.priority
, ambiguities
);
335 const String
<EquivCode
> &equivCodes
= setCodes
[ti
.set
];
336 for (size_t j
= 0; j
< equivCodes
.size(); j
++)
337 suppressTokens
[equivCodes
[j
]] = ti
.token
;
340 case TokenInfo::functionType
:
341 tb
.recognize(functionCode
[ti
.function
], ti
.token
,
342 ti
.priority
, ambiguities
);
344 suppressTokens
[functionCode
[ti
.function
][0]] = ti
.token
;
348 if (iter
.includesShortref()) {
349 for (int j
= 0; j
< nShortref
; j
++) {
350 const SrInfo
*p
= &srInfo
[j
];
351 if (p
->bSequenceLength
> 0)
352 tb
.recognizeB(p
->chars
, p
->bSequenceLength
,
353 syntax().quantity(Syntax::qBSEQLEN
),
354 setCodes
[Syntax::blank
],
355 p
->chars2
, tokenFirstShortref
+ j
,
358 tb
.recognize(p
->chars
, tokenFirstShortref
+ j
,
359 Priority::delim
, ambiguities
);
362 if (options().warnDataDelim
) {
365 if (!iter
.includesShortref())
374 for (size_t j
= 0; j
< dataDelimCodes
.size(); j
++) {
375 String
<EquivCode
> code
;
376 code
+= dataDelimCodes
[j
];
377 tb
.recognize(code
, tokenCharDelim
, Priority::dataDelim
, ambiguities
);
384 String
<EquivCode
> code
;
385 code
+= partition
.charCode(syntax().delimGeneral(Syntax::dPERO
)[0]);
386 tb
.recognize(code
, tokenCharDelim
, Priority::dataDelim
, ambiguities
);
391 setRecognizer(modes
[i
],
393 ? new Recognizer(tb
.extractTrie(), partition
.map(),
395 : new Recognizer(tb
.extractTrie(), partition
.map())));
396 // FIXME give more information
397 for (size_t j
= 0; j
< ambiguities
.size(); j
+= 2)
398 message(ParserMessages::lexicalAmbiguity
,
399 TokenMessageArg(ambiguities
[j
], modes
[i
], syntaxPointer(),
401 TokenMessageArg(ambiguities
[j
+ 1], modes
[i
], syntaxPointer(),
406 void Parser::compileNormalMap()
408 XcharMap
<PackedBoolean
> map(0);
409 ISetIter
<Char
> sgmlCharIter(*syntax().charSet(Syntax::sgmlChar
));
411 while (sgmlCharIter
.next(min
, max
))
412 map
.setRange(min
, max
, 1);
413 ModeInfo
iter(mconnetMode
, sd());
415 while (iter
.nextToken(&ti
)) {
417 case TokenInfo::delimType
:
418 case TokenInfo::delimDelimType
:
419 case TokenInfo::delimSetType
:
421 const StringC
&delim
= syntax().delimGeneral(ti
.delim1
);
426 StringC
str(syntax().generalSubstTable()->inverse(c
));
427 for (size_t i
= 0; i
< str
.size(); i
++)
428 map
.setChar(str
[i
], 0);
431 case TokenInfo::setType
:
432 if (ti
.token
!= tokenChar
) {
433 ISetIter
<Char
> setIter(*syntax().charSet(ti
.set
));
435 while (setIter
.next(min
, max
))
436 map
.setRange(min
, max
, 0);
439 case TokenInfo::functionType
:
440 if (ti
.token
!= tokenChar
)
441 map
.setChar(syntax().standardFunction(ti
.function
), 0);
445 int nShortref
= currentDtd().nShortref();
446 for (int i
= 0; i
< nShortref
; i
++) {
447 Char c
= currentDtd().shortref(i
)[0];
448 if (c
== sd().execToInternal('B')) {
449 ISetIter
<Char
> setIter(*syntax().charSet(Syntax::blank
));
451 while (setIter
.next(min
, max
))
452 map
.setRange(min
, max
, 0);
456 StringC
str(syntax().generalSubstTable()->inverse(c
));
457 for (size_t j
= 0; j
< str
.size(); j
++)
458 map
.setChar(str
[j
], 0);
464 void Parser::addNeededShortrefs(Dtd
&dtd
, const Syntax
&syntax
)
466 if (!syntax
.hasShortrefs())
468 PackedBoolean delimRelevant
[Syntax::nDelimGeneral
];
470 for (i
= 0; i
< Syntax::nDelimGeneral
; i
++)
471 delimRelevant
[i
] = 0;
472 ModeInfo
iter(mconnetMode
, sd());
474 while (iter
.nextToken(&ti
)) {
476 case TokenInfo::delimType
:
477 case TokenInfo::delimDelimType
:
478 case TokenInfo::delimSetType
:
479 delimRelevant
[ti
.delim1
] = 1;
486 // PIO and NET are the only delimiters that are recognized in con
487 // mode without context. If a short reference delimiter is
488 // identical to one of these delimiters, then we'll have an
489 // ambiguity. We make such a short reference delimiter needed
490 // to ensure that this ambiguity is reported.
491 if (syntax
.isValidShortref(syntax
.delimGeneral(Syntax::dPIO
)))
492 dtd
.addNeededShortref(syntax
.delimGeneral(Syntax::dPIO
));
493 if (syntax
.isValidShortref(syntax
.delimGeneral(Syntax::dNET
)))
494 dtd
.addNeededShortref(syntax
.delimGeneral(Syntax::dNET
));
496 size_t nShortrefComplex
= syntax
.nDelimShortrefComplex();
498 // A short reference delimiter is needed if it is used or if it can
499 // contain some other shorter delimiter that is either a relevant general
500 // delimiter or a shortref delimiter that is used.
502 for (i
= 0; i
< nShortrefComplex
; i
++) {
504 for (j
= 0; j
< Syntax::nDelimGeneral
; j
++)
506 && shortrefCanPreemptDelim(syntax
.delimShortrefComplex(i
),
507 syntax
.delimGeneral(j
),
510 dtd
.addNeededShortref(syntax
.delimShortrefComplex(i
));
513 for (j
= 0; j
< dtd
.nShortref(); j
++)
514 if (shortrefCanPreemptDelim(syntax
.delimShortrefComplex(i
),
518 dtd
.addNeededShortref(syntax
.delimShortrefComplex(i
));
525 Boolean
Parser::shortrefCanPreemptDelim(const StringC
&sr
,
528 const Syntax
&syntax
)
530 Char letterB
= sd().execToInternal('B');
531 for (size_t i
= 0; i
< sr
.size(); i
++) {
539 if (sr
[k
] == letterB
) {
540 if (dIsSr
&& d
[j
] == letterB
) {
544 else if (syntax
.isB(d
[j
])) {
547 if (k
== sr
.size() || sr
[k
] != letterB
) {
548 // it was the last B in the sequence
549 while (j
< d
.size() && syntax
.isB(d
[j
]))
556 else if (dIsSr
&& d
[j
] == letterB
) {
557 if (syntax
.isB(sr
[k
])) {
560 if (j
< d
.size() && d
[j
] != letterB
) {
561 while (k
< sr
.size() && syntax
.isB(sr
[k
]))
568 else if (d
[j
] == sr
[k
]) {