3 // namespace: System.Text.RegularExpressions
6 // author: Dan Lewis (dlewis@gmx.co.uk)
10 // Permission is hereby granted, free of charge, to any person obtaining
11 // a copy of this software and associated documentation files (the
12 // "Software"), to deal in the Software without restriction, including
13 // without limitation the rights to use, copy, modify, merge, publish,
14 // distribute, sublicense, and/or sell copies of the Software, and to
15 // permit persons to whom the Software is furnished to do so, subject to
16 // the following conditions:
18 // The above copyright notice and this permission notice shall be
19 // included in all copies or substantial portions of the Software.
21 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System
.Globalization
;
33 namespace System
.Text
.RegularExpressions
{
35 enum Category
: ushort {
40 Any
, // any character except newline .
41 AnySingleline
, // any character . (s option)
42 Word
, // any word character \w
43 Digit
, // any digit character \d
44 WhiteSpace
, // any whitespace character \s
51 EcmaWord
, // [a-zA-Z_0-9]
53 EcmaWhiteSpace
, // [ \f\n\r\t\v]
60 UnicodeZ
, // Separator
61 UnicodeP
, // Punctuation
65 UnicodeLu
, // UppercaseLetter
66 UnicodeLl
, // LowercaseLetter
67 UnicodeLt
, // TitlecaseLetter
68 UnicodeLm
, // ModifierLetter
69 UnicodeLo
, // OtherLetter
70 UnicodeMn
, // NonspacingMark
71 UnicodeMe
, // EnclosingMark
72 UnicodeMc
, // SpacingMark
73 UnicodeNd
, // DecimalNumber
74 UnicodeNl
, // LetterNumber
75 UnicodeNo
, // OtherNumber
76 UnicodeZs
, // SpaceSeparator
77 UnicodeZl
, // LineSeparator
78 UnicodeZp
, // ParagraphSeparator
79 UnicodePd
, // DashPunctuation
80 UnicodePs
, // OpenPunctuation
81 UnicodePi
, // InitialPunctuation
82 UnicodePe
, // ClosePunctuation
83 UnicodePf
, // FinalPunctuation
84 UnicodePc
, // ConnectorPunctuation
85 UnicodePo
, // OtherPunctuation
86 UnicodeSm
, // MathSymbol
87 UnicodeSc
, // CurrencySymbol
88 UnicodeSk
, // ModifierSymbol
89 UnicodeSo
, // OtherSymbol
92 UnicodeCo
, // PrivateUse
93 UnicodeCs
, // Surrogate
94 UnicodeCn
, // Unassigned
96 // unicode block ranges
98 // note: the categories marked with a star are valid Unicode block ranges,
99 // but do not seem to be accepted by the MS parser using the \p{...} format.
103 UnicodeLatin1Supplement
, // *
104 UnicodeLatinExtendedA
, // *
105 UnicodeLatinExtendedB
, // *
106 UnicodeIPAExtensions
,
107 UnicodeSpacingModifierLetters
,
108 UnicodeCombiningDiacriticalMarks
,
134 UnicodeUnifiedCanadianAboriginalSyllabics
,
139 UnicodeLatinExtendedAdditional
,
140 UnicodeGreekExtended
,
141 UnicodeGeneralPunctuation
,
142 UnicodeSuperscriptsandSubscripts
,
143 UnicodeCurrencySymbols
,
144 UnicodeCombiningMarksforSymbols
,
145 UnicodeLetterlikeSymbols
,
148 UnicodeMathematicalOperators
,
149 UnicodeMiscellaneousTechnical
,
150 UnicodeControlPictures
,
151 UnicodeOpticalCharacterRecognition
,
152 UnicodeEnclosedAlphanumerics
,
154 UnicodeBlockElements
,
155 UnicodeGeometricShapes
,
156 UnicodeMiscellaneousSymbols
,
158 UnicodeBraillePatterns
,
159 UnicodeCJKRadicalsSupplement
,
160 UnicodeKangxiRadicals
,
161 UnicodeIdeographicDescriptionCharacters
,
162 UnicodeCJKSymbolsandPunctuation
,
166 UnicodeHangulCompatibilityJamo
,
168 UnicodeBopomofoExtended
,
169 UnicodeEnclosedCJKLettersandMonths
,
170 UnicodeCJKCompatibility
,
171 UnicodeCJKUnifiedIdeographsExtensionA
,
172 UnicodeCJKUnifiedIdeographs
,
175 UnicodeHangulSyllables
,
176 UnicodeHighSurrogates
,
177 UnicodeHighPrivateUseSurrogates
,
178 UnicodeLowSurrogates
,
180 UnicodeCJKCompatibilityIdeographs
,
181 UnicodeAlphabeticPresentationForms
,
182 UnicodeArabicPresentationFormsA
, // *
183 UnicodeCombiningHalfMarks
,
184 UnicodeCJKCompatibilityForms
,
185 UnicodeSmallFormVariants
,
186 UnicodeArabicPresentationFormsB
, // *
188 UnicodeHalfwidthandFullwidthForms
,
193 UnicodeByzantineMusicalSymbols
,
194 UnicodeMusicalSymbols
,
195 UnicodeMathematicalAlphanumericSymbols
,
196 UnicodeCJKUnifiedIdeographsExtensionB
,
197 UnicodeCJKCompatibilityIdeographsSupplement
,
200 LastValue
// Keep this as the last member so that it has the highest value in the enumeration
203 class CategoryUtils
{
204 public static Category
CategoryFromName (string name
) {
206 if (name
.StartsWith ("Is")) // remove prefix from block range
207 name
= name
.Substring (2);
209 return (Category
)Enum
.Parse (typeof (Category
), "Unicode" + name
);
211 catch (ArgumentException
) {
212 return Category
.None
;
216 public static bool IsCategory (Category cat
, char c
) {
224 case Category
.AnySingleline
:
229 Char
.IsLetterOrDigit (c
) ||
230 IsCategory (UnicodeCategory
.ConnectorPunctuation
, c
);
233 return Char
.IsDigit (c
);
235 case Category
.WhiteSpace
:
236 return Char
.IsWhiteSpace (c
);
240 case Category
.EcmaAny
:
243 case Category
.EcmaAnySingleline
:
246 case Category
.EcmaWord
:
248 'a' <= c
&& c
<= 'z' ||
249 'A' <= c
&& c
<= 'Z' ||
250 '0' <= c
&& c
<= '9' ||
253 case Category
.EcmaDigit
:
257 case Category
.EcmaWhiteSpace
:
266 // Unicode categories...
270 case Category
.UnicodeLu
: return IsCategory (UnicodeCategory
.UppercaseLetter
, c
);
271 case Category
.UnicodeLl
: return IsCategory (UnicodeCategory
.LowercaseLetter
, c
);
272 case Category
.UnicodeLt
: return IsCategory (UnicodeCategory
.TitlecaseLetter
, c
);
273 case Category
.UnicodeLm
: return IsCategory (UnicodeCategory
.ModifierLetter
, c
);
274 case Category
.UnicodeLo
: return IsCategory (UnicodeCategory
.OtherLetter
, c
);
278 case Category
.UnicodeMn
: return IsCategory (UnicodeCategory
.NonSpacingMark
, c
);
279 case Category
.UnicodeMe
: return IsCategory (UnicodeCategory
.EnclosingMark
, c
);
280 case Category
.UnicodeMc
: return IsCategory (UnicodeCategory
.SpacingCombiningMark
, c
);
281 case Category
.UnicodeNd
: return IsCategory (UnicodeCategory
.DecimalDigitNumber
, c
);
285 case Category
.UnicodeNl
: return IsCategory (UnicodeCategory
.LetterNumber
, c
);
286 case Category
.UnicodeNo
: return IsCategory (UnicodeCategory
.OtherNumber
, c
);
290 case Category
.UnicodeZs
: return IsCategory (UnicodeCategory
.SpaceSeparator
, c
);
291 case Category
.UnicodeZl
: return IsCategory (UnicodeCategory
.LineSeparator
, c
);
292 case Category
.UnicodeZp
: return IsCategory (UnicodeCategory
.ParagraphSeparator
, c
);
296 case Category
.UnicodePd
: return IsCategory (UnicodeCategory
.DashPunctuation
, c
);
297 case Category
.UnicodePs
: return IsCategory (UnicodeCategory
.OpenPunctuation
, c
);
298 case Category
.UnicodePi
: return IsCategory (UnicodeCategory
.InitialQuotePunctuation
, c
);
299 case Category
.UnicodePe
: return IsCategory (UnicodeCategory
.ClosePunctuation
, c
);
300 case Category
.UnicodePf
: return IsCategory (UnicodeCategory
.FinalQuotePunctuation
, c
);
301 case Category
.UnicodePc
: return IsCategory (UnicodeCategory
.ConnectorPunctuation
, c
);
302 case Category
.UnicodePo
: return IsCategory (UnicodeCategory
.OtherPunctuation
, c
);
306 case Category
.UnicodeSm
: return IsCategory (UnicodeCategory
.MathSymbol
, c
);
307 case Category
.UnicodeSc
: return IsCategory (UnicodeCategory
.CurrencySymbol
, c
);
308 case Category
.UnicodeSk
: return IsCategory (UnicodeCategory
.ModifierSymbol
, c
);
309 case Category
.UnicodeSo
: return IsCategory (UnicodeCategory
.OtherSymbol
, c
);
313 case Category
.UnicodeCc
: return IsCategory (UnicodeCategory
.Control
, c
);
314 case Category
.UnicodeCf
: return IsCategory (UnicodeCategory
.Format
, c
);
315 case Category
.UnicodeCo
: return IsCategory (UnicodeCategory
.PrivateUse
, c
);
316 case Category
.UnicodeCs
: return IsCategory (UnicodeCategory
.Surrogate
, c
);
317 case Category
.UnicodeCn
: return IsCategory (UnicodeCategory
.OtherNotAssigned
, c
);
319 case Category
.UnicodeL
: // letter
321 IsCategory (UnicodeCategory
.UppercaseLetter
, c
) ||
322 IsCategory (UnicodeCategory
.LowercaseLetter
, c
) ||
323 IsCategory (UnicodeCategory
.TitlecaseLetter
, c
) ||
324 IsCategory (UnicodeCategory
.ModifierLetter
, c
) ||
325 IsCategory (UnicodeCategory
.OtherLetter
, c
);
327 case Category
.UnicodeM
: // mark
329 IsCategory (UnicodeCategory
.NonSpacingMark
, c
) ||
330 IsCategory (UnicodeCategory
.EnclosingMark
, c
) ||
331 IsCategory (UnicodeCategory
.SpacingCombiningMark
, c
);
333 case Category
.UnicodeN
: // number
335 IsCategory (UnicodeCategory
.DecimalDigitNumber
, c
) ||
336 IsCategory (UnicodeCategory
.LetterNumber
, c
) ||
337 IsCategory (UnicodeCategory
.OtherNumber
, c
);
339 case Category
.UnicodeZ
: // separator
341 IsCategory (UnicodeCategory
.SpaceSeparator
, c
) ||
342 IsCategory (UnicodeCategory
.LineSeparator
, c
) ||
343 IsCategory (UnicodeCategory
.ParagraphSeparator
, c
);
345 case Category
.UnicodeP
: // punctuation
347 IsCategory (UnicodeCategory
.DashPunctuation
, c
) ||
348 IsCategory (UnicodeCategory
.OpenPunctuation
, c
) ||
349 IsCategory (UnicodeCategory
.InitialQuotePunctuation
, c
) ||
350 IsCategory (UnicodeCategory
.ClosePunctuation
, c
) ||
351 IsCategory (UnicodeCategory
.FinalQuotePunctuation
, c
) ||
352 IsCategory (UnicodeCategory
.ConnectorPunctuation
, c
) ||
353 IsCategory (UnicodeCategory
.OtherPunctuation
, c
);
355 case Category
.UnicodeS
: // symbol
357 IsCategory (UnicodeCategory
.MathSymbol
, c
) ||
358 IsCategory (UnicodeCategory
.CurrencySymbol
, c
) ||
359 IsCategory (UnicodeCategory
.ModifierSymbol
, c
) ||
360 IsCategory (UnicodeCategory
.OtherSymbol
, c
);
362 case Category
.UnicodeC
: // other
364 IsCategory (UnicodeCategory
.Control
, c
) ||
365 IsCategory (UnicodeCategory
.Format
, c
) ||
366 IsCategory (UnicodeCategory
.PrivateUse
, c
) ||
367 IsCategory (UnicodeCategory
.Surrogate
, c
) ||
368 IsCategory (UnicodeCategory
.OtherNotAssigned
, c
);
370 // Unicode block ranges...
372 case Category
.UnicodeBasicLatin
:
373 return '\u0000' <= c
&& c
<= '\u007F';
375 case Category
.UnicodeLatin1Supplement
:
376 return '\u0080' <= c
&& c
<= '\u00FF';
378 case Category
.UnicodeLatinExtendedA
:
379 return '\u0100' <= c
&& c
<= '\u017F';
381 case Category
.UnicodeLatinExtendedB
:
382 return '\u0180' <= c
&& c
<= '\u024F';
384 case Category
.UnicodeIPAExtensions
:
385 return '\u0250' <= c
&& c
<= '\u02AF';
387 case Category
.UnicodeSpacingModifierLetters
:
388 return '\u02B0' <= c
&& c
<= '\u02FF';
390 case Category
.UnicodeCombiningDiacriticalMarks
:
391 return '\u0300' <= c
&& c
<= '\u036F';
393 case Category
.UnicodeGreek
:
394 return '\u0370' <= c
&& c
<= '\u03FF';
396 case Category
.UnicodeCyrillic
:
397 return '\u0400' <= c
&& c
<= '\u04FF';
399 case Category
.UnicodeArmenian
:
400 return '\u0530' <= c
&& c
<= '\u058F';
402 case Category
.UnicodeHebrew
:
403 return '\u0590' <= c
&& c
<= '\u05FF';
405 case Category
.UnicodeArabic
:
406 return '\u0600' <= c
&& c
<= '\u06FF';
408 case Category
.UnicodeSyriac
:
409 return '\u0700' <= c
&& c
<= '\u074F';
411 case Category
.UnicodeThaana
:
412 return '\u0780' <= c
&& c
<= '\u07BF';
414 case Category
.UnicodeDevanagari
:
415 return '\u0900' <= c
&& c
<= '\u097F';
417 case Category
.UnicodeBengali
:
418 return '\u0980' <= c
&& c
<= '\u09FF';
420 case Category
.UnicodeGurmukhi
:
421 return '\u0A00' <= c
&& c
<= '\u0A7F';
423 case Category
.UnicodeGujarati
:
424 return '\u0A80' <= c
&& c
<= '\u0AFF';
426 case Category
.UnicodeOriya
:
427 return '\u0B00' <= c
&& c
<= '\u0B7F';
429 case Category
.UnicodeTamil
:
430 return '\u0B80' <= c
&& c
<= '\u0BFF';
432 case Category
.UnicodeTelugu
:
433 return '\u0C00' <= c
&& c
<= '\u0C7F';
435 case Category
.UnicodeKannada
:
436 return '\u0C80' <= c
&& c
<= '\u0CFF';
438 case Category
.UnicodeMalayalam
:
439 return '\u0D00' <= c
&& c
<= '\u0D7F';
441 case Category
.UnicodeSinhala
:
442 return '\u0D80' <= c
&& c
<= '\u0DFF';
444 case Category
.UnicodeThai
:
445 return '\u0E00' <= c
&& c
<= '\u0E7F';
447 case Category
.UnicodeLao
:
448 return '\u0E80' <= c
&& c
<= '\u0EFF';
450 case Category
.UnicodeTibetan
:
451 return '\u0F00' <= c
&& c
<= '\u0FFF';
453 case Category
.UnicodeMyanmar
:
454 return '\u1000' <= c
&& c
<= '\u109F';
456 case Category
.UnicodeGeorgian
:
457 return '\u10A0' <= c
&& c
<= '\u10FF';
459 case Category
.UnicodeHangulJamo
:
460 return '\u1100' <= c
&& c
<= '\u11FF';
462 case Category
.UnicodeEthiopic
:
463 return '\u1200' <= c
&& c
<= '\u137F';
465 case Category
.UnicodeCherokee
:
466 return '\u13A0' <= c
&& c
<= '\u13FF';
468 case Category
.UnicodeUnifiedCanadianAboriginalSyllabics
:
469 return '\u1400' <= c
&& c
<= '\u167F';
471 case Category
.UnicodeOgham
:
472 return '\u1680' <= c
&& c
<= '\u169F';
474 case Category
.UnicodeRunic
:
475 return '\u16A0' <= c
&& c
<= '\u16FF';
477 case Category
.UnicodeKhmer
:
478 return '\u1780' <= c
&& c
<= '\u17FF';
480 case Category
.UnicodeMongolian
:
481 return '\u1800' <= c
&& c
<= '\u18AF';
483 case Category
.UnicodeLatinExtendedAdditional
:
484 return '\u1E00' <= c
&& c
<= '\u1EFF';
486 case Category
.UnicodeGreekExtended
:
487 return '\u1F00' <= c
&& c
<= '\u1FFF';
489 case Category
.UnicodeGeneralPunctuation
:
490 return '\u2000' <= c
&& c
<= '\u206F';
492 case Category
.UnicodeSuperscriptsandSubscripts
:
493 return '\u2070' <= c
&& c
<= '\u209F';
495 case Category
.UnicodeCurrencySymbols
:
496 return '\u20A0' <= c
&& c
<= '\u20CF';
498 case Category
.UnicodeCombiningMarksforSymbols
:
499 return '\u20D0' <= c
&& c
<= '\u20FF';
501 case Category
.UnicodeLetterlikeSymbols
:
502 return '\u2100' <= c
&& c
<= '\u214F';
504 case Category
.UnicodeNumberForms
:
505 return '\u2150' <= c
&& c
<= '\u218F';
507 case Category
.UnicodeArrows
:
508 return '\u2190' <= c
&& c
<= '\u21FF';
510 case Category
.UnicodeMathematicalOperators
:
511 return '\u2200' <= c
&& c
<= '\u22FF';
513 case Category
.UnicodeMiscellaneousTechnical
:
514 return '\u2300' <= c
&& c
<= '\u23FF';
516 case Category
.UnicodeControlPictures
:
517 return '\u2400' <= c
&& c
<= '\u243F';
519 case Category
.UnicodeOpticalCharacterRecognition
:
520 return '\u2440' <= c
&& c
<= '\u245F';
522 case Category
.UnicodeEnclosedAlphanumerics
:
523 return '\u2460' <= c
&& c
<= '\u24FF';
525 case Category
.UnicodeBoxDrawing
:
526 return '\u2500' <= c
&& c
<= '\u257F';
528 case Category
.UnicodeBlockElements
:
529 return '\u2580' <= c
&& c
<= '\u259F';
531 case Category
.UnicodeGeometricShapes
:
532 return '\u25A0' <= c
&& c
<= '\u25FF';
534 case Category
.UnicodeMiscellaneousSymbols
:
535 return '\u2600' <= c
&& c
<= '\u26FF';
537 case Category
.UnicodeDingbats
:
538 return '\u2700' <= c
&& c
<= '\u27BF';
540 case Category
.UnicodeBraillePatterns
:
541 return '\u2800' <= c
&& c
<= '\u28FF';
543 case Category
.UnicodeCJKRadicalsSupplement
:
544 return '\u2E80' <= c
&& c
<= '\u2EFF';
546 case Category
.UnicodeKangxiRadicals
:
547 return '\u2F00' <= c
&& c
<= '\u2FDF';
549 case Category
.UnicodeIdeographicDescriptionCharacters
:
550 return '\u2FF0' <= c
&& c
<= '\u2FFF';
552 case Category
.UnicodeCJKSymbolsandPunctuation
:
553 return '\u3000' <= c
&& c
<= '\u303F';
555 case Category
.UnicodeHiragana
:
556 return '\u3040' <= c
&& c
<= '\u309F';
558 case Category
.UnicodeKatakana
:
559 return '\u30A0' <= c
&& c
<= '\u30FF';
561 case Category
.UnicodeBopomofo
:
562 return '\u3100' <= c
&& c
<= '\u312F';
564 case Category
.UnicodeHangulCompatibilityJamo
:
565 return '\u3130' <= c
&& c
<= '\u318F';
567 case Category
.UnicodeKanbun
:
568 return '\u3190' <= c
&& c
<= '\u319F';
570 case Category
.UnicodeBopomofoExtended
:
571 return '\u31A0' <= c
&& c
<= '\u31BF';
573 case Category
.UnicodeEnclosedCJKLettersandMonths
:
574 return '\u3200' <= c
&& c
<= '\u32FF';
576 case Category
.UnicodeCJKCompatibility
:
577 return '\u3300' <= c
&& c
<= '\u33FF';
579 case Category
.UnicodeCJKUnifiedIdeographsExtensionA
:
580 return '\u3400' <= c
&& c
<= '\u4DB5';
582 case Category
.UnicodeCJKUnifiedIdeographs
:
583 return '\u4E00' <= c
&& c
<= '\u9FFF';
585 case Category
.UnicodeYiSyllables
:
586 return '\uA000' <= c
&& c
<= '\uA48F';
588 case Category
.UnicodeYiRadicals
:
589 return '\uA490' <= c
&& c
<= '\uA4CF';
591 case Category
.UnicodeHangulSyllables
:
592 return '\uAC00' <= c
&& c
<= '\uD7A3';
594 case Category
.UnicodeHighSurrogates
:
595 return '\uD800' <= c
&& c
<= '\uDB7F';
597 case Category
.UnicodeHighPrivateUseSurrogates
:
598 return '\uDB80' <= c
&& c
<= '\uDBFF';
600 case Category
.UnicodeLowSurrogates
:
601 return '\uDC00' <= c
&& c
<= '\uDFFF';
603 case Category
.UnicodePrivateUse
:
604 return '\uE000' <= c
&& c
<= '\uF8FF';
606 case Category
.UnicodeCJKCompatibilityIdeographs
:
607 return '\uF900' <= c
&& c
<= '\uFAFF';
609 case Category
.UnicodeAlphabeticPresentationForms
:
610 return '\uFB00' <= c
&& c
<= '\uFB4F';
612 case Category
.UnicodeArabicPresentationFormsA
:
613 return '\uFB50' <= c
&& c
<= '\uFDFF';
615 case Category
.UnicodeCombiningHalfMarks
:
616 return '\uFE20' <= c
&& c
<= '\uFE2F';
618 case Category
.UnicodeCJKCompatibilityForms
:
619 return '\uFE30' <= c
&& c
<= '\uFE4F';
621 case Category
.UnicodeSmallFormVariants
:
622 return '\uFE50' <= c
&& c
<= '\uFE6F';
624 case Category
.UnicodeArabicPresentationFormsB
:
625 return '\uFE70' <= c
&& c
<= '\uFEFE';
627 case Category
.UnicodeHalfwidthandFullwidthForms
:
628 return '\uFF00' <= c
&& c
<= '\uFFEF';
630 case Category
.UnicodeSpecials
:
632 '\uFEFF' <= c
&& c
<= '\uFEFF' ||
633 '\uFFF0' <= c
&& c
<= '\uFFFD';
635 // these block ranges begin above 0x10000
637 case Category
.UnicodeOldItalic
:
638 case Category
.UnicodeGothic
:
639 case Category
.UnicodeDeseret
:
640 case Category
.UnicodeByzantineMusicalSymbols
:
641 case Category
.UnicodeMusicalSymbols
:
642 case Category
.UnicodeMathematicalAlphanumericSymbols
:
643 case Category
.UnicodeCJKUnifiedIdeographsExtensionB
:
644 case Category
.UnicodeCJKCompatibilityIdeographsSupplement
:
645 case Category
.UnicodeTags
:
653 private static bool IsCategory (UnicodeCategory uc
, char c
) {
654 if (Char
.GetUnicodeCategory (c
) == uc
)