/*
 * Copyright 2000-2009 JetBrains s.r.o.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 package com
.intellij
.spellchecker
.inspections
;
18 import com
.intellij
.openapi
.util
.TextRange
;
19 import com
.intellij
.openapi
.util
.text
.StringUtil
;
20 import com
.intellij
.psi
.codeStyle
.NameUtil
;
21 import com
.intellij
.spellchecker
.util
.Strings
;
22 import org
.jetbrains
.annotations
.NonNls
;
23 import org
.jetbrains
.annotations
.NotNull
;
24 import org
.jetbrains
.annotations
.Nullable
;
26 import java
.util
.ArrayList
;
27 import java
.util
.List
;
28 import java
.util
.regex
.Matcher
;
29 import java
.util
.regex
.Pattern
;
/**
 * @author shkate@jetbrains.com
 */
34 public class TextSplitter
{
// Regular expressions used by the splitting routines of this class.
// NOTE(review): every line of this listing carries a stray leading number and the
// tokens are split across lines; both look like extraction residue - confirm
// against the original file before building on it.

// One or more consecutive non-whitespace characters.
// Not referenced by any code visible in this listing.
37 private static final Pattern NON_SPACE
= Pattern
.compile("\\S+");
// As written, matches only the literal text <0> and captures the 0.
// NOTE(review): this pattern looks damaged by extraction and is not referenced by
// the visible code - verify the intended expression.
40 private static final Pattern HTML
= Pattern
.compile("<(0)>");
// A word boundary, zero or more Unicode letters, an optional apostrophe, then zero
// or more letters. Every part after the boundary is optional, so a match may be empty.
// Used by splitSimpleWord.
44 private static final Pattern WORD
= Pattern
.compile("\\b\\p{L}*'?\\p{L}*");
// Like WORD, but at least one letter is required after the optional apostrophe, and
// it may be followed by further letters that are each preceded by any number of
// underscores. Used by splitWord.
46 private static final Pattern EXTENDED_WORD
= Pattern
.compile("\\b\\p{L}*'?\\p{L}(_*\\p{L})*");
// Delimiter expression handed to String.split in splitText: a run of whitespace,
// or a span that starts with < and ends at the next >.
48 private static final String WORD_SPLITTER
= "\\s+|<[^>]+>";
// One of the schemes http, https, ftp or mailto followed by a colon and two slashes.
// Used with find() in splitNonSpace, so it may match anywhere in the fragment.
52 private static final Pattern URL
= Pattern
.compile("(https?|ftp|mailto)\\:\\/\\/");
// Either a dot followed by one or more non-dot characters, or one or more at-signs.
// Used with find() in splitNonSpace.
54 private static final Pattern COMPLEX
= Pattern
.compile("(\\.[^\\.]+)|([@]+)");
// Anchored at the start of the input: an ampersand, exactly four alphanumeric
// characters and a semicolon. Used by splitWord.
// NOTE(review): presumably aimed at character entities - confirm.
57 private static final Pattern SPECIAL
= Pattern
.compile("^&\\p{Alnum}{4};");
// Private no-argument constructor; every method visible in this listing is static.
// NOTE(review): the constructor body and its closing brace are not present in this listing.
60 private TextSplitter() {
/**
 * Splits the given text into pieces and collects the CheckArea items that
 * splitNonSpace produces for each piece.
 *
 * NOTE(review): this listing has lost lines (no closing braces, empty guard body),
 * so the comments below describe only the statements that are visible.
 *
 * @param text text to split; may be null
 * @return the collected areas, or null when nothing was collected
 */
65 public static List
<CheckArea
> splitText(@Nullable String text
) {
// Guard for null or empty input.
// NOTE(review): the body of this guard is missing from the listing - presumably an
// early return; confirm.
66 if (text
== null || StringUtil
.isEmpty(text
)) {
// Take the greater of the first index of <!-- and the first index of <%--
// (indexOf yields -1 when a marker is absent).
70 int i
= Math
.max(text
.indexOf("<!--"), text
.indexOf("<%--"));
// Start just past that marker (both markers are 4 characters long), or at 0 when
// neither marker occurs.
71 i
= (i
> -1) ? i
+ 4 : 0;
72 List
<CheckArea
> results
= new ArrayList
<CheckArea
>();
// Cut the remainder of the text on whitespace runs and on spans enclosed in < and >.
73 String
[] pieces
= text
.substring(i
).split(WORD_SPLITTER
);
74 for (String s
: pieces
) {
// Find where this piece sits in the full text, searching from offset i.
76 int p1
= text
.indexOf(s
, i
);
77 TextRange range
= TextRange
.from(p1
, s
.length());
// splitNonSpace may yield null; only non-null results are merged.
78 List
<CheckArea
> areaList
= splitNonSpace(text
, range
);
79 if (areaList
!= null) {
80 results
.addAll(areaList
);
// Advance the search offset by the width of the range just handled.
// NOTE(review): i is not moved to the end of the located piece (p1 plus its length),
// so it can lag behind skipped delimiters and a later indexOf may land on an
// earlier occurrence of the same characters - verify.
82 i
+= (range
.getEndOffset() - range
.getStartOffset());
// An empty result list is reported as null rather than as an empty list.
85 return (results
.size() == 0) ?
null : results
;
/**
 * Examines the fragment of text covered by range and, in the visible code path,
 * delegates to splitWord.
 *
 * @param text  full text being processed
 * @param range range of the fragment inside text
 * @return the result of splitWord for the fragment; the result for fragments that
 *         hit the URL or COMPLEX check is not visible in this listing
 */
89 private static List
<CheckArea
> splitNonSpace(String text
, TextRange range
) {
90 String nonSpaceArea
= text
.substring(range
.getStartOffset(), range
.getEndOffset());
// Fragments containing a URL scheme prefix, a dot followed by other characters, or an
// at-sign are singled out here.
// NOTE(review): the body of this branch is missing from the listing - presumably such
// fragments are skipped; confirm.
91 if (URL
.matcher(nonSpaceArea
).find() || COMPLEX
.matcher(nonSpaceArea
).find()) {
94 return splitWord(text
, range
);
/**
 * Breaks the word covered by range into parts via NameUtil.splitNameIntoWords and
 * records a CheckArea for the parts that match WORD.
 *
 * NOTE(review): several branch bodies and exits are missing from this listing, so the
 * control flow between the visible statements cannot be fully determined here.
 *
 * @param text  full text being processed
 * @param range range of the word inside text
 * @return the list of areas built by this method (based on the results variable;
 *         the return statements themselves are not visible)
 */
99 private static List
<CheckArea
> splitSimpleWord(String text
, TextRange range
) {
100 List
<CheckArea
> results
= new ArrayList
<CheckArea
>();
// Guard for missing input or an empty range.
// NOTE(review): guard body not visible in this listing.
101 if (text
==null || range
==null || range
.getLength()<1){
104 String word
= text
.substring(range
.getStartOffset(), range
.getEndOffset());
// NOTE(review): splitNameIntoWords is a project helper not shown here; presumably it
// splits an identifier-like name into its component words - confirm.
105 String
[] words
= NameUtil
.splitNameIntoWords(word
);
// Guard for the case where nothing came back from the helper.
// NOTE(review): guard body not visible in this listing.
106 if (words
== null || words
.length
==0) {
// Single part: look for a WORD match in it and record that match with flag false.
110 if (words
.length
== 1) {
111 Matcher matcher
= WORD
.matcher(words
[0]);
112 if (matcher
.find()) {
// Matcher offsets are relative to the part; matcherRange shifts them by the start of range.
113 TextRange found
= matcherRange(range
, matcher
);
114 addWord(text
, results
, false, found
);
119 boolean isCapitalized
= Strings
.isCapitalized(words
[0]);
120 boolean containsShortWord
= containsShortWord(words
);
// A capitalized first part combined with at least one part shorter than 2 characters:
// the whole range is added as a single area whose boolean argument is true.
// NOTE(review): the meaning of that boolean is defined by CheckArea, not shown here.
122 if (isCapitalized
&& containsShortWord
) {
123 results
.add(new CheckArea(text
, range
, true));
127 boolean isAllWordsAreUpperCased
= isAllWordsAreUpperCased(words
);
129 for (String s
: words
) {
// NOTE(review): index is neither declared nor updated in any visible line - its
// declaration and update were presumably lost in extraction; confirm.
130 int start
= word
.indexOf(s
, index
);
131 int end
= start
+ s
.length();
132 boolean isUpperCase
= Strings
.isUpperCase(s
);
// The flag is set for an upper-case part when not all parts are upper-case, or when
// isKeyword accepts the part.
133 boolean flag
= (isUpperCase
&& !isAllWordsAreUpperCased
) || isKeyword(s
);
134 Matcher matcher
= WORD
.matcher(s
);
135 if (matcher
.find()) {
// First narrow range to this part, then narrow again to the WORD match inside the part.
136 TextRange found
= matcherRange(subRange(range
, start
, end
), matcher
);
137 addWord(text
, results
, flag
, found
);
/**
 * Processes one fragment of text: records a SPECIAL match directly, and passes an
 * EXTENDED_WORD match on to splitSimpleWord.
 *
 * NOTE(review): the guard body and the return statements are missing from this
 * listing; whether the SPECIAL branch exits early cannot be seen here.
 *
 * @param text  full text being processed
 * @param range range of the fragment inside text
 * @return presumably the results list built below - return statements not visible; confirm
 */
146 private static List
<CheckArea
> splitWord(String text
, TextRange range
) {
// Guard for empty text or a range of at most one character.
// NOTE(review): guard body not visible in this listing.
147 if (StringUtil
.isEmpty(text
) || range
.getLength() <= 1) {
151 List
<CheckArea
> results
= new ArrayList
<CheckArea
>();
152 String word
= text
.substring(range
.getStartOffset(), range
.getEndOffset());
// SPECIAL is anchored at the start of the fragment; a hit is recorded with flag true.
154 Matcher specialMatcher
= SPECIAL
.matcher(word
);
155 if (specialMatcher
.find()) {
156 TextRange found
= matcherRange(range
, specialMatcher
);
157 addWord(text
, results
, true, found
);
// Only the first EXTENDED_WORD match in the fragment is used (find is called once).
161 Matcher extendedMatcher
= EXTENDED_WORD
.matcher(word
);
162 if (extendedMatcher
.find()) {
163 TextRange found
= matcherRange(range
, extendedMatcher
);
164 results
.addAll(splitSimpleWord(text
, found
));
/**
 * Appends a CheckArea for the given range to results.
 *
 * @param text    full text being processed
 * @param results list that receives the new area
 * @param flag    boolean passed to the CheckArea constructor; forced to true for short ranges
 * @param found   range of the word inside text
 */
171 private static void addWord(String text
, List
<CheckArea
> results
, boolean flag
, TextRange found
) {
// Ranges spanning 3 characters or fewer count as too short.
172 boolean tooShort
= (found
.getEndOffset() - found
.getStartOffset()) <= 3;
// NOTE(review): the meaning of the third constructor argument is defined by CheckArea,
// which is not shown in this listing.
173 results
.add(new CheckArea(text
, found
, flag
|| tooShort
));
// Predicate consulted by splitSimpleWord when computing the flag for a word part.
// NOTE(review): the method body is not present in this listing, so its result
// cannot be documented here.
176 private static boolean isKeyword(String s
) {
/**
 * Tests the elements of words with Strings.isUpperCase.
 *
 * NOTE(review): the statements that follow a failed test and the final return are
 * missing from this listing; by its name the method presumably reports whether every
 * element is upper-case - confirm.
 *
 * @param words parts to test; null yields false
 */
180 private static boolean isAllWordsAreUpperCased(String
[] words
) {
181 if (words
== null) return false;
182 for (String word
: words
) {
// Reached for the first element that is not upper-case; branch body not visible.
183 if (!Strings
.isUpperCase(word
)) {
/**
 * Looks for an element of words whose length is below 2.
 *
 * NOTE(review): the statements that follow a hit and the final return are missing
 * from this listing; by its name the method presumably reports whether such an
 * element exists - confirm.
 *
 * @param words parts to test; null yields false
 */
190 private static boolean containsShortWord(String
[] words
) {
191 if (words
== null) return false;
192 for (String word
: words
) {
// Reached for the first element shorter than 2 characters; branch body not visible.
193 if (word
.length() < 2) {
/**
 * Converts the current match of matcher into a range positioned relative to the
 * start of the given range.
 *
 * The matcher must hold a successful match: Matcher.start and Matcher.end throw
 * IllegalStateException otherwise. The visible callers invoke this only after
 * find() has returned true.
 *
 * @param range   range whose start offset is the origin for the match offsets
 * @param matcher matcher holding a successful match
 * @return the range produced by subRange for the match start and end
 */
201 private static TextRange
matcherRange(@NotNull TextRange range
, @NotNull Matcher matcher
) {
202 return subRange(range
, matcher
.start(), matcher
.end());
/**
 * Builds a range from offsets that are relative to the start of the given range.
 *
 * @param range range whose start offset is added to start
 * @param start start offset relative to the start of range
 * @param end   end offset relative to the start of range
 * @return TextRange.from called with the shifted start and the length end minus start
 */
206 private static TextRange
subRange(@NotNull TextRange range
, int start
, int end
) {
// TextRange.from receives an offset and a length, not an end offset.
207 return TextRange
.from(range
.getStartOffset() + start
, end
- start
);