platform/platform-impl/src/com/intellij/openapi/fileEditor/impl/LoadTextUtil.java

   1 /*
   2  * Copyright 2000-2009 JetBrains s.r.o.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 package com.intellij.openapi.fileEditor.impl;
  17
  18 import com.intellij.Patches;
  19 import com.intellij.lang.properties.charset.Native2AsciiCharset;
  20 import com.intellij.openapi.fileTypes.*;
  21 import com.intellij.openapi.project.Project;
  22 import com.intellij.openapi.util.Key;
  23 import com.intellij.openapi.util.Pair;
  24 import com.intellij.openapi.util.text.StringUtil;
  25 import com.intellij.openapi.vfs.CharsetToolkit;
  26 import com.intellij.openapi.vfs.VirtualFile;
  27 import com.intellij.openapi.vfs.encoding.EncodingManager;
  28 import com.intellij.testFramework.LightVirtualFile;
  29 import com.intellij.util.ArrayUtil;
  30 import org.jetbrains.annotations.NotNull;
  31 import org.jetbrains.annotations.Nullable;
  32
  33 import java.io.*;
  34 import java.nio.ByteBuffer;
  35 import java.nio.CharBuffer;
  36 import java.nio.charset.Charset;
  37
  38 public final class LoadTextUtil {
  /**
   * User-data key under which the line separator found while converting a file's bytes
   * to text is stored on the file ({@code null} when no separator was found).
   */
  static final Key<String> DETECTED_LINE_SEPARATOR_KEY = Key.create("DETECTED_LINE_SEPARATOR_KEY");

  /** Utility class: every member is static, so instantiation is disabled. */
  private LoadTextUtil() {
    // intentionally empty
  }
  43
  44   private static Pair<CharSequence, String> convertLineSeparators(final CharBuffer buffer) {
  45     final int LF = 1;
  46     final int CR = 2;
  47     int line_separator = 0;
  48
  49     int dst = 0;
  50     char prev = ' ';
  51     final int length = buffer.length();
  52     for (int src = 0; src < length; src++) {
  53       char c = buffer.charAt(src);
  54       switch (c) {
  55         case '\r':
  56           buffer.put(dst++, '\n');
  57           line_separator = CR;
  58           break;
  59         case '\n':
  60           if (prev == '\r') {
  61             line_separator = CR + LF;
  62           }
  63           else {
  64             buffer.put(dst++, '\n');
  65             line_separator = LF;
  66           }
  67           break;
  68         default:
  69           buffer.put(dst++, c);
  70           break;
  71       }
  72       prev = c;
  73     }
  74
  75     String detectedLineSeparator = null;
  76     switch (line_separator) {
  77       case CR:
  78         detectedLineSeparator = "\r";
  79         break;
  80       case LF:
  81         detectedLineSeparator = "\n";
  82         break;
  83       case CR + LF:
  84         detectedLineSeparator = "\r\n";
  85         break;
  86     }
  87
  88     CharSequence result;
  89     if (buffer.length() == dst) {
  90       result = buffer;
  91     }
  92     else {
  93       result = buffer.subSequence(0, dst);
  94     }
  95     return Pair.create(result, detectedLineSeparator);
  96   }
  97
  98   public static Charset detectCharset(final VirtualFile virtualFile, final byte[] content) {
  99     Charset charset = dodetectCharset(virtualFile, content);
 100     charset = charset == null ? EncodingManager.getInstance().getDefaultCharset() : charset;
 101     if (virtualFile.getFileType() == StdFileTypes.PROPERTIES && EncodingManager.getInstance().isNative2AsciiForPropertiesFiles(virtualFile)) {
 102       charset = Native2AsciiCharset.wrap(charset);
 103     }
 104     virtualFile.setCharset(charset);
 105     return charset;
 106   }
 107
 108   private static Charset dodetectCharset(final VirtualFile virtualFile, final byte[] content) {
 109     EncodingManager settings = EncodingManager.getInstance();
 110     boolean shouldGuess = settings != null && settings.isUseUTFGuessing(virtualFile);
 111     CharsetToolkit toolkit = shouldGuess ? new CharsetToolkit(content, EncodingManager.getInstance().getDefaultCharset()) : null;
 112     setUtfCharsetWasDetectedFromBytes(virtualFile, false);
 113     if (shouldGuess) {
 114       toolkit.setEnforce8Bit(true);
 115       Charset charset = toolkit.guessFromBOM();
 116       if (charset != null) {
 117         setUtfCharsetWasDetectedFromBytes(virtualFile, true);
 118         return charset;
 119       }
 120       CharsetToolkit.GuessedEncoding guessed = toolkit.guessFromContent(content.length);
 121       if (guessed == CharsetToolkit.GuessedEncoding.VALID_UTF8) {
 122         setUtfCharsetWasDetectedFromBytes(virtualFile, true);
 123         return CharsetToolkit.UTF8_CHARSET; //UTF detected, ignore all directives
 124       }
 125     }
 126
 127     FileType fileType = virtualFile.getFileType();
 128     String charsetName = fileType.getCharset(virtualFile, content);
 129
 130     if (charsetName == null) {
 131       Charset saved = EncodingManager.getInstance().getEncoding(virtualFile, true);
 132       if (saved != null) return saved;
 133     }
 134     return CharsetToolkit.forName(charsetName);
 135   }
 136
 137   private static int skipBOM(final VirtualFile virtualFile, byte[] content) {
 138     final byte[] bom = getBOM(content, Patches.SUN_BUG_ID_4508058? virtualFile.getCharset() : null);
 139     if (bom.length != 0) {
 140       virtualFile.setBOM(bom);
 141     }
 142     return bom.length;
 143   }
 144
 145   @NotNull
 146   private static byte[] getBOM(byte[] content, final Charset charset) {
 147     if (Patches.SUN_BUG_ID_4508058) {
 148       if (charset != null && charset.name().contains(CharsetToolkit.UTF8) && CharsetToolkit.hasUTF8Bom(content)) {
 149         return CharsetToolkit.UTF8_BOM;
 150       }
 151     }
 152     if (CharsetToolkit.hasUTF16LEBom(content)) {
 153       return CharsetToolkit.UTF16LE_BOM;
 154     }
 155     if (CharsetToolkit.hasUTF16BEBom(content)) {
 156       return CharsetToolkit.UTF16BE_BOM;
 157     }
 158     return ArrayUtil.EMPTY_BYTE_ARRAY;
 159   }
 160
 161   /**
 162    * Gets the <code>Writer</code> for this file and sets modification stamp and time stamp to the specified values
 163    * after closing the Writer.<p>
 164    * <p/>
 165    * Normally you should not use this method.
 166    *
 167    * @param project
 168    *@param virtualFile
 169    * @param requestor            any object to control who called this method. Note that
 170  *                             it is considered to be an external change if <code>requestor</code> is <code>null</code>.
 171  *                             See {@link com.intellij.openapi.vfs.VirtualFileEvent#getRequestor}
 172    * @param text
 173    * @param newModificationStamp new modification stamp or -1 if no special value should be set @return <code>Writer</code>
 174    * @throws java.io.IOException if an I/O error occurs
 175    * @see VirtualFile#getModificationStamp()
 176    */
 177   @SuppressWarnings({"IOResourceOpenedButNotSafelyClosed"})
 178   public static Writer getWriter(@Nullable Project project, final VirtualFile virtualFile, Object requestor, final String text, final long newModificationStamp)
 179     throws IOException {
 180     Charset existing = virtualFile.getCharset();
 181     Charset specified = extractCharsetFromFileContent(project, virtualFile, text);
 182     Charset charset = chooseMostlyHarmlessCharset(existing, specified, text);
 183     if (charset != null && !charset.equals(existing)) {
 184       virtualFile.setCharset(charset);
 185       if (virtualFile.getBOM() != null) {
 186         // prevent file to be reloaded in other encoding after save with BOM
 187         setUtfCharsetWasDetectedFromBytes(virtualFile, true);
 188       }
 189     }
 190     OutputStream outputStream = virtualFile.getOutputStream(requestor, newModificationStamp, -1);
 191     return new BufferedWriter(charset == null ? new OutputStreamWriter(outputStream) : new OutputStreamWriter(outputStream, charset));
 192   }
 193
 194   private static Charset chooseMostlyHarmlessCharset(Charset existing, Charset specified, String text) {
 195     if (existing == null) return specified;
 196     if (specified == null) return existing;
 197     if (specified.equals(existing)) return specified;
 198     if (isSupported(specified, text)) return specified; //if explicitly specified encoding is safe, return it
 199     if (isSupported(existing, text)) return existing;   //otherwise stick to the old encoding if it's ok
 200     return specified;                                   //if both are bad there is no difference
 201   }
 202
 203   private static boolean isSupported(Charset charset, String str) {
 204     if (!charset.canEncode()) return false;
 205     ByteBuffer out = charset.encode(str);
 206     CharBuffer buffer = charset.decode(out);
 207     return str.equals(buffer.toString());
 208   }
 209
 210   public static Charset extractCharsetFromFileContent(@Nullable Project project, final VirtualFile virtualFile, final String text) {
 211     Charset charset = charsetFromContentOrNull(project, virtualFile, text);
 212     if (charset == null) charset = virtualFile.getCharset();
 213     return charset;
 214   }
 215
 216   @Nullable("returns null if cannot determine from content")
 217   public static Charset charsetFromContentOrNull(@Nullable Project project, @NotNull VirtualFile virtualFile, @NotNull String text) {
 218     FileType fileType = virtualFile.getFileType();
 219     if (fileType instanceof LanguageFileType) {
 220       return ((LanguageFileType)fileType).extractCharsetFromFileContent(project, virtualFile, text);
 221     }
 222     return null;
 223   }
 224
 225   public static CharSequence loadText(@NotNull VirtualFile file) {
 226     return loadText(file, false);
 227   }
 228
 229   public static CharSequence loadText(@NotNull VirtualFile file, final boolean allowMissingDecompiler) {
 230     if (file instanceof LightVirtualFile) {
 231       CharSequence content = ((LightVirtualFile)file).getContent();
 232       if (StringUtil.indexOf(content, '\r') == -1) return content;
 233
 234       CharBuffer buffer = CharBuffer.allocate(content.length());
 235       buffer.append(content);
 236       buffer.rewind();
 237       return convertLineSeparators(buffer).first;
 238     }
 239
 240     assert !file.isDirectory() : "'"+file.getPresentableUrl() + "' is directory";
 241     final FileType fileType = file.getFileType();
 242
 243     if (fileType.isBinary()) {
 244       final BinaryFileDecompiler decompiler = BinaryFileTypeDecompilers.INSTANCE.forFileType(fileType);
 245       if (decompiler != null) {
 246         CharSequence text = decompiler.decompile(file);
 247         StringUtil.assertValidSeparators(text);
 248         return text;
 249       }
 250
 251       if (allowMissingDecompiler) return null;
 252       throw new IllegalArgumentException("Attempt to load text for binary file, that doesn't have decompiler plugged in: "+file.getPresentableUrl());
 253     }
 254
 255     try {
 256       byte[] bytes = file.contentsToByteArray();
 257       return getTextByBinaryPresentation(bytes, file);
 258     }
 259     catch (IOException e) {
 260       return ArrayUtil.EMPTY_CHAR_SEQUENCE;
 261     }
 262   }
 263
 264   @NotNull
 265   public static CharSequence getTextByBinaryPresentation(@NotNull final byte[] bytes, @NotNull VirtualFile virtualFile) {
 266     return getTextByBinaryPresentation(bytes, virtualFile, true);
 267   }
 268
 269   @NotNull
 270   public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes, @NotNull VirtualFile virtualFile, final boolean rememberDetectedSeparators) {
 271     final Charset charset = detectCharset(virtualFile, bytes);
 272     final int offset = skipBOM(virtualFile, bytes);
 273
 274     final Pair<CharSequence, String> result = convertBytes(bytes, charset, offset);
 275     if (rememberDetectedSeparators) {
 276       virtualFile.putUserData(DETECTED_LINE_SEPARATOR_KEY, result.getSecond());
 277     }
 278     return result.getFirst();
 279   }
 280
 281   /**
 282    * Get detected line separator, if the file never been loaded, is loaded if checkFile parameter is specified.
 283    *
 284    * @param file      the file to check
 285    * @param checkFile if the line separator was not detected before, try to detect it
 286    * @return the detected line separator or null
 287    */
 288   @Nullable
 289   public static String detectLineSeparator(@NotNull VirtualFile file, boolean checkFile) {
 290     String lineSeparator = file.getUserData(DETECTED_LINE_SEPARATOR_KEY);
 291     if (lineSeparator == null && checkFile) {
 292       try {
 293         getTextByBinaryPresentation(file.contentsToByteArray(), file);
 294         lineSeparator = file.getUserData(DETECTED_LINE_SEPARATOR_KEY);
 295       }
 296       catch (IOException e) {
 297         // null will be returned
 298       }
 299     }
 300     return lineSeparator;
 301   }
 302
 303   /**
 304    * Change line separator for the file to the specified value (assumes that the documents were saved)
 305    *
 306    * @param project          the project instance
 307    * @param requestor        the requestor for the operation
 308    * @param file             the file to convert
 309    * @param newLineSeparator the new line separator for the file
 310    * @throws IOException in the case of IO problem
 311    */
 312   public static void changeLineSeparator(@Nullable Project project,
 313                                          @Nullable Object requestor,
 314                                          @NotNull VirtualFile file,
 315                                          @NotNull String newLineSeparator) throws IOException {
 316     String lineSeparator = file.getUserData(DETECTED_LINE_SEPARATOR_KEY);
 317     if (lineSeparator != null && lineSeparator.equals(newLineSeparator)) {
 318       return;
 319     }
 320     CharSequence cs = getTextByBinaryPresentation(file.contentsToByteArray(), file);
 321     lineSeparator = file.getUserData(DETECTED_LINE_SEPARATOR_KEY);
 322     if (lineSeparator == null || lineSeparator.equals(newLineSeparator)) {
 323       return;
 324     }
 325     if (!newLineSeparator.equals("\n")) {
 326       cs = StringUtil.convertLineSeparators(cs.toString(), newLineSeparator);
 327     }
 328     String text = cs.toString();
 329     file.putUserData(DETECTED_LINE_SEPARATOR_KEY, newLineSeparator);
 330     Writer w = getWriter(project, file, requestor, text, System.currentTimeMillis());
 331     try {
 332       w.write(text);
 333     }
 334     finally {
 335       w.close();
 336     }
 337   }
 338
 339   @NotNull
 340   public static CharSequence getTextByBinaryPresentation(@NotNull byte[] bytes, Charset charset) {
 341     final int offset = getBOM(bytes, charset).length;
 342     return convertBytes(bytes, charset, offset).getFirst();
 343   }
 344
 345   @NotNull
 346   private static Pair<CharSequence, String> convertBytes(@NotNull byte[] bytes, Charset charset, final int startOffset) {
 347     ByteBuffer byteBuffer = ByteBuffer.wrap(bytes, startOffset, bytes.length - startOffset);
 348
 349     if (charset == null) {
 350       charset = CharsetToolkit.getDefaultSystemCharset();
 351     }
 352     if (charset == null) {
 353       //noinspection HardCodedStringLiteral
 354       charset = Charset.forName("ISO-8859-1");
 355     }
 356     CharBuffer charBuffer = charset.decode(byteBuffer);
 357     return convertLineSeparators(charBuffer);
 358   }
 359
 360   private static final Key<Boolean> UTF_CHARSET_WAS_DETECTED_FROM_BYTES = new Key<Boolean>("UTF_CHARSET_WAS_DETECTED_FROM_BYTES");
 361   public static boolean utfCharsetWasDetectedFromBytes(@NotNull VirtualFile virtualFile) {
 362     return virtualFile.getUserData(UTF_CHARSET_WAS_DETECTED_FROM_BYTES) != null;
 363   }
 364   private static void setUtfCharsetWasDetectedFromBytes(@NotNull  VirtualFile virtualFile, boolean flag) {
 365     virtualFile.putUserData(UTF_CHARSET_WAS_DETECTED_FROM_BYTES, flag ? Boolean.TRUE : null);
 366   }
 367 }