1 // TortoiseMerge - a Diff/Patch program
3 // Copyright (C) 2007-2011 - TortoiseSVN
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License
7 // as published by the Free Software Foundation; either version 2
8 // of the License, or (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software Foundation,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 #include "UnicodeUtils.h"
23 #include ".\filetextlines.h"
24 #include "FormatMessageWrapper.h"
25 #include "SmartHandle.h"
26 #include "auto_buffer.h"
// Swaps the two bytes of a 16-bit code unit (little-endian <-> big-endian).
//
// nValue : the code unit to convert; only its low 16 bits are used.
// returns: the value with its low and high byte exchanged.
//
// Both halves are masked explicitly so the result is a clean 16-bit value
// even on platforms where wchar_t is wider than 16 bits or is signed; with a
// 16-bit wchar_t the result is the same as the former shift-and-truncate form.
wchar_t WideCharSwap(wchar_t nValue)
{
    const unsigned int highByte = (static_cast<unsigned int>(nValue) >> 8) & 0xFFu;
    const unsigned int lowByte = static_cast<unsigned int>(nValue) & 0xFFu;
    return static_cast<wchar_t>((lowByte << 8) | highByte);
}
33 CFileTextLines::CFileTextLines(void)
34 : m_UnicodeType(CFileTextLines::AUTOTYPE
)
35 , m_LineEndings(EOL_AUTOLINE
)
36 , m_bReturnAtEnd(false)
40 CFileTextLines::~CFileTextLines(void)
// Guesses the text encoding of a raw file buffer.
//
// pBuffer : start of the file data.
// cb      : number of bytes in pBuffer (the byte-wise loops below run up to cb).
// returns : one of the CFileTextLines::UnicodeType values.
//
// Checks visible in this function, in order:
//   1. a 16-bit 0x0000 found while stepping through the buffer two bytes at
//      a time                               -> BINARY
//   2. first 16-bit value is 0xFEFF          -> UNICODE_LE
//   3. first 16-bit value is 0xFFFE          -> UNICODE_BE
//   4. first 16-bit value is 0xBBEF          -> UTF8BOM
//      (presumably combined with a test of the third byte - confirm)
//   5. any byte 0xC0, 0xC1 or >= 0xF5        -> ASCII (not legal in UTF-8)
//   6. a 2-, 3- or 4-byte lead byte that is followed by a byte which is not
//      of the form 10xxxxxx                 -> ASCII
//   7. otherwise UTF8 or ASCII; the registry value
//      Software\TortoiseMerge\UseUTF8 is consulted when no non-ANSI byte
//      was seen.
//
// NOTE(review): the small-buffer guards and the pointer increments between
// the checks are not visible in this view - confirm against the full file
// before relying on the exact bounds behaviour.
44 CFileTextLines::UnicodeType
CFileTextLines::CheckUnicodeType(LPVOID pBuffer
, int cb
)
47 return CFileTextLines::ASCII
;
48 UINT16
* pVal16
= (UINT16
*)pBuffer
;
49 UINT8
* pVal8
= (UINT8
*)(pVal16
+1);
50 // scan the whole buffer for a 0x0000 sequence
51 // if found, we assume a binary file
52 for (int i
=0; i
<(cb
-2); i
=i
+2)
54 if (0x0000 == *pVal16
++)
55 return CFileTextLines::BINARY
;
// rewind and look for a byte order mark in the first 16-bit value
57 pVal16
= (UINT16
*)pBuffer
;
58 if (*pVal16
== 0xFEFF)
59 return CFileTextLines::UNICODE_LE
;
60 if (*pVal16
== 0xFFFE)
61 return CFileTextLines::UNICODE_BE
;
64 if (*pVal16
== 0xBBEF)
67 return CFileTextLines::UTF8BOM
;
69 // check for illegal UTF8 chars
70 pVal8
= (UINT8
*)pBuffer
;
71 for (int i
=0; i
<cb
; ++i
)
73 if ((*pVal8
== 0xC0)||(*pVal8
== 0xC1)||(*pVal8
>= 0xF5))
74 return CFileTextLines::ASCII
;
// second pass: validate the continuation bytes of multi-byte sequences
77 pVal8
= (UINT8
*)pBuffer
;
79 bool bNonANSI
= false;
80 for (int i
=0; i
<(cb
-3); ++i
)
// lead byte 110xxxxx: two-byte sequence
84 if ((*pVal8
& 0xE0)==0xC0)
87 if ((*pVal8
& 0xC0)!=0x80)
88 return CFileTextLines::ASCII
;
// lead byte 1110xxxx: three-byte sequence
91 if ((*pVal8
& 0xF0)==0xE0)
94 if ((*pVal8
& 0xC0)!=0x80)
95 return CFileTextLines::ASCII
;
97 if ((*pVal8
& 0xC0)!=0x80)
98 return CFileTextLines::ASCII
;
// lead byte 11110xxx: four-byte sequence
101 if ((*pVal8
& 0xF8)==0xF0)
104 if ((*pVal8
& 0xC0)!=0x80)
105 return CFileTextLines::ASCII
;
107 if ((*pVal8
& 0xC0)!=0x80)
108 return CFileTextLines::ASCII
;
110 if ((*pVal8
& 0xC0)!=0x80)
111 return CFileTextLines::ASCII
;
117 return CFileTextLines::UTF8
;
118 if ((!bNonANSI
)&&(DWORD(CRegDWORD(_T("Software\\TortoiseMerge\\UseUTF8"), FALSE
))))
119 return CFileTextLines::UTF8
;
120 return CFileTextLines::ASCII
;
// Determines the line-ending style used in a raw file buffer.
//
// pBuffer : start of the file data, examined byte by byte.
// cb      : number of bytes to examine.
// returns : an EOL value; the result variable starts out as EOL_AUTOLINE.
//
// The loop compares buf[i], buf[i+1] and buf[i+2] against 0x0d (CR) and
// 0x0a (LF). Looking two bytes ahead presumably covers 16-bit text where a
// zero byte sits between the two characters - confirm.
//
// NOTE(review): buf[i+1] and buf[i+2] are only safe when guarded by a check
// against cb; those guards are not visible in this view - confirm that every
// look-ahead is bounds-checked.
124 EOL
CFileTextLines::CheckLineEndings(LPVOID pBuffer
, int cb
)
126 EOL retval
= EOL_AUTOLINE
;
127 char * buf
= (char *)pBuffer
;
128 for (int i
=0; i
<cb
; i
++)
130 //now search the buffer for line endings
140 if (buf
[i
+2] == 0x0d)
152 else if (buf
[i
+1] == 0x0d)
161 else if (buf
[i
] == 0x0d)
170 if (buf
[i
+2] == 0x0a)
182 else if (buf
[i
+1] == 0x0a)
// Reads a text file from disk and stores its lines in this object.
//
// sFilePath  : path of the file to read.
// lengthHint : defaults to 0; not referenced in the lines visible here
//              (presumably used to pre-size the line array - confirm).
// returns    : BOOL success flag; on failure m_sErrorString holds a message.
//
// Steps visible in this function:
//   - m_LineEndings and m_UnicodeType are reset to their auto values.
//   - A directory path is rejected with IDS_ERR_FILE_NOTAFILE.
//   - A path that does not exist is treated as success (see comment below).
//   - The file is opened read-only with CreateFile and its size is queried
//     with GetFileSizeEx; oversized files get IDS_ERR_FILE_TOOBIG.
//   - The content is read with a single ReadFile call into a wchar_t buffer.
//   - Encoding and line endings are auto-detected when still set to "auto";
//     line-ending detection only looks at the first min(10000, dwReadBytes)
//     bytes.
//   - Binary files are rejected with IDS_ERR_FILE_BINARY.
//   - UTF8/UTF8BOM content is converted with MultiByteToWideChar(CP_UTF8),
//     ASCII content with MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED).
//   - UNICODE_BE content is byte-swapped with WideCharSwap.
//   - The buffer is split into lines at "\r\n", "\r" and "\n". A trailing
//     fragment without a line ending is added with EOL_NOENDING and clears
//     m_bReturnAtEnd; otherwise m_bReturnAtEnd is set.
//
// NOTE(review): the CMemoryException* handlers visible here call
// GetErrorMessage but no e->Delete() is visible; MFC exceptions caught by
// pointer normally have to be deleted by the handler - confirm.
// NOTE(review): ownership/cleanup of pFileBuf and pWideBuf on the error
// paths is not visible in this view - confirm that every early return
// releases them.
195 BOOL
CFileTextLines::Load(const CString
& sFilePath
, int lengthHint
/* = 0*/)
197 WCHAR exceptionError
[1000] = {0};
198 m_LineEndings
= EOL_AUTOLINE
;
199 m_UnicodeType
= CFileTextLines::AUTOTYPE
;
207 if (PathIsDirectory(sFilePath
))
209 m_sErrorString
.Format(IDS_ERR_FILE_NOTAFILE
, (LPCTSTR
)sFilePath
);
213 if (!PathFileExists(sFilePath
))
215 //file does not exist, so just return SUCCESS
219 CAutoFile hFile
= CreateFile(sFilePath
, GENERIC_READ
, FILE_SHARE_READ
, NULL
, OPEN_EXISTING
, NULL
, NULL
);
227 if (!GetFileSizeEx(hFile
, &fsize
))
234 // file is way too big for us
235 m_sErrorString
.LoadString(IDS_ERR_FILE_TOOBIG
);
239 // If new[] was done for type T delete[] must be called on a pointer of type T*,
240 // otherwise the behavior is undefined.
241 // +1 is to address possible truncation when integer division is done
242 wchar_t* pFileBuf
= NULL
;
245 pFileBuf
= new wchar_t[fsize
.LowPart
/sizeof(wchar_t) + 1];
247 catch (CMemoryException
* e
)
249 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
250 m_sErrorString
= exceptionError
;
253 DWORD dwReadBytes
= 0;
254 if (!ReadFile(hFile
, pFileBuf
, fsize
.LowPart
, &dwReadBytes
, NULL
))
260 if (m_UnicodeType
== CFileTextLines::AUTOTYPE
)
262 m_UnicodeType
= this->CheckUnicodeType(pFileBuf
, dwReadBytes
);
264 if (m_LineEndings
== EOL_AUTOLINE
)
266 m_LineEndings
= CheckLineEndings(pFileBuf
, min(10000, dwReadBytes
));
270 if (m_UnicodeType
== CFileTextLines::BINARY
)
272 m_sErrorString
.Format(IDS_ERR_FILE_BINARY
, (LPCTSTR
)sFilePath
);
277 // we may have to convert the file content
278 if ((m_UnicodeType
== UTF8
)||(m_UnicodeType
== UTF8BOM
))
// first call only asks for the required number of wide characters
280 int ret
= MultiByteToWideChar(CP_UTF8
, 0, (LPCSTR
)pFileBuf
, dwReadBytes
, NULL
, 0);
281 wchar_t * pWideBuf
= NULL
;
284 pWideBuf
= new wchar_t[ret
];
286 catch (CMemoryException
* e
)
288 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
289 m_sErrorString
= exceptionError
;
293 int ret2
= MultiByteToWideChar(CP_UTF8
, 0, (LPCSTR
)pFileBuf
, dwReadBytes
, pWideBuf
, ret
);
302 else if (m_UnicodeType
== ASCII
)
// same two-step conversion, using the system ANSI code page
304 int ret
= MultiByteToWideChar(CP_ACP
, MB_PRECOMPOSED
, (LPCSTR
)pFileBuf
, dwReadBytes
, NULL
, 0);
305 wchar_t * pWideBuf
= NULL
;
308 pWideBuf
= new wchar_t[ret
];
310 catch (CMemoryException
* e
)
312 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
313 m_sErrorString
= exceptionError
;
317 int ret2
= MultiByteToWideChar(CP_ACP
, MB_PRECOMPOSED
, (LPCSTR
)pFileBuf
, dwReadBytes
, pWideBuf
, ret
);
327 // fill in the lines into the array
328 wchar_t * pTextBuf
= pFileBuf
;
329 wchar_t * pLineStart
= pFileBuf
;
330 if ((m_UnicodeType
== UNICODE_LE
)||(m_UnicodeType
== UNICODE_BE
))
332 // UTF16 have two bytes per char
335 if ((m_UnicodeType
== UTF8BOM
)||(m_UnicodeType
== UNICODE_LE
)||(m_UnicodeType
== UNICODE_BE
))
343 if (m_UnicodeType
== UNICODE_BE
)
345 // swap the bytes to little-endian order to get proper strings in wchar_t format
346 wchar_t * pSwapBuf
= pTextBuf
;
347 for (DWORD i
= 0; i
<dwReadBytes
; ++i
)
349 *pSwapBuf
= WideCharSwap(*pSwapBuf
);
// split the buffer into lines; pLineStart marks the beginning of the current line
354 for (DWORD i
= 0; i
<dwReadBytes
; ++i
)
356 if (*pTextBuf
== '\r')
358 if ((i
+ 1) < dwReadBytes
)
360 if (*(pTextBuf
+1) == '\n')
363 CString
line(pLineStart
, (int)(pTextBuf
-pLineStart
));
365 pLineStart
= pTextBuf
+2;
372 CString
line(pLineStart
, (int)(pTextBuf
-pLineStart
));
374 pLineStart
=pTextBuf
+1;
378 else if (*pTextBuf
== '\n')
381 CString
line(pLineStart
, (int)(pTextBuf
-pLineStart
));
383 pLineStart
=pTextBuf
+1;
// leftover text after the last line ending: the file has no final newline
387 if (pLineStart
< pTextBuf
)
389 CString
line(pLineStart
, (int)(pTextBuf
-pLineStart
));
390 Add(line
, EOL_NOENDING
);
391 m_bReturnAtEnd
= false;
394 m_bReturnAtEnd
= true;
// Prepares a wide-character line for comparison/saving by optionally
// dropping a fixed-width prefix and trimming spaces/tabs.
//
// sLine               : line to modify in place.
// dwIgnoreWhitespaces : selects the trimming mode handled by the switch
//                       below (compare as-is, trim both ends, trim leading,
//                       trim trailing).
// blame               : presumably enables the removal of the first 66
//                       characters (blame information) - the guarding
//                       condition is not visible in this view; confirm.
//
// NOTE(review): the "Ignore all whitespaces" branch only trims both ends of
// the line; it does not remove interior whitespace, unlike
// StripAsciiWhiteSpace(CStringA&).
// NOTE(review): the case labels are not visible in this view - confirm which
// numeric value maps to which branch.
401 void CFileTextLines::StripWhiteSpace(CString
& sLine
,DWORD dwIgnoreWhitespaces
, bool blame
)
405 if (sLine
.GetLength() > 66)
406 sLine
= sLine
.Mid(66);
408 switch (dwIgnoreWhitespaces
)
411 // Compare whitespaces
415 // Ignore all whitespaces
416 sLine
.TrimLeft(_T(" \t"));
417 sLine
.TrimRight(_T(" \t"));
420 // Ignore leading whitespace
421 sLine
.TrimLeft(_T(" \t"));
424 // Ignore ending whitespace
425 sLine
.TrimRight(_T(" \t"));
// 8-bit counterpart of StripWhiteSpace: optionally drops a fixed-width
// prefix and removes or trims spaces/tabs from a CStringA line.
//
// sLine               : line to modify in place.
// dwIgnoreWhitespaces : selects the mode handled by the switch below; value 0
//                       compares whitespace as-is. The labels of the other
//                       branches are not visible in this view - confirm.
// blame               : presumably enables the removal of the first 66
//                       characters (blame information) - the guarding
//                       condition is not visible in this view; confirm.
//
// The "Ignore all whitespaces" branch calls the single-argument
// StripAsciiWhiteSpace overload.
430 void CFileTextLines::StripAsciiWhiteSpace(CStringA
& sLine
,DWORD dwIgnoreWhitespaces
, bool blame
)
434 if (sLine
.GetLength() > 66)
435 sLine
= sLine
.Mid(66);
437 switch (dwIgnoreWhitespaces
)
439 case 0: // Compare whitespaces
443 // Ignore all whitespaces
444 StripAsciiWhiteSpace(sLine
);
447 // Ignore leading whitespace
448 sLine
.TrimLeft(" \t");
451 // Ignore ending whitespace
452 sLine
.TrimRight(" \t");
458 // Fast in-place removal of spaces and tabs from CStringA line
//
// sLine : line to compact; modified in place.
//
// The string's own buffer is obtained with GetBuffer and walked with a read
// pointer and a write pointer: every character that is neither ' ' nor '\t'
// is copied down to the write position. ReleaseBuffer(outputLen) then sets
// the new string length.
//
// NOTE(review): the loop header and the bookkeeping of outputLen are not
// visible in this view - confirm that outputLen counts exactly the copied
// characters.
460 void CFileTextLines::StripAsciiWhiteSpace(CStringA
& sLine
)
463 char* pWriteChr
= sLine
.GetBuffer(sLine
.GetLength());
464 const char* pReadChr
= pWriteChr
;
467 if(*pReadChr
!= ' ' && *pReadChr
!= '\t')
469 *pWriteChr
++ = *pReadChr
;
475 sLine
.ReleaseBuffer(outputLen
);
// Writes all stored lines to a file, in the encoding recorded in
// m_UnicodeType unless UTF-8 output is forced.
//
// sFilePath           : destination file; missing directories along the path
//                       are created first.
// bSaveAsUTF8         : when true, the UTF-16 and ASCII branches are skipped
//                       and the UTF-8 branch is used.
// dwIgnoreWhitespaces : passed on to StripWhiteSpace / StripAsciiWhiteSpace
//                       for every line (default 0).
// bIgnoreCase         : presumably guards the MakeLower() calls - the
//                       condition itself is not visible in this view; confirm.
// bBlame              : passed on to the strip helpers (default false).
// returns             : BOOL success flag; on failure m_sErrorString holds a
//                       message (IDS_ERR_FILE_OPEN or the text of the caught
//                       CException).
//
// Branches visible in this function:
//   - UNICODE_LE: writes the 16-bit value 0xFEFF, then each line as TCHARs.
//   - UNICODE_BE: writes the 16-bit value 0xFFFE, then each line with the
//     two bytes of every character swapped (high byte first) via beBuf,
//     which grows when a line is longer than the current buffer.
//   - ASCII / AUTOTYPE: converts each line to CStringA and writes it.
//   - UTF8 / UTF8BOM or bSaveAsUTF8: for UTF8BOM a three-byte BOM is written
//     (0xBBEF as 16-bit value plus one more byte), then each line converted
//     with CUnicodeUtils::GetUTF8.
// In every branch the line ending is taken from GetLineEnding(i), falling
// back to m_LineEndings for EOL_AUTOLINE, and the ending of the last line is
// only written when m_bReturnAtEnd is set.
//
// NOTE(review): in the catch handler no e->Delete() is visible; MFC
// exceptions caught by pointer normally have to be deleted - confirm.
478 BOOL
CFileTextLines::Save(const CString
& sFilePath
, bool bSaveAsUTF8
, DWORD dwIgnoreWhitespaces
/*=0*/, BOOL bIgnoreCase
/*= FALSE*/, bool bBlame
/*= false*/)
482 CString destPath
= sFilePath
;
483 // now make sure that the destination directory exists
485 while (destPath
.Find('\\', ind
)>=2)
487 if (!PathIsDirectory(destPath
.Left(destPath
.Find('\\', ind
))))
489 if (!CreateDirectory(destPath
.Left(destPath
.Find('\\', ind
)), NULL
))
492 ind
= destPath
.Find('\\', ind
)+1;
495 CStdioFile file
; // Hugely faster than CFile for big file writes - because it uses buffering
496 if (!file
.Open(sFilePath
, CFile::modeCreate
| CFile::modeWrite
| CFile::typeBinary
))
498 m_sErrorString
.Format(IDS_ERR_FILE_OPEN
, (LPCTSTR
)sFilePath
);
501 if ((!bSaveAsUTF8
)&&(m_UnicodeType
== CFileTextLines::UNICODE_LE
))
503 //first write the BOM
504 UINT16 wBOM
= 0xFEFF;
505 file
.Write(&wBOM
, 2);
506 for (int i
=0; i
<GetCount(); i
++)
508 CString sLine
= GetAt(i
);
509 EOL ending
= GetLineEnding(i
);
510 StripWhiteSpace(sLine
,dwIgnoreWhitespaces
, bBlame
);
512 sLine
= sLine
.MakeLower();
513 file
.Write((LPCTSTR
)sLine
, sLine
.GetLength()*sizeof(TCHAR
));
514 if (ending
== EOL_AUTOLINE
)
515 ending
= m_LineEndings
;
// sLine is reused to hold the line-ending characters from here on
523 sLine
= _T("\x0d\x0a");
529 sLine
= _T("\x0a\x0d");
535 if ((m_bReturnAtEnd
)||(i
!= GetCount()-1))
536 file
.Write((LPCTSTR
)sLine
, sLine
.GetLength()*sizeof(TCHAR
));
539 if ((!bSaveAsUTF8
)&&(m_UnicodeType
== CFileTextLines::UNICODE_BE
))
541 int linebuflen
= 4096;
542 auto_buffer
<BYTE
> beBuf(linebuflen
);
543 //first write the BOM
544 UINT16 wBOM
= 0xFFFE;
545 file
.Write(&wBOM
, 2);
546 for (int i
=0; i
<GetCount(); i
++)
548 CString sLine
= GetAt(i
);
549 EOL ending
= GetLineEnding(i
);
550 StripWhiteSpace(sLine
,dwIgnoreWhitespaces
, bBlame
);
552 sLine
= sLine
.MakeLower();
553 int bytelen
= sLine
.GetLength()*sizeof(WCHAR
);
554 if (bytelen
> linebuflen
)
556 // increase buffer size if necessary
557 linebuflen
= bytelen
+ 1024;
558 beBuf
.reset(linebuflen
);
560 for (int spos
= 0; spos
< bytelen
; )
562 // swap the bytes to big-endian order
563 wchar_t c
= sLine
[spos
/2];
564 beBuf
[spos
++] = c
>>8;
565 beBuf
[spos
++] = c
& 0xFF;
567 file
.Write(beBuf
.get(), sLine
.GetLength()*sizeof(WCHAR
));
568 if (ending
== EOL_AUTOLINE
)
569 ending
= m_LineEndings
;
577 sLine
= _T("\x0d\x0a");
583 sLine
= _T("\x0a\x0d");
589 if ((m_bReturnAtEnd
)||(i
!= GetCount()-1))
591 // swap the bytes to big-endian order
594 if (sLine
.GetLength() > 0)
596 wchar_t c
= sLine
[0];
600 if (sLine
.GetLength() > 1)
602 wchar_t c
= sLine
[1];
607 file
.Write(buf
, sLine
.GetLength()*sizeof(TCHAR
));
611 else if ((!bSaveAsUTF8
)&&((m_UnicodeType
== CFileTextLines::ASCII
)||(m_UnicodeType
== CFileTextLines::AUTOTYPE
)))
613 for (int i
=0; i
< GetCount(); i
++)
615 // Copy CString to 8 bit without conversion
// NOTE(review): constructing a CStringA from a CString may perform a
// code-page conversion in Unicode builds - confirm the wording above.
616 CString sLineT
= GetAt(i
);
617 CStringA sLine
= CStringA(sLineT
);
618 EOL ending
= GetLineEnding(i
);
620 StripAsciiWhiteSpace(sLine
,dwIgnoreWhitespaces
, bBlame
);
622 sLine
= sLine
.MakeLower();
623 if ((m_bReturnAtEnd
)||(i
!= GetCount()-1))
625 if (ending
== EOL_AUTOLINE
)
626 ending
= m_LineEndings
;
634 sLine
.Append("\x0d\x0a", 2);
640 sLine
.Append("\x0a\x0d", 2);
644 file
.Write((LPCSTR
)sLine
, sLine
.GetLength());
647 else if ((bSaveAsUTF8
)||((m_UnicodeType
== CFileTextLines::UTF8BOM
)||(m_UnicodeType
== CFileTextLines::UTF8
)))
649 if (m_UnicodeType
== CFileTextLines::UTF8BOM
)
651 //first write the BOM
652 UINT16 wBOM
= 0xBBEF;
653 file
.Write(&wBOM
, 2);
655 file
.Write(&uBOM
, 1);
657 for (int i
=0; i
<GetCount(); i
++)
659 CStringA sLine
= CUnicodeUtils::GetUTF8(GetAt(i
));
660 EOL ending
= GetLineEnding(i
);
661 StripAsciiWhiteSpace(sLine
,dwIgnoreWhitespaces
, bBlame
);
663 sLine
= sLine
.MakeLower();
665 if ((m_bReturnAtEnd
)||(i
!= GetCount()-1))
667 if (ending
== EOL_AUTOLINE
)
668 ending
= m_LineEndings
;
676 sLine
.Append("\x0d\x0a",2);
682 sLine
.Append("\x0a\x0d",2);
686 file
.Write((LPCSTR
)sLine
, sLine
.GetLength());
691 catch (CException
* e
)
// the exception's message text becomes the error string reported to the caller
693 e
->GetErrorMessage(m_sErrorString
.GetBuffer(4096), 4096);
694 m_sErrorString
.ReleaseBuffer();
701 void CFileTextLines::SetErrorString()
703 m_sErrorString
= CFormatMessageWrapper();
706 void CFileTextLines::CopySettings(CFileTextLines
* pFileToCopySettingsTo
)
708 if (pFileToCopySettingsTo
)
710 pFileToCopySettingsTo
->m_UnicodeType
= m_UnicodeType
;
711 pFileToCopySettingsTo
->m_LineEndings
= m_LineEndings
;
712 pFileToCopySettingsTo
->m_bReturnAtEnd
= m_bReturnAtEnd
;