1 // TortoiseGitMerge - a Diff/Patch program
3 // Copyright (C) 2007-2012 - TortoiseSVN
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License
7 // as published by the Free Software Foundation; either version 2
8 // of the License, or (at your option) any later version.
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
15 // You should have received a copy of the GNU General Public License
16 // along with this program; if not, write to the Free Software Foundation,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 #include "UnicodeUtils.h"
23 #include "filetextlines.h"
24 #include "FormatMessageWrapper.h"
25 #include "SmartHandle.h"
// Swaps the two bytes of a 16-bit UTF-16 code unit (big endian <-> little endian).
// Only the low 16 bits of nValue take part and the result is masked back to
// 16 bits, so the function does not depend on wchar_t being exactly 16 bits
// wide nor on an implicit narrowing conversion of the promoted int.
inline wchar_t WideCharSwap(wchar_t nValue)
{
    const unsigned int nUnit = static_cast<unsigned int>(nValue) & 0xffffu;
    return static_cast<wchar_t>(((nUnit >> 8) | (nUnit << 8)) & 0xffffu);
    //return _byteswap_ushort(nValue);
}
33 UINT64
inline WordSwapBytes(UINT64 nValue
)
35 return ((nValue
&0xff00ff00ff00ff)<<8) | ((nValue
>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
38 UINT32
inline DwordSwapBytes(UINT32 nValue
)
40 UINT32 nRet
= (nValue
<<16) | (nValue
>>16); // swap WORDs
41 nRet
= ((nRet
&0xff00ff)<<8) | ((nRet
>>8)&0xff00ff); // swap BYTESs in WORDs
43 //return _byteswap_ulong(nValue);
46 UINT64
inline DwordSwapBytes(UINT64 nValue
)
48 UINT64 nRet
= ((nValue
&0xffff0000ffffL
)<<16) | ((nValue
>>16)&0xffff0000ffffL
); // swap WORDs in DWORDs
49 nRet
= ((nRet
&0xff00ff00ff00ff)<<8) | ((nRet
>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
53 CFileTextLines::CFileTextLines(void)
54 : m_UnicodeType(CFileTextLines::AUTOTYPE
)
55 , m_LineEndings(EOL_AUTOLINE
)
56 , m_bNeedsConversion(false)
60 CFileTextLines::~CFileTextLines(void)
// Guesses the encoding of a raw file buffer.
//
// pBuffer - start of the data; it is read through UINT32, UINT16, UINT8 and
//           (in the fast ASCII scan) UINT64 pointers
// cb      - size of the data (Load passes the number of bytes it read)
//
// Results visible in this function:
//   BINARY   - some 32-bit value in the scanned range is zero
//   UTF32_LE - first 32-bit value equals 0x0000FEFF
//   UTF32_BE - first 32-bit value equals 0xFFFE0000
//   UTF16_LE - first 16-bit value equals 0xFEFF
//   UTF16_BE - first 16-bit value equals 0xFFFE
//   UTF8BOM  - first 16-bit value equals 0xBBEF and the third byte is 0xBF
//   UTF8     - the byte scan met non-ASCII bytes and ended with no pending
//              continuation bytes, or no non-ASCII byte was met and the
//              registry value Software\TortoiseGitMerge\UseUTF8 is non-zero
//   ASCII    - everything else, including every rejected UTF-8 sequence
// NOTE(review): the BOM constants are compared against values read in host
// byte order - presumably a little-endian host is assumed; confirm.
64 CFileTextLines::UnicodeType
CFileTextLines::CheckUnicodeType(LPVOID pBuffer
, int cb
)
67 return CFileTextLines::ASCII
;
68 const UINT32
* const pVal32
= (UINT32
*)pBuffer
;
69 const UINT16
* const pVal16
= (UINT16
*)pBuffer
;
70 const UINT8
* const pVal8
= (UINT8
*)pBuffer
;
71 // scan the whole buffer for a 0x00000000 sequence
72 // if found, we assume a binary file
74 for (int i
=0; i
<nDwords
; ++i
)
76 if (0x00000000 == pVal32
[i
])
77 return CFileTextLines::BINARY
;
// byte order marks: the 32-bit forms are tested before the 16-bit forms
81 if (*pVal32
== 0x0000FEFF)
83 return CFileTextLines::UTF32_LE
;
85 if (*pVal32
== 0xFFFE0000)
87 return CFileTextLines::UTF32_BE
;
90 if (*pVal16
== 0xFEFF)
92 return CFileTextLines::UTF16_LE
;
94 if (*pVal16
== 0xFFFE)
96 return CFileTextLines::UTF16_BE
;
99 return CFileTextLines::ASCII
;
// UTF-8 byte order mark EF BB BF: the first two bytes are tested as one 16-bit value
100 if (*pVal16
== 0xBBEF)
102 if (pVal8
[2] == 0xBF)
103 return CFileTextLines::UTF8BOM
;
105 // check for illegal UTF8 sequences
106 bool bNonANSI
= false;
109 // run fast for ascii
// eight bytes are tested at once: any set top bit means a non-ASCII byte
112 if ((*(UINT64
*)&pVal8
[i
] & 0x8080808080808080)!=0) // all Ascii?
// byte-wise validation: the lead-byte bits of zChar are examined one after
// the other; every rejected sequence makes the function answer ASCII
121 UINT8 zChar
= pVal8
[i
];
122 if ((zChar
& 0x80)==0) // Ascii
126 return CFileTextLines::ASCII
;
130 if ((zChar
& 0x40)==0) // top bit
133 return CFileTextLines::ASCII
;
138 return CFileTextLines::ASCII
;
140 else if ((zChar
& 0x20)==0) // top two bits
143 return CFileTextLines::ASCII
;
146 else if ((zChar
& 0x10)==0) // top three bits
150 else if ((zChar
& 0x08)==0) // top four bits
153 return CFileTextLines::ASCII
;
157 return CFileTextLines::ASCII
;
159 if (bNonANSI
&& nNeedData
==0)
160 // if get here thru nonAscii and no missing data left then its valid UTF8
161 return CFileTextLines::UTF8
;
// pure ASCII content is still reported as UTF8 when the registry switch is set
162 if ((!bNonANSI
)&&(DWORD(CRegDWORD(_T("Software\\TortoiseGitMerge\\UseUTF8"), FALSE
))))
163 return CFileTextLines::UTF8
;
164 return CFileTextLines::ASCII
;
// Reads the file sFilePath and stores its content line by line.
//
// Steps visible in this function:
//  - m_LineEndings and m_UnicodeType are reset to their auto values
//  - a directory path is rejected with IDS_ERR_FILE_NOTAFILE
//  - a missing file is not treated as an error (see the comment below)
//  - the file is opened for reading with full sharing and read into the
//    buffer oFile with a single ReadFile call of fsize.LowPart bytes
//  - the encoding is detected with CheckUnicodeType; every type except UTF8
//    and ASCII sets m_bNeedsConversion
//  - BINARY content is reported with IDS_ERR_FILE_BINARY; for the other
//    types a matching filter decodes the buffer, after which the buffer is
//    read as wchar_t text
//  - the text is split into lines; each kind of line ending is counted, the
//    kinds CRLF, FF, VT, NEL, LS and PS set m_bNeedsConversion, and the most
//    frequent kind becomes m_LineEndings
//
// sFilePath  - file to read
// lengthHint - not referenced in the lines visible here
// Error texts are stored in m_sErrorString.
// NOTE(review): the BOOL values returned on the individual paths are not
// visible in this excerpt - confirm against the full source.
168 BOOL
CFileTextLines::Load(const CString
& sFilePath
, int lengthHint
/* = 0*/)
170 WCHAR exceptionError
[1000] = {0};
171 m_LineEndings
= EOL_AUTOLINE
;
172 m_UnicodeType
= CFileTextLines::AUTOTYPE
;
179 if (PathIsDirectory(sFilePath
))
181 m_sErrorString
.Format(IDS_ERR_FILE_NOTAFILE
, (LPCTSTR
)sFilePath
);
185 if (!PathFileExists(sFilePath
))
187 //file does not exist, so just return SUCCESS
191 CAutoFile hFile
= CreateFile(sFilePath
, GENERIC_READ
, FILE_SHARE_READ
|FILE_SHARE_DELETE
|FILE_SHARE_WRITE
, NULL
, OPEN_EXISTING
, NULL
, NULL
);
199 if (!GetFileSizeEx(hFile
, &fsize
))
206 // file is way too big for us
207 m_sErrorString
.LoadString(IDS_ERR_FILE_TOOBIG
);
212 // If new[] was done for type T delete[] must be called on a pointer of type T*,
213 // otherwise the behavior is undefined.
214 // +1 is to address possible truncation when integer division is done
218 oFile
.SetLength(fsize
.LowPart
);
// an allocation failure is turned into an error text
220 catch (CMemoryException
* e
)
222 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
223 m_sErrorString
= exceptionError
;
228 DWORD dwReadBytes
= 0;
229 if (!ReadFile(hFile
, (void *)oFile
, fsize
.LowPart
, &dwReadBytes
, NULL
))
237 if (m_UnicodeType
== CFileTextLines::AUTOTYPE
)
239 m_UnicodeType
= this->CheckUnicodeType((LPVOID
)oFile
, dwReadBytes
);
240 // enforce conversion for all but ASCII and UTF8 type
241 m_bNeedsConversion
= (m_UnicodeType
!=CFileTextLines::UTF8
)&&(m_UnicodeType
!=CFileTextLines::ASCII
);
244 // we may have to convert the file content - CString is UTF16LE
// the filters are created with a NULL argument here; Save passes the output file instead
247 CBaseFilter
* pFilter
= NULL
;
248 switch (m_UnicodeType
)
251 m_sErrorString
.Format(IDS_ERR_FILE_BINARY
, (LPCTSTR
)sFilePath
);
255 pFilter
= new CUtf8Filter(NULL
);
259 pFilter
= new CAsciiFilter(NULL
);
262 pFilter
= new CUtf16beFilter(NULL
);
265 pFilter
= new CUtf16leFilter(NULL
);
268 pFilter
= new CUtf32beFilter(NULL
);
271 pFilter
= new CUtf32leFilter(NULL
);
274 pFilter
->Decode(oFile
);
277 catch (CMemoryException
* e
)
279 e
->GetErrorMessage(exceptionError
, _countof(exceptionError
));
280 m_sErrorString
= exceptionError
;
// from here on the buffer is treated as an array of wchar_t
284 int nReadChars
=oFile
.GetLength()/sizeof(wchar_t);
285 wchar_t * pTextBuf
= (wchar_t *)oFile
;
286 wchar_t * pLineStart
= pTextBuf
;
287 if ((m_UnicodeType
== UTF8BOM
)
288 || (m_UnicodeType
== UTF16_LE
)
289 || (m_UnicodeType
== UTF16_BE
)
290 || (m_UnicodeType
== UTF32_LE
)
291 || (m_UnicodeType
== UTF32_BE
))
299 // fill in the lines into the array
300 size_t countEOLs
[EOL__COUNT
];
301 memset(countEOLs
, 0, sizeof(countEOLs
));
302 CFileTextLine oTextLine
;
// i counts the characters that are still left to look at
303 for (int i
= nReadChars
; i
; --i
)
309 // crlf line ending or cr line ending
310 eEol
= ((i
> 1) && *(pTextBuf
) == '\n') ? EOL_CRLF
: EOL_CR
;
313 // lfcr line ending or lf line ending
314 eEol
= ((i
> 1) && *(pTextBuf
) == '\r') ? EOL_LFCR
: EOL_LF
;
334 oTextLine
.sLine
= CString(pLineStart
, (int)(pTextBuf
-pLineStart
)-1);
335 oTextLine
.eEnding
= eEol
;
338 if (eEol
==EOL_CRLF
|| eEol
==EOL_LFCR
)
343 pLineStart
= pTextBuf
;
// whatever follows the last line ending is added as a line without ending
345 CString
line(pLineStart
, (int)(pTextBuf
-pLineStart
));
346 Add(line
, EOL_NOENDING
);
348 // some EOLs are not supported by the svn diff lib.
349 m_bNeedsConversion
|= (countEOLs
[EOL_CRLF
]!=0);
350 m_bNeedsConversion
|= (countEOLs
[EOL_FF
]!=0);
351 m_bNeedsConversion
|= (countEOLs
[EOL_VT
]!=0);
352 m_bNeedsConversion
|= (countEOLs
[EOL_NEL
]!=0);
353 m_bNeedsConversion
|= (countEOLs
[EOL_LS
]!=0);
354 m_bNeedsConversion
|= (countEOLs
[EOL_PS
]!=0);
// the line ending kind with the highest count wins
357 for (int nEol
= 0; nEol
<EOL__COUNT
; nEol
++)
359 if (eolmax
< countEOLs
[nEol
])
361 eolmax
= countEOLs
[nEol
];
362 m_LineEndings
= (EOL
)nEol
;
// Prepares one line for comparison by editing sLine in place.
//
// sLine               - line to modify
// dwIgnoreWhitespaces - selects the trimming mode: keep the whitespace, trim
//                       spaces and tabs on both ends, only at the start, or
//                       only at the end (the numeric value of each mode is
//                       not visible in this excerpt)
// blame               - presumably enables the 66 character cut below; the
//                       guarding condition is not visible here - confirm
369 void CFileTextLines::StripWhiteSpace(CString
& sLine
, DWORD dwIgnoreWhitespaces
, bool blame
)
// lines longer than 66 characters lose their first 66 characters
373 if (sLine
.GetLength() > 66)
374 sLine
= sLine
.Mid(66);
376 switch (dwIgnoreWhitespaces
)
379 // Compare whitespaces
383 // Ignore all whitespaces
// note: only leading and trailing spaces/tabs are removed, inner ones stay
384 sLine
.TrimLeft(_T(" \t"));
385 sLine
.TrimRight(_T(" \t"));
388 // Ignore leading whitespace
389 sLine
.TrimLeft(_T(" \t"));
392 // Ignore ending whitespace
393 sLine
.TrimRight(_T(" \t"));
402 - modify line - whitespaces, lowercase
404 - get cached encoded eol
// Writes all lines to the file sFilePath.
//
// sFilePath             - destination file; directories missing on the way
//                         are created one by one
// bSaveAsUTF8           - true: encode as UTF8 regardless of m_UnicodeType
// bUseSVNCompatibleEOLs - true: LFCR is written as CR and every ending kind
//                         without an encoding of its own is written as LF;
//                         false: each ending kind is written as itself
// dwIgnoreWhitespaces   - passed on to StripWhiteSpace for every line
// bIgnoreCase           - presumably guards the MakeLower call; the condition
//                         is not visible in this excerpt - confirm
// bBlame                - passed on to StripWhiteSpace for every line
//
// The method is const; error texts are written to m_sErrorString through a
// const_cast. An open failure is reported with IDS_ERR_FILE_OPEN and a
// CException raised while writing is turned into its error message.
// NOTE(review): the returned BOOL values, the condition around the BOM write
// and the release of pFilter are not visible in this excerpt.
407 BOOL
CFileTextLines::Save(const CString
& sFilePath
408 , bool bSaveAsUTF8
/*= false*/
409 , bool bUseSVNCompatibleEOLs
/*= false*/
410 , DWORD dwIgnoreWhitespaces
/*=0*/
411 , BOOL bIgnoreCase
/*= FALSE*/
412 , bool bBlame
/*= false*/) const
416 CString destPath
= sFilePath
;
417 // now make sure that the destination directory exists
// walks the path from one backslash to the next and creates each missing directory
419 while (destPath
.Find('\\', ind
)>=2)
421 if (!PathIsDirectory(destPath
.Left(destPath
.Find('\\', ind
))))
423 if (!CreateDirectory(destPath
.Left(destPath
.Find('\\', ind
)), NULL
))
426 ind
= destPath
.Find('\\', ind
)+1;
429 CStdioFile file
; // Hugely faster than CFile for big file writes - because it uses buffering
430 if (!file
.Open(sFilePath
, CFile::modeCreate
| CFile::modeWrite
| CFile::typeBinary
))
432 const_cast<CString
*>(&m_sErrorString
)->Format(IDS_ERR_FILE_OPEN
, (LPCTSTR
)sFilePath
);
436 CBaseFilter
* pFilter
= NULL
;
437 bool bSaveBom
= true;
// the output encoding is UTF8 on request, otherwise the type stored in m_UnicodeType
438 CFileTextLines::UnicodeType eUnicodeType
= bSaveAsUTF8
? CFileTextLines::UTF8
: m_UnicodeType
;
439 switch (eUnicodeType
)
442 case CFileTextLines::ASCII
:
444 pFilter
= new CAsciiFilter(&file
);
446 case CFileTextLines::UTF8
:
448 case CFileTextLines::UTF8BOM
:
449 pFilter
= new CUtf8Filter(&file
);
451 case CFileTextLines::UTF16_BE
:
452 pFilter
= new CUtf16beFilter(&file
);
454 case CFileTextLines::UTF16_LE
:
455 pFilter
= new CUtf16leFilter(&file
);
457 case CFileTextLines::UTF32_BE
:
458 pFilter
= new CUtf32beFilter(&file
);
460 case CFileTextLines::UTF32_LE
:
461 pFilter
= new CUtf32leFilter(&file
);
467 //first write the BOM
468 pFilter
->Write(L
"\xfeff");
// every line ending kind is encoded once up front and reused for each line
471 CBuffer oEncodedEol
[EOL__COUNT
];
472 oEncodedEol
[EOL_LF
] = pFilter
->Encode(_T("\n")); // x0a
473 oEncodedEol
[EOL_CR
] = pFilter
->Encode(_T("\r")); // x0d
474 oEncodedEol
[EOL_CRLF
] = pFilter
->Encode(_T("\r\n")); // x0d x0a
475 if (bUseSVNCompatibleEOLs
)
477 // when using EOLs that are supported by the svn lib,
478 // we have to use the same EOLs as the file has in case
479 // they're already supported, but a different supported one
480 // in case the original one isn't supported.
481 // Only this way the option "ignore EOLs (recommended)" unchecked
482 // actually shows the lines as different.
483 // However, the diff won't find any differences in EOLs
484 // for these special EOLs if they differ between those special ones
486 // But it will work properly for the most common EOLs LF/CR/CRLF.
487 oEncodedEol
[EOL_LFCR
] = oEncodedEol
[EOL_CR
];
488 for (int nEol
= 0; nEol
<EOL_NOENDING
; nEol
++)
490 if (oEncodedEol
[nEol
].IsEmpty())
491 oEncodedEol
[nEol
] = oEncodedEol
[EOL_LF
];
496 oEncodedEol
[EOL_LFCR
] = pFilter
->Encode(_T("\n\r"));
497 oEncodedEol
[EOL_VT
] = pFilter
->Encode(_T("\v")); // x0b
498 oEncodedEol
[EOL_FF
] = pFilter
->Encode(_T("\f")); // x0c
499 oEncodedEol
[EOL_NEL
] = pFilter
->Encode(_T("\x85"));
500 oEncodedEol
[EOL_LS
] = pFilter
->Encode(_T("\x2028"));
501 oEncodedEol
[EOL_PS
] = pFilter
->Encode(_T("\x2029"));
// lines marked EOL_AUTOLINE get the file's line ending, or CRLF when that is undetermined too
503 oEncodedEol
[EOL_AUTOLINE
] = oEncodedEol
[m_LineEndings
==EOL_AUTOLINE
? EOL_CRLF
: m_LineEndings
];
// each line is copied, normalized and written, followed by its encoded line ending
505 for (int i
=0; i
<GetCount(); i
++)
507 CString sLineT
= GetAt(i
);
508 StripWhiteSpace(sLineT
, dwIgnoreWhitespaces
, bBlame
);
510 sLineT
= sLineT
.MakeLower();
511 pFilter
->Write(sLineT
);
512 EOL eEol
= GetLineEnding(i
);
513 pFilter
->Write(oEncodedEol
[eEol
]);
518 catch (CException
* e
)
520 CString
* psErrorString
= const_cast<CString
*>(&m_sErrorString
);
521 e
->GetErrorMessage(psErrorString
->GetBuffer(4096), 4096);
522 psErrorString
->ReleaseBuffer();
529 void CFileTextLines::SetErrorString()
531 m_sErrorString
= CFormatMessageWrapper();
534 void CFileTextLines::CopySettings(CFileTextLines
* pFileToCopySettingsTo
)
536 if (pFileToCopySettingsTo
)
538 pFileToCopySettingsTo
->m_UnicodeType
= m_UnicodeType
;
539 pFileToCopySettingsTo
->m_LineEndings
= m_LineEndings
;
545 void CBuffer::ExpandToAtLeast(int nNewSize
)
547 if (nNewSize
>m_nAllocated
)
549 delete [] m_pBuffer
; // we don't preserve buffer content intentionally
552 m_pBuffer
=new BYTE
[nNewSize
];
553 m_nAllocated
=nNewSize
;
557 void CBuffer::SetLength(int nUsed
)
559 ExpandToAtLeast(nUsed
);
563 void CBuffer::Swap(CBuffer
& Src
)
565 std::swap(Src
.m_nAllocated
, m_nAllocated
);
566 std::swap(Src
.m_pBuffer
, m_pBuffer
);
567 std::swap(Src
.m_nUsed
, m_nUsed
);
570 void CBuffer::Copy(const CBuffer
& Src
)
574 SetLength(Src
.m_nUsed
);
575 memcpy(m_pBuffer
, Src
.m_pBuffer
, m_nUsed
);
581 bool CBaseFilter::Decode(/*in out*/ CBuffer
& data
)
583 int nFlags
= (m_nCodePage
==CP_ACP
) ? MB_PRECOMPOSED
: 0;
584 // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
585 int nReadChars
= MultiByteToWideChar(m_nCodePage
, nFlags
, (LPCSTR
)data
, data
.GetLength(), NULL
, 0);
586 m_oBuffer
.SetLength(nReadChars
*sizeof(wchar_t));
587 int ret2
= MultiByteToWideChar(m_nCodePage
, nFlags
, (LPCSTR
)data
, data
.GetLength(), (LPWSTR
)(void *)m_oBuffer
, nReadChars
);
588 if (ret2
!= nReadChars
)
592 data
.Swap(m_oBuffer
);
596 const CBuffer
& CBaseFilter::Encode(const CString s
)
598 m_oBuffer
.SetLength(s
.GetLength()*3+1); // set buffer to guessed max size
599 int nConvertedLen
= WideCharToMultiByte(m_nCodePage
, 0, (LPCTSTR
)s
, s
.GetLength(), (LPSTR
)m_oBuffer
, m_oBuffer
.GetLength(), NULL
, NULL
);
600 m_oBuffer
.SetLength(nConvertedLen
); // set buffer to used size
606 bool CUtf16leFilter::Decode(/*in out*/ CBuffer
& /*data*/)
608 // we believe data is ok for use
612 const CBuffer
& CUtf16leFilter::Encode(const CString s
)
614 int nNeedBytes
= s
.GetLength()*sizeof(TCHAR
);
615 m_oBuffer
.SetLength(nNeedBytes
);
616 memcpy((void *)m_oBuffer
, (LPCTSTR
)s
, nNeedBytes
);
622 bool CUtf16beFilter::Decode(/*in out*/ CBuffer
& data
)
624 int nNeedBytes
= data
.GetLength();
625 // make in place WORD BYTEs swap
626 UINT64
* p_qw
= (UINT64
*)(void *)data
;
627 int nQwords
= nNeedBytes
/8;
628 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
630 p_qw
[nQword
] = WordSwapBytes(p_qw
[nQword
]);
632 wchar_t * p_w
= (wchar_t *)p_qw
;
633 int nWords
= nNeedBytes
/2;
634 for (int nWord
= nQwords
*4; nWord
<nWords
; nWord
++)
636 p_w
[nWord
] = WideCharSwap(p_w
[nWord
]);
638 return CUtf16leFilter::Decode(data
);
641 const CBuffer
& CUtf16beFilter::Encode(const CString s
)
643 int nNeedBytes
= s
.GetLength()*sizeof(TCHAR
);
644 m_oBuffer
.SetLength(nNeedBytes
);
645 // copy swaping BYTE order in WORDs
646 const UINT64
* p_qwIn
= (const UINT64
*)(LPCTSTR
)s
;
647 UINT64
* p_qwOut
= (UINT64
*)(void *)m_oBuffer
;
648 int nQwords
= nNeedBytes
/8;
649 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
651 p_qwOut
[nQword
] = WordSwapBytes(p_qwIn
[nQword
]);
653 wchar_t * p_wIn
= (wchar_t *)p_qwIn
;
654 wchar_t * p_wOut
= (wchar_t *)p_qwOut
;
655 int nWords
= nNeedBytes
/2;
656 for (int nWord
= nQwords
*4; nWord
<nWords
; nWord
++)
658 p_wOut
[nWord
] = WideCharSwap(p_wIn
[nWord
]);
665 bool CUtf32leFilter::Decode(/*in out*/ CBuffer
& data
)
667 // UTF32 have four bytes per char
668 int nReadChars
= data
.GetLength()/4;
669 UINT32
* p32
= (UINT32
*)(void *)data
;
671 // count chars which needs surrogate pair
672 int nSurrogatePairCount
= 0;
673 for (int i
= 0; i
<nReadChars
; ++i
)
675 if (p32
[i
]<0x110000 && p32
[i
]>=0x10000)
677 ++nSurrogatePairCount
;
682 m_oBuffer
.SetLength((nReadChars
+nSurrogatePairCount
)*sizeof(wchar_t));
683 wchar_t * pOut
= (wchar_t *)m_oBuffer
;
684 for (int i
= 0; i
<nReadChars
; ++i
, ++pOut
)
686 UINT32 zChar
= p32
[i
];
689 *pOut
=0xfffd; // ? mark
691 else if (zChar
>=0x10000)
694 pOut
[0] = ((zChar
>>10)&0x3ff) | 0xd800; // lead surrogate
695 pOut
[1] = (zChar
&0x7ff) | 0xdc00; // trail surrogate
700 *pOut
= (wchar_t)zChar
;
703 data
.Swap(m_oBuffer
);
707 const CBuffer
& CUtf32leFilter::Encode(const CString s
)
709 int nInWords
= s
.GetLength();
710 m_oBuffer
.SetLength(nInWords
*2);
712 LPCTSTR p_In
= (LPCTSTR
)s
;
713 UINT32
* p_Out
= (UINT32
*)(void *)m_oBuffer
;
715 for (int nInWord
= 0; nInWord
<nInWords
; nInWord
++, nOutDword
++)
717 UINT32 zChar
= p_In
[nInWord
];
718 if ((zChar
&0xfc00) == 0xd800) // lead surrogate
720 if (nInWord
+1<nInWords
&& (p_In
[nInWord
+1]&0xfc00) == 0xdc00) // trail surrogate follows
722 zChar
= 0x10000 + ((zChar
&0x3ff)<<10) + (p_In
[++nInWord
]&0x3ff);
726 zChar
= 0xfffd; // ? mark
729 else if ((zChar
&0xfc00) == 0xdc00) // trail surrogate without lead
731 zChar
= 0xfffd; // ? mark
733 p_Out
[nOutDword
] = zChar
;
735 m_oBuffer
.SetLength(nOutDword
*4); // store length reduced by surrogates
741 bool CUtf32beFilter::Decode(/*in out*/ CBuffer
& data
)
744 // swap BYTEs order in DWORDs
745 UINT64
* p64
= (UINT64
*)(void *)data
;
746 int nQwords
= data
.GetLength()/8;
747 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
749 p64
[nQword
] = DwordSwapBytes(p64
[nQword
]);
752 UINT32
* p32
= (UINT32
*)p64
;
753 int nDwords
= data
.GetLength()/4;
754 for (int nDword
= nQwords
*2; nDword
<nDwords
; nDword
++)
756 p32
[nDword
] = DwordSwapBytes(p32
[nDword
]);
758 return CUtf32leFilter::Decode(data
);
// Encodes s as UTF-32BE: the little endian encoder fills m_oBuffer, then the
// bytes of every 32-bit value in m_oBuffer are reversed - two values per
// 64-bit step first, followed by any value left over after the last
// complete 64-bit group.
761 const CBuffer
& CUtf32beFilter::Encode(const CString s
)
763 CUtf32leFilter::Encode(s
);
765 // swap BYTEs order in DWORDs
766 UINT64
* p64
= (UINT64
*)(void *)m_oBuffer
;
767 int nQwords
= m_oBuffer
.GetLength()/8;
768 for (int nQword
= 0; nQword
<nQwords
; nQword
++)
770 p64
[nQword
] = DwordSwapBytes(p64
[nQword
]);
// leftover DWORD, if the length is not a multiple of eight bytes
773 UINT32
* p32
= (UINT32
*)p64
;
774 int nDwords
= m_oBuffer
.GetLength()/4;
775 for (int nDword
= nQwords
*2; nDword
<nDwords
; nDword
++)
777 p32
[nDword
] = DwordSwapBytes(p32
[nDword
]);