src/TortoiseMerge/FileTextLines.cpp

   1 // TortoiseGitMerge - a Diff/Patch program
   2
   3 // Copyright (C) 2007-2013 - TortoiseSVN
   4
   5 // This program is free software; you can redistribute it and/or
   6 // modify it under the terms of the GNU General Public License
   7 // as published by the Free Software Foundation; either version 2
   8 // of the License, or (at your option) any later version.
   9
  10 // This program is distributed in the hope that it will be useful,
  11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 // GNU General Public License for more details.
  14
  15 // You should have received a copy of the GNU General Public License
  16 // along with this program; if not, write to the Free Software Foundation,
  17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 #include "stdafx.h"
  20 #include "resource.h"
  21 #include "UnicodeUtils.h"
  22 #include "registry.h"
  23 #include "filetextlines.h"
  24 #include "FormatMessageWrapper.h"
  25 #include "SmartHandle.h"
  26
  27 wchar_t inline WideCharSwap(wchar_t nValue)
  28 {
  29         return (((nValue>> 8)) | (nValue << 8));
  30         //return _byteswap_ushort(nValue);
  31 }
  32
  33 UINT64 inline WordSwapBytes(UINT64 nValue)
  34 {
  35         return ((nValue&0xff00ff00ff00ff)<<8) | ((nValue>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  36 }
  37
  38 UINT32 inline DwordSwapBytes(UINT32 nValue)
  39 {
  40         UINT32 nRet = (nValue<<16) | (nValue>>16); // swap WORDs
  41         nRet = ((nRet&0xff00ff)<<8) | ((nRet>>8)&0xff00ff); // swap BYTESs in WORDs
  42         return nRet;
  43         //return _byteswap_ulong(nValue);
  44 }
  45
  46 UINT64 inline DwordSwapBytes(UINT64 nValue)
  47 {
  48         UINT64 nRet = ((nValue&0xffff0000ffffL)<<16) | ((nValue>>16)&0xffff0000ffffL); // swap WORDs in DWORDs
  49         nRet = ((nRet&0xff00ff00ff00ff)<<8) | ((nRet>>8)&0xff00ff00ff00ff); // swap BYTESs in WORDs
  50         return nRet;
  51 }
  52
  53 CFileTextLines::CFileTextLines(void)
  54         : m_UnicodeType(CFileTextLines::AUTOTYPE)
  55         , m_LineEndings(EOL_AUTOLINE)
  56         , m_bNeedsConversion(false)
  57 {
  58 }
  59
  60 CFileTextLines::~CFileTextLines(void)
  61 {
  62 }
  63
  64 CFileTextLines::UnicodeType CFileTextLines::CheckUnicodeType(LPVOID pBuffer, int cb)
  65 {
  66         if (cb < 2)
  67                 return CFileTextLines::ASCII;
  68         const UINT32 * const pVal32 = (UINT32 *)pBuffer;
  69         const UINT16 * const pVal16 = (UINT16 *)pBuffer;
  70         const UINT8 * const pVal8 = (UINT8 *)pBuffer;
  71         // scan the whole buffer for a 0x00000000 sequence
  72         // if found, we assume a binary file
  73         int nDwords = cb/4;
  74         for (int i=0; i<nDwords; ++i)
  75         {
  76                 if (0x00000000 == pVal32[i])
  77                         return CFileTextLines::BINARY;
  78         }
  79         if (cb >=4 )
  80         {
  81                 if (*pVal32 == 0x0000FEFF)
  82                 {
  83                         return CFileTextLines::UTF32_LE;
  84                 }
  85                 if (*pVal32 == 0xFFFE0000)
  86                 {
  87                         return CFileTextLines::UTF32_BE;
  88                 }
  89         }
  90         if (*pVal16 == 0xFEFF)
  91         {
  92                 return CFileTextLines::UTF16_LE;
  93         }
  94         if (*pVal16 == 0xFFFE)
  95         {
  96                 return CFileTextLines::UTF16_BE;
  97         }
  98         if (cb < 3)
  99                 return CFileTextLines::ASCII;
 100         if (*pVal16 == 0xBBEF)
 101         {
 102                 if (pVal8[2] == 0xBF)
 103                         return CFileTextLines::UTF8BOM;
 104         }
 105         // check for illegal UTF8 sequences
 106         bool bNonANSI = false;
 107         int nNeedData = 0;
 108         int i=0;
 109         // run fast for ascii
 110         for (; i<cb; i+=8)
 111         {
 112                 if ((*(UINT64 *)&pVal8[i] & 0x8080808080808080)!=0) // all Ascii?
 113                 {
 114                         bNonANSI = true;
 115                         break;
 116                 }
 117         }
 118         // continue slow
 119         for (; i<cb; ++i)
 120         {
 121                 UINT8 zChar = pVal8[i];
 122                 if ((zChar & 0x80)==0) // Ascii
 123                 {
 124                         if (nNeedData)
 125                         {
 126                                 return CFileTextLines::ASCII;
 127                         }
 128                         continue;
 129                 }
 130                 if ((zChar & 0x40)==0) // top bit
 131                 {
 132                         if (!nNeedData)
 133                                 return CFileTextLines::ASCII;
 134                         --nNeedData;
 135                 }
 136                 else if (nNeedData)
 137                 {
 138                         return CFileTextLines::ASCII;
 139                 }
 140                 else if ((zChar & 0x20)==0) // top two bits
 141                 {
 142                         if (zChar<=0xC1)
 143                                 return CFileTextLines::ASCII;
 144                         nNeedData = 1;
 145                 }
 146                 else if ((zChar & 0x10)==0) // top three bits
 147                 {
 148                         nNeedData = 2;
 149                 }
 150                 else if ((zChar & 0x08)==0) // top four bits
 151                 {
 152                         if (zChar>=0xf5)
 153                                 return CFileTextLines::ASCII;
 154                         nNeedData = 3;
 155                 }
 156                 else
 157                         return CFileTextLines::ASCII;
 158         }
 159         if (bNonANSI && nNeedData==0)
 160                 // if get here thru nonAscii and no missing data left then its valid UTF8
 161                 return CFileTextLines::UTF8;
 162         if ((!bNonANSI)&&(DWORD(CRegDWORD(_T("Software\\TortoiseGitMerge\\UseUTF8"), FALSE))))
 163                 return CFileTextLines::UTF8;
 164         return CFileTextLines::ASCII;
 165 }
 166
 167
 168 BOOL CFileTextLines::Load(const CString& sFilePath, int lengthHint /* = 0*/)
 169 {
 170         WCHAR exceptionError[1000] = {0};
 171         m_LineEndings = EOL_AUTOLINE;
 172         m_UnicodeType = CFileTextLines::AUTOTYPE;
 173         RemoveAll();
 174         if(lengthHint != 0)
 175         {
 176                 Reserve(lengthHint);
 177         }
 178
 179         if (PathIsDirectory(sFilePath))
 180         {
 181                 m_sErrorString.Format(IDS_ERR_FILE_NOTAFILE, (LPCTSTR)sFilePath);
 182                 return FALSE;
 183         }
 184
 185         if (!PathFileExists(sFilePath))
 186         {
 187                 //file does not exist, so just return SUCCESS
 188                 return TRUE;
 189         }
 190
 191         CAutoFile hFile = CreateFile(sFilePath, GENERIC_READ, FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE, NULL, OPEN_EXISTING, NULL, NULL);
 192         if (!hFile)
 193         {
 194                 SetErrorString();
 195                 return FALSE;
 196         }
 197
 198         LARGE_INTEGER fsize;
 199         if (!GetFileSizeEx(hFile, &fsize))
 200         {
 201                 SetErrorString();
 202                 return FALSE;
 203         }
 204         if (fsize.HighPart)
 205         {
 206                 // file is way too big for us
 207                 m_sErrorString.LoadString(IDS_ERR_FILE_TOOBIG);
 208                 return FALSE;
 209         }
 210
 211         // create buffer
 212         // If new[] was done for type T delete[] must be called on a pointer of type T*,
 213         // otherwise the behavior is undefined.
 214         // +1 is to address possible truncation when integer division is done
 215         CBuffer oFile;
 216         try
 217         {
 218                 oFile.SetLength(fsize.LowPart);
 219         }
 220         catch (CMemoryException* e)
 221         {
 222                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 223                 m_sErrorString = exceptionError;
 224                 return FALSE;
 225         }
 226
 227         // load file
 228         DWORD dwReadBytes = 0;
 229         if (!ReadFile(hFile, (void *)oFile, fsize.LowPart, &dwReadBytes, NULL))
 230         {
 231                 SetErrorString();
 232                 return FALSE;
 233         }
 234         hFile.CloseHandle();
 235
 236         // detect type
 237         if (m_UnicodeType == CFileTextLines::AUTOTYPE)
 238         {
 239                 m_UnicodeType = this->CheckUnicodeType((LPVOID)oFile, dwReadBytes);
 240                 // enforce conversion for all but ASCII and UTF8 type
 241                 m_bNeedsConversion = (m_UnicodeType!=CFileTextLines::UTF8)&&(m_UnicodeType!=CFileTextLines::ASCII);
 242         }
 243
 244         // we may have to convert the file content - CString is UTF16LE
 245         try
 246         {
 247                 CBaseFilter * pFilter = NULL;
 248                 switch (m_UnicodeType)
 249                 {
 250                 case BINARY:
 251                         m_sErrorString.Format(IDS_ERR_FILE_BINARY, (LPCTSTR)sFilePath);
 252                         return FALSE;
 253                 case UTF8:
 254                 case UTF8BOM:
 255                         pFilter = new CUtf8Filter(NULL);
 256                         break;
 257                 default:
 258                 case ASCII:
 259                         pFilter = new CAsciiFilter(NULL);
 260                         break;
 261                 case UTF16_BE:
 262                         pFilter = new CUtf16beFilter(NULL);
 263                         break;
 264                 case UTF16_LE:
 265                         pFilter = new CUtf16leFilter(NULL);
 266                         break;
 267                 case UTF32_BE:
 268                         pFilter = new CUtf32beFilter(NULL);
 269                         break;
 270                 case UTF32_LE:
 271                         pFilter = new CUtf32leFilter(NULL);
 272                         break;
 273                 }
 274                 pFilter->Decode(oFile);
 275                 delete pFilter;
 276         }
 277         catch (CMemoryException* e)
 278         {
 279                 e->GetErrorMessage(exceptionError, _countof(exceptionError));
 280                 m_sErrorString = exceptionError;
 281                 return FALSE;
 282         }
 283
 284         int nReadChars=oFile.GetLength()/sizeof(wchar_t);
 285         wchar_t * pTextBuf = (wchar_t *)oFile;
 286         wchar_t * pLineStart = pTextBuf;
 287         if ((m_UnicodeType == UTF8BOM)
 288                 || (m_UnicodeType == UTF16_LE)
 289                 || (m_UnicodeType == UTF16_BE)
 290                 || (m_UnicodeType == UTF32_LE)
 291                 || (m_UnicodeType == UTF32_BE))
 292         {
 293                 // ignore the BOM
 294                 ++pTextBuf;
 295                 ++pLineStart;
 296                 --nReadChars;
 297         }
 298
 299         // fill in the lines into the array
 300         size_t countEOLs[EOL__COUNT];
 301         memset(countEOLs, 0, sizeof(countEOLs));
 302         CFileTextLine oTextLine;
 303         for (int i = nReadChars; i; --i)
 304         {
 305                 EOL eEol;
 306                 switch (*pTextBuf++)
 307                 {
 308                 case '\r':
 309                         // crlf line ending or cr line ending
 310                         eEol = ((i > 1) && *(pTextBuf) == '\n') ? EOL_CRLF : EOL_CR;
 311                         break;
 312                 case '\n':
 313                         // lfcr line ending or lf line ending
 314                         eEol = ((i > 1) && *(pTextBuf) == '\r') ? EOL_LFCR : EOL_LF;
 315                         if (eEol == EOL_LFCR)
 316                         {
 317                                 // LFCR is very rare on Windows, so we have to double check
 318                                 // that this is not just a LF followed by CRLF
 319                                 if (((countEOLs[EOL_CRLF] > 1) || (countEOLs[EOL_LF]>1)) &&
 320                                         ((i > 2) && (*(pTextBuf+1) == '\n')))
 321                                 {
 322                                         // change the EOL back to a simple LF
 323                                         eEol = EOL_LF;
 324                                 }
 325                         }
 326                         break;
 327                 case 0x000b:
 328                         eEol = EOL_VT;
 329                         break;
 330                 case 0x000c:
 331                         eEol = EOL_FF;
 332                         break;
 333                 case 0x0085:
 334                         eEol = EOL_NEL;
 335                         break;
 336                 case 0x2028:
 337                         eEol = EOL_LS;
 338                         break;
 339                 case 0x2029:
 340                         eEol = EOL_PS;
 341                         break;
 342                 default:
 343                         continue;
 344                 }
 345                 oTextLine.sLine = CString(pLineStart, (int)(pTextBuf-pLineStart)-1);
 346                 oTextLine.eEnding = eEol;
 347                 Add(oTextLine);
 348                 ++countEOLs[eEol];
 349                 if (eEol==EOL_CRLF || eEol==EOL_LFCR)
 350                 {
 351                         ++pTextBuf;
 352                         --i;
 353                 }
 354                 pLineStart = pTextBuf;
 355         }
 356         CString line(pLineStart, (int)(pTextBuf-pLineStart));
 357         Add(line, EOL_NOENDING);
 358
 359         // some EOLs are not supported by the svn diff lib.
 360         m_bNeedsConversion |= (countEOLs[EOL_CRLF]!=0);
 361         m_bNeedsConversion |= (countEOLs[EOL_FF]!=0);
 362         m_bNeedsConversion |= (countEOLs[EOL_VT]!=0);
 363         m_bNeedsConversion |= (countEOLs[EOL_NEL]!=0);
 364         m_bNeedsConversion |= (countEOLs[EOL_LS]!=0);
 365         m_bNeedsConversion |= (countEOLs[EOL_PS]!=0);
 366
 367         size_t eolmax = 0;
 368         for (int nEol = 0; nEol<EOL__COUNT; nEol++)
 369         {
 370                 if (eolmax < countEOLs[nEol])
 371                 {
 372                         eolmax = countEOLs[nEol];
 373                         m_LineEndings = (EOL)nEol;
 374                 }
 375         }
 376
 377         return TRUE;
 378 }
 379
 380 void CFileTextLines::StripWhiteSpace(CString& sLine, DWORD dwIgnoreWhitespaces, bool blame)
 381 {
 382         if (blame)
 383         {
 384                 if (sLine.GetLength() > 66)
 385                         sLine = sLine.Mid(66);
 386         }
 387         switch (dwIgnoreWhitespaces)
 388         {
 389         case 0:
 390                 // Compare whitespaces
 391                 // do nothing
 392                 break;
 393         case 1:
 394                 // Ignore all whitespaces
 395                 sLine.TrimLeft(_T(" \t"));
 396                 sLine.TrimRight(_T(" \t"));
 397                 break;
 398         case 2:
 399                 // Ignore leading whitespace
 400                 sLine.TrimLeft(_T(" \t"));
 401                 break;
 402         case 3:
 403                 // Ignore ending whitespace
 404                 sLine.TrimRight(_T(" \t"));
 405                 break;
 406         }
 407 }
 408
 409 /**
 410         Encoding pattern:
 411                 - encode & save BOM
 412                 - Get Line
 413                 - modify line - whitespaces, lowercase
 414                 - encode & save line
 415                 - get cached encoded eol
 416                 - save eol
 417 */
 418 BOOL CFileTextLines::Save(const CString& sFilePath
 419                                                  , bool bSaveAsUTF8 /*= false*/
 420                                                  , bool bUseSVNCompatibleEOLs /*= false*/
 421                                                  , DWORD dwIgnoreWhitespaces /*=0*/
 422                                                  , BOOL bIgnoreCase /*= FALSE*/
 423                                                  , bool bBlame /*= false*/) const
 424 {
 425         try
 426         {
 427                 CString destPath = sFilePath;
 428                 // now make sure that the destination directory exists
 429                 int ind = 0;
 430                 while (destPath.Find('\\', ind)>=2)
 431                 {
 432                         if (!PathIsDirectory(destPath.Left(destPath.Find('\\', ind))))
 433                         {
 434                                 if (!CreateDirectory(destPath.Left(destPath.Find('\\', ind)), NULL))
 435                                         return FALSE;
 436                         }
 437                         ind = destPath.Find('\\', ind)+1;
 438                 }
 439
 440                 CStdioFile file;                        // Hugely faster than CFile for big file writes - because it uses buffering
 441                 if (!file.Open(sFilePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary))
 442                 {
 443                         const_cast<CString *>(&m_sErrorString)->Format(IDS_ERR_FILE_OPEN, (LPCTSTR)sFilePath);
 444                         return FALSE;
 445                 }
 446
 447                 CBaseFilter * pFilter = NULL;
 448                 bool bSaveBom = true;
 449                 CFileTextLines::UnicodeType eUnicodeType = bSaveAsUTF8 ? CFileTextLines::UTF8 : m_UnicodeType;
 450                 switch (eUnicodeType)
 451                 {
 452                 default:
 453                 case CFileTextLines::ASCII:
 454                         bSaveBom = false;
 455                         pFilter = new CAsciiFilter(&file);
 456                         break;
 457                 case CFileTextLines::UTF8:
 458                         bSaveBom = false;
 459                 case CFileTextLines::UTF8BOM:
 460                         pFilter = new CUtf8Filter(&file);
 461                         break;
 462                 case CFileTextLines::UTF16_BE:
 463                         pFilter = new CUtf16beFilter(&file);
 464                         break;
 465                 case CFileTextLines::UTF16_LE:
 466                         pFilter = new CUtf16leFilter(&file);
 467                         break;
 468                 case CFileTextLines::UTF32_BE:
 469                         pFilter = new CUtf32beFilter(&file);
 470                         break;
 471                 case CFileTextLines::UTF32_LE:
 472                         pFilter = new CUtf32leFilter(&file);
 473                         break;
 474                 }
 475
 476                 if (bSaveBom)
 477                 {
 478                         //first write the BOM
 479                         pFilter->Write(L"\xfeff");
 480                 }
 481                 // cache EOLs
 482                 CBuffer oEncodedEol[EOL__COUNT];
 483                 oEncodedEol[EOL_LF] = pFilter->Encode(_T("\n")); // x0a
 484                 oEncodedEol[EOL_CR] = pFilter->Encode(_T("\r")); // x0d
 485                 oEncodedEol[EOL_CRLF] = pFilter->Encode(_T("\r\n")); // x0d x0a
 486                 if (bUseSVNCompatibleEOLs)
 487                 {
 488                         // when using EOLs that are supported by the svn lib,
 489                         // we have to use the same EOLs as the file has in case
 490                         // they're already supported, but a different supported one
 491                         // in case the original one isn't supported.
 492                         // Only this way the option "ignore EOLs (recommended)" unchecked
 493                         // actually shows the lines as different.
 494                         // However, the diff won't find and differences in EOLs
 495                         // for these special EOLs if they differ between those special ones
 496                         // listed below.
 497                         // But it will work properly for the most common EOLs LF/CR/CRLF.
 498                         oEncodedEol[EOL_LFCR] = oEncodedEol[EOL_CR];
 499                         for (int nEol = 0; nEol<EOL_NOENDING; nEol++)
 500                         {
 501                                 if (oEncodedEol[nEol].IsEmpty())
 502                                         oEncodedEol[nEol] = oEncodedEol[EOL_LF];
 503                         }
 504                 }
 505                 else
 506                 {
 507                         oEncodedEol[EOL_LFCR] = pFilter->Encode(_T("\n\r"));
 508                         oEncodedEol[EOL_VT] = pFilter->Encode(_T("\v")); // x0b
 509                         oEncodedEol[EOL_FF] = pFilter->Encode(_T("\f")); // x0c
 510                         oEncodedEol[EOL_NEL] = pFilter->Encode(_T("\x85"));
 511                         oEncodedEol[EOL_LS] = pFilter->Encode(_T("\x2028"));
 512                         oEncodedEol[EOL_PS] = pFilter->Encode(_T("\x2029"));
 513                 }
 514                 oEncodedEol[EOL_AUTOLINE] = oEncodedEol[m_LineEndings==EOL_AUTOLINE ? EOL_CRLF : m_LineEndings];
 515
 516                 for (int i=0; i<GetCount(); i++)
 517                 {
 518                         CString sLineT = GetAt(i);
 519                         StripWhiteSpace(sLineT, dwIgnoreWhitespaces, bBlame);
 520                         if (bIgnoreCase)
 521                                 sLineT = sLineT.MakeLower();
 522                         pFilter->Write(sLineT);
 523                         EOL eEol = GetLineEnding(i);
 524                         pFilter->Write(oEncodedEol[eEol]);
 525                 }
 526                 delete pFilter;
 527                 file.Close();
 528         }
 529         catch (CException * e)
 530         {
 531                 CString * psErrorString = const_cast<CString *>(&m_sErrorString);
 532                 e->GetErrorMessage(psErrorString->GetBuffer(4096), 4096);
 533                 psErrorString->ReleaseBuffer();
 534                 e->Delete();
 535                 return FALSE;
 536         }
 537         return TRUE;
 538 }
 539
// Stores the text of a default-constructed CFormatMessageWrapper in
// m_sErrorString. Load() calls this right after a failed Win32 call.
// NOTE(review): presumably the wrapper formats GetLastError() - confirm in
// FormatMessageWrapper.h.
void CFileTextLines::SetErrorString()
{
	m_sErrorString = CFormatMessageWrapper();
}
 544
 545 void CFileTextLines::CopySettings(CFileTextLines * pFileToCopySettingsTo)
 546 {
 547         if (pFileToCopySettingsTo)
 548         {
 549                 pFileToCopySettingsTo->m_UnicodeType = m_UnicodeType;
 550                 pFileToCopySettingsTo->m_LineEndings = m_LineEndings;
 551         }
 552 }
 553
 554
 555
 556 void CBuffer::ExpandToAtLeast(int nNewSize)
 557 {
 558         if (nNewSize>m_nAllocated)
 559         {
 560                 delete [] m_pBuffer; // we don't preserve buffer content intentionally
 561                 nNewSize+=2048-1;
 562                 nNewSize&=~(1024-1);
 563                 m_pBuffer=new BYTE[nNewSize];
 564                 m_nAllocated=nNewSize;
 565         }
 566 }
 567
// Makes sure at least nUsed bytes are allocated and records nUsed as the
// number of bytes in use. If the buffer has to grow for that, the old content
// is discarded (see ExpandToAtLeast); otherwise allocation and content stay
// as they are.
void CBuffer::SetLength(int nUsed)
{
	ExpandToAtLeast(nUsed);
	m_nUsed = nUsed;
}
 573
 574 void CBuffer::Swap(CBuffer & Src)
 575 {
 576         std::swap(Src.m_nAllocated, m_nAllocated);
 577         std::swap(Src.m_pBuffer, m_pBuffer);
 578         std::swap(Src.m_nUsed, m_nUsed);
 579 }
 580
 581 void CBuffer::Copy(const CBuffer & Src)
 582 {
 583         if (&Src != this)
 584         {
 585                 SetLength(Src.m_nUsed);
 586                 memcpy(m_pBuffer, Src.m_pBuffer, m_nUsed);
 587         }
 588 }
 589
 590
 591
 592 bool CBaseFilter::Decode(/*in out*/ CBuffer & data)
 593 {
 594         int nFlags = (m_nCodePage==CP_ACP) ? MB_PRECOMPOSED : 0;
 595         // dry decode is around 8 times faster then real one, alternatively we can set buffer to max length
 596         int nReadChars = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), NULL, 0);
 597         m_oBuffer.SetLength(nReadChars*sizeof(wchar_t));
 598         int ret2 = MultiByteToWideChar(m_nCodePage, nFlags, (LPCSTR)data, data.GetLength(), (LPWSTR)(void *)m_oBuffer, nReadChars);
 599         if (ret2 != nReadChars)
 600         {
 601                 return FALSE;
 602         }
 603         data.Swap(m_oBuffer);
 604         return TRUE;
 605 }
 606
 607 const CBuffer & CBaseFilter::Encode(const CString s)
 608 {
 609         m_oBuffer.SetLength(s.GetLength()*3+1); // set buffer to guessed max size
 610         int nConvertedLen = WideCharToMultiByte(m_nCodePage, 0, (LPCTSTR)s, s.GetLength(), (LPSTR)m_oBuffer, m_oBuffer.GetLength(), NULL, NULL);
 611         m_oBuffer.SetLength(nConvertedLen); // set buffer to used size
 612         return m_oBuffer;
 613 }
 614
 615
 616
 617 bool CUtf16leFilter::Decode(/*in out*/ CBuffer & /*data*/)
 618 {
 619         // we believe data is ok for use
 620         return TRUE;
 621 }
 622
 623 const CBuffer & CUtf16leFilter::Encode(const CString s)
 624 {
 625         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 626         m_oBuffer.SetLength(nNeedBytes);
 627         memcpy((void *)m_oBuffer, (LPCTSTR)s, nNeedBytes);
 628         return m_oBuffer;
 629 }
 630
 631
 632
 633 bool CUtf16beFilter::Decode(/*in out*/ CBuffer & data)
 634 {
 635         int nNeedBytes = data.GetLength();
 636         // make in place WORD BYTEs swap
 637         UINT64 * p_qw = (UINT64 *)(void *)data;
 638         int nQwords = nNeedBytes/8;
 639         for (int nQword = 0; nQword<nQwords; nQword++)
 640         {
 641                 p_qw[nQword] = WordSwapBytes(p_qw[nQword]);
 642         }
 643         wchar_t * p_w = (wchar_t *)p_qw;
 644         int nWords = nNeedBytes/2;
 645         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 646         {
 647                 p_w[nWord] = WideCharSwap(p_w[nWord]);
 648         }
 649         return CUtf16leFilter::Decode(data);
 650 }
 651
 652 const CBuffer & CUtf16beFilter::Encode(const CString s)
 653 {
 654         int nNeedBytes = s.GetLength()*sizeof(TCHAR);
 655         m_oBuffer.SetLength(nNeedBytes);
 656         // copy swaping BYTE order in WORDs
 657         const UINT64 * p_qwIn = (const UINT64 *)(LPCTSTR)s;
 658         UINT64 * p_qwOut = (UINT64 *)(void *)m_oBuffer;
 659         int nQwords = nNeedBytes/8;
 660         for (int nQword = 0; nQword<nQwords; nQword++)
 661         {
 662                 p_qwOut[nQword] = WordSwapBytes(p_qwIn[nQword]);
 663         }
 664         wchar_t * p_wIn = (wchar_t *)p_qwIn;
 665         wchar_t * p_wOut = (wchar_t *)p_qwOut;
 666         int nWords = nNeedBytes/2;
 667         for (int nWord = nQwords*4; nWord<nWords; nWord++)
 668         {
 669                 p_wOut[nWord] = WideCharSwap(p_wIn[nWord]);
 670         }
 671         return m_oBuffer;
 672 }
 673
 674
 675
 676 bool CUtf32leFilter::Decode(/*in out*/ CBuffer & data)
 677 {
 678         // UTF32 have four bytes per char
 679         int nReadChars = data.GetLength()/4;
 680         UINT32 * p32 = (UINT32 *)(void *)data;
 681
 682         // count chars which needs surrogate pair
 683         int nSurrogatePairCount = 0;
 684         for (int i = 0; i<nReadChars; ++i)
 685         {
 686                 if (p32[i]<0x110000 && p32[i]>=0x10000)
 687                 {
 688                         ++nSurrogatePairCount;
 689                 }
 690         }
 691
 692         // fill buffer
 693         m_oBuffer.SetLength((nReadChars+nSurrogatePairCount)*sizeof(wchar_t));
 694         wchar_t * pOut = (wchar_t *)m_oBuffer;
 695         for (int i = 0; i<nReadChars; ++i, ++pOut)
 696         {
 697                 UINT32 zChar = p32[i];
 698                 if (zChar>=0x110000)
 699                 {
 700                         *pOut=0xfffd; // ? mark
 701                 }
 702                 else if (zChar>=0x10000)
 703                 {
 704                         zChar-=0x10000;
 705                         pOut[0] = ((zChar>>10)&0x3ff) | 0xd800; // lead surrogate
 706                         pOut[1] = (zChar&0x7ff) | 0xdc00; // trail surrogate
 707                         pOut++;
 708                 }
 709                 else
 710                 {
 711                         *pOut = (wchar_t)zChar;
 712                 }
 713         }
 714         data.Swap(m_oBuffer);
 715         return TRUE;
 716 }
 717
 718 const CBuffer & CUtf32leFilter::Encode(const CString s)
 719 {
 720         int nInWords = s.GetLength();
 721         m_oBuffer.SetLength(nInWords*2);
 722
 723         LPCTSTR p_In = (LPCTSTR)s;
 724         UINT32 * p_Out = (UINT32 *)(void *)m_oBuffer;
 725         int nOutDword = 0;
 726         for (int nInWord = 0; nInWord<nInWords; nInWord++, nOutDword++)
 727         {
 728                 UINT32 zChar = p_In[nInWord];
 729                 if ((zChar&0xfc00) == 0xd800) // lead surrogate
 730                 {
 731                         if (nInWord+1<nInWords && (p_In[nInWord+1]&0xfc00) == 0xdc00) // trail surrogate follows
 732                         {
 733                                 zChar = 0x10000 + ((zChar&0x3ff)<<10) + (p_In[++nInWord]&0x3ff);
 734                         }
 735                         else
 736                         {
 737                                 zChar = 0xfffd; // ? mark
 738                         }
 739                 }
 740                 else if ((zChar&0xfc00) == 0xdc00) // trail surrogate without lead
 741                 {
 742                         zChar = 0xfffd; // ? mark
 743                 }
 744                 p_Out[nOutDword] = zChar;
 745         }
 746         m_oBuffer.SetLength(nOutDword*4); // store length reduced by surrogates
 747         return m_oBuffer;
 748 }
 749
 750
 751
 752 bool CUtf32beFilter::Decode(/*in out*/ CBuffer & data)
 753 {
 754
 755         // swap BYTEs order in DWORDs
 756         UINT64 * p64 = (UINT64 *)(void *)data;
 757         int nQwords = data.GetLength()/8;
 758         for (int nQword = 0; nQword<nQwords; nQword++)
 759         {
 760                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 761         }
 762
 763         UINT32 * p32 = (UINT32 *)p64;
 764         int nDwords = data.GetLength()/4;
 765         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 766         {
 767                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 768         }
 769         return CUtf32leFilter::Decode(data);
 770 }
 771
 772 const CBuffer & CUtf32beFilter::Encode(const CString s)
 773 {
 774         CUtf32leFilter::Encode(s);
 775
 776         // swap BYTEs order in DWORDs
 777         UINT64 * p64 = (UINT64 *)(void *)m_oBuffer;
 778         int nQwords = m_oBuffer.GetLength()/8;
 779         for (int nQword = 0; nQword<nQwords; nQword++)
 780         {
 781                 p64[nQword] = DwordSwapBytes(p64[nQword]);
 782         }
 783
 784         UINT32 * p32 = (UINT32 *)p64;
 785         int nDwords = m_oBuffer.GetLength()/4;
 786         for (int nDword = nQwords*2; nDword<nDwords; nDword++)
 787         {
 788                 p32[nDword] = DwordSwapBytes(p32[nDword]);
 789         }
 790         return m_oBuffer;
 791 }
 792