From fde14dafcddc1bdb474cc38ac62d4e592326b919 Mon Sep 17 00:00:00 2001 From: "yuzhuohuang@qq.com" Date: Wed, 17 Oct 2012 23:24:08 +0800 Subject: [PATCH] Merge with MPC-HC 6d1472b2f18266d92e5bc068667de348c0cd3b3b. --- include/Utf8.h | 57 +++++ src/subtitles/STS.cpp | 4 +- src/subtitles/TextFile.cpp | 298 ++++++++++++++++++------- src/subtitles/TextFile.h | 13 +- src/subtitles/subtitles_vs2010.vcxproj | 36 +-- src/subtitles/subtitles_vs2010.vcxproj.filters | 6 +- 6 files changed, 302 insertions(+), 112 deletions(-) create mode 100644 include/Utf8.h diff --git a/include/Utf8.h b/include/Utf8.h new file mode 100644 index 0000000..eed48df --- /dev/null +++ b/include/Utf8.h @@ -0,0 +1,57 @@ +// Simple functions to test UTF-8 characters. +// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca +// First version 2010-08 +// +// Written for notepad++, and distributed under same license: +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either +// version 2 of the License, or (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +#pragma once + +namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members + // basic classification of UTF-8 bytes + inline static bool isSingleByte(unsigned char c) { return c < 0x80; } + inline static bool isPartOfMultibyte(unsigned char c) { return c >= 0x80; } + inline static bool isFirstOfMultibyte(unsigned char c) { return c >= 0xC2 && c < 0xF5; } // 0xF5 to 0xFD are defined by UTF-8, but are not currently valid Unicode + inline static bool isContinuation(unsigned char c) { return (c & 0xC0) == 0x80; } + inline static bool isValid(unsigned char c) { return c < 0xC0 || isFirstOfMultibyte(c); } // validates a byte, out of context + + // number of continuation bytes for a given valid first character (0 for single byte characters) + inline static int continuationBytes(unsigned char c) { + static const char _len[] = { 1,1,2,3 }; + return (c < 0xC0) ? 0 : _len[(c & 0x30) >> 4]; + } + + // validates a full character + inline static bool isValid(const unsigned char* buf, int buflen) { + if(isSingleByte(buf[0])) return true; // single byte is valid + if(!isFirstOfMultibyte(buf[0])) return false; // not single byte, nor valid multi-byte first byte + int charContinuationBytes = continuationBytes(buf[0]); + if(buflen < charContinuationBytes+1) return false; // character does not fit in buffer + for(int i = charContinuationBytes; i>0; --i) + if(!isContinuation(*(++buf))) return false; // not enough continuation bytes + return true; // the character is valid (if there are too many continuation bytes, it is the next character that will be invalid) + } + + // rewinds to the first byte of a multi-byte character for any valid UTF-8 (and will not rewind too much on any other input) + inline static int characterStart(const unsigned char* buf, int startingIndex) { + int charContinuationBytes = 0; + while(charContinuationBytes < startingIndex // rewind past start of buffer? + && charContinuationBytes < 5 // UTF-8 support up to 5 continuation bytes (but valid sequences currently do not have more than 3) + && isContinuation(buf[startingIndex-charContinuationBytes]) + ) + ++charContinuationBytes; + return startingIndex-charContinuationBytes; + } +}; diff --git a/src/subtitles/STS.cpp b/src/subtitles/STS.cpp index d4246e5..231ffef 100644 --- a/src/subtitles/STS.cpp +++ b/src/subtitles/STS.cpp @@ -2761,7 +2761,7 @@ bool CSimpleTextSubtitle::Open(CString fn, int CharSet, CString name) { Empty(); - CWebTextFile f; + CWebTextFile f(CTextFile::UTF8); if(!f.Open(fn)) return(false); fn.Replace('\\', '/'); @@ -2828,7 +2828,7 @@ bool CSimpleTextSubtitle::Open(CTextFile* f, int CharSet, CString name) m_encoding = f->GetEncoding(); m_path = f->GetFilePath(); - CWebTextFile f2; + CWebTextFile f2(CTextFile::UTF8); if(f2.Open(f->GetFilePath() + _T(".style"))) OpenSubStationAlpha(&f2, *this, CharSet); diff --git a/src/subtitles/TextFile.cpp b/src/subtitles/TextFile.cpp index 46683ac..4fd129e 100644 --- a/src/subtitles/TextFile.cpp +++ b/src/subtitles/TextFile.cpp @@ -23,17 +23,20 @@ #include #include #include "TextFile.h" +#include "Utf8.h" CTextFile::CTextFile(enc e) + : m_encoding(e) + , m_defaultencoding(e) + , m_offset(0) { - m_encoding = m_defaultencoding = e; - m_offset = 0; } bool CTextFile::Open(LPCTSTR lpszFileName) { - if(!__super::Open(lpszFileName, modeRead|typeBinary|shareDenyWrite)) - return(false); + if (!__super::Open(lpszFileName, modeRead | typeBinary | shareDenyNone)) { + return false; + } m_encoding = m_defaultencoding; m_offset = 0; @@ -68,14 +71,22 @@ bool CTextFile::Open(LPCTSTR lpszFileName) } } - if(m_encoding == m_defaultencoding) - { - __super::Close(); // CWebTextFile::Close() would delete the temp file if we called it... - if(!__super::Open(lpszFileName, modeRead|typeText|shareDenyWrite)) - return(false); - } + if (m_encoding == ASCII) { + if (!ReopenAsText()) { + return false; + } + } else if (m_offset == 0) { // No BOM detected, ensure the file is read from the beginning + Seek(0, begin); + } - return(true); + return true; +} + +bool CTextFile::ReopenAsText() +{ + __super::Close(); // CWebTextFile::Close() would delete the temp file if we called it... + + return __super::Open(m_strFileName, modeRead | typeText | shareDenyNone)==TRUE; } bool CTextFile::Save(LPCTSTR lpszFileName, enc e) @@ -152,7 +163,7 @@ ULONGLONG CTextFile::Seek(LONGLONG lOff, UINT nFrom) case end: lOff = len - lOff; break; } - lOff = max(min(lOff, len), 0) + m_offset; + lOff = max(min((ULONGLONG)lOff, len), 0) + m_offset; pos = CStdioFile::Seek(lOff, begin) - m_offset; @@ -203,8 +214,8 @@ void CTextFile::WriteString(LPCWSTR lpsz/*CStringW str*/) else if(m_encoding == UTF8) { str.Replace(L"\n", L"\r\n"); - for(int i = 0; i < str.GetLength(); i++) - { + for (unsigned int i = 0, l = str.GetLength(); i < l; i++) + { DWORD c = (WORD)str[i]; if(0 <= c && c < 0x80) // 0xxxxxxx @@ -239,10 +250,11 @@ void CTextFile::WriteString(LPCWSTR lpsz/*CStringW str*/) else if(m_encoding == BE16) { str.Replace(L"\n", L"\r\n"); - for(int i = 0; i < str.GetLength(); i++) - str.SetAt(i, ((str[i]>>8)&0x00ff)|((str[i]<<8)&0xff00)); - Write((LPCWSTR)str, str.GetLength()*2); - } + for (unsigned int i = 0, l = str.GetLength(); i < l; i++) { + str.SetAt(i, ((str[i] >> 8) & 0x00ff) | ((str[i] << 8) & 0xff00)); + } + Write((LPCWSTR)str, str.GetLength() * 2); + } } BOOL CTextFile::ReadString(CStringA& str) @@ -251,12 +263,20 @@ BOOL CTextFile::ReadString(CStringA& str) str.Empty(); - if(m_encoding == ASCII) - { - CString s; - fEOF = !__super::ReadString(s); - str = TToA(s); - } + if(m_encoding == ASCII) + { + CString s; + fEOF = !__super::ReadString(s); + str = TToA(s); + // For consistency with other encodings, we continue reading + // the file even when a NUL char is encountered. + char c; + while (fEOF && (Read(&c, sizeof(c)) == sizeof(c))) { + str += c; + fEOF = !__super::ReadString(s); + str += TToA(s); + } + } else if(m_encoding == ANSI) { char c; @@ -270,30 +290,88 @@ BOOL CTextFile::ReadString(CStringA& str) } else if(m_encoding == UTF8) { - BYTE b; - while(Read(&b, sizeof(b)) == sizeof(b)) - { - fEOF = false; - char c = '?'; - if(!(b&0x80)) // 0xxxxxxx - { - c = b&0x7f; - } - else if((b&0xe0) == 0xc0) // 110xxxxx 10xxxxxx - { - if(Read(&b, sizeof(b)) != sizeof(b)) break; - } - else if((b&0xf0) == 0xe0) // 1110xxxx 10xxxxxx 10xxxxxx - { - if(Read(&b, sizeof(b)) != sizeof(b)) break; - if(Read(&b, sizeof(b)) != sizeof(b)) break; - } - if(c == '\r') continue; - if(c == '\n') break; - str += c; - } - } - else if(m_encoding == LE16) + int nBytesRead = 0; + BYTE buffer[3]; + bool bValid = true; + + while (Read(&buffer[0], sizeof(buffer[0])) == sizeof(buffer[0])) + { + nBytesRead++; + fEOF = false; + char c = '?'; + + if (Utf8::isSingleByte(buffer[0])) + { // 0xxxxxxx + c = buffer[0] & 0x7f; + } + else if (Utf8::isFirstOfMultibyte(buffer[0])) + { + int nContinuationBytes = Utf8::continuationBytes(buffer[0]); + bValid = (nContinuationBytes <= 2); + + // We don't support characters wider than 16 bits + if (bValid) { + UINT nRead = Read(&buffer[1], nContinuationBytes * sizeof(buffer[1])); + nBytesRead += nContinuationBytes; + bValid = (nRead == nContinuationBytes * sizeof(buffer[1])); + + if (bValid) { + for (int i = 0; i < nContinuationBytes; i++) { + if (!Utf8::isContinuation(buffer[i + 1])) { + bValid = false; + } + } + + switch (nContinuationBytes) { + case 0: // 0xxxxxxx + c = buffer[0] & 0x7f; + break; + case 1: // 110xxxxx 10xxxxxx + case 2: // 1110xxxx 10xxxxxx 10xxxxxx + // Unsupported for non unicode strings + break; + } + } + } + } + else + { + bValid = false; + } + + if (bValid) + { + if (c == '\r') { + continue; + } + if (c == '\n') { + break; + } + str += c; + } + else + { + // Switch to text and read again + m_encoding = ASCII; + // Rewind to the end of the line and save the position + Seek(-nBytesRead, current); + ULONGLONG currentPosition = GetPosition(); + + fEOF = !ReopenAsText(); + + if (!fEOF) + { + // Seek back at the beginning of the line where we stopped + Seek(currentPosition, begin); + + fEOF = !ReadString(str); + } + + break; + } + } + } + else if (m_encoding == LE16) { WORD w; while(Read(&w, sizeof(w)) == sizeof(w)) @@ -334,7 +412,15 @@ BOOL CTextFile::ReadString(CStringW& str) CString s; fEOF = !__super::ReadString(s); str = TToW(s); - } + // For consistency with other encodings, we continue reading + // the file even when a NUL char is encountered. + char c; + while (fEOF && (Read(&c, sizeof(c)) == sizeof(c))) { + str += c; + fEOF = !__super::ReadString(s); + str += TToW(s); + } + } else if(m_encoding == ANSI) { CStringA stra; @@ -349,35 +435,80 @@ BOOL CTextFile::ReadString(CStringW& str) str = CStringW(CString(stra)); // TODO: codepage } else if(m_encoding == UTF8) - { - BYTE b; - while(Read(&b, sizeof(b)) == sizeof(b)) - { - fEOF = false; - WCHAR c = '?'; - if(!(b&0x80)) // 0xxxxxxx - { - c = b&0x7f; - } - else if((b&0xe0) == 0xc0) // 110xxxxx 10xxxxxx - { - c = (b&0x1f)<<6; - if(Read(&b, sizeof(b)) != sizeof(b)) break; - c |= (b&0x3f); - } - else if((b&0xf0) == 0xe0) // 1110xxxx 10xxxxxx 10xxxxxx - { - c = (b&0x0f)<<12; - if(Read(&b, sizeof(b)) != sizeof(b)) break; - c |= (b&0x3f)<<6; - if(Read(&b, sizeof(b)) != sizeof(b)) break; - c |= (b&0x3f); - } - if(c == '\r') continue; - if(c == '\n') break; - str += c; - } - } + { + int nBytesRead = 0; + BYTE buffer[3]; + bool bValid = true; + + while (Read(&buffer[0], sizeof(buffer[0])) == sizeof(buffer[0])) { + nBytesRead++; + fEOF = false; + WCHAR c = L'?'; + + if (Utf8::isSingleByte(buffer[0])) { // 0xxxxxxx + c = buffer[0] & 0x7f; + } else if (Utf8::isFirstOfMultibyte(buffer[0])) { + int nContinuationBytes = Utf8::continuationBytes(buffer[0]); + bValid = (nContinuationBytes <= 2); + + // We don't support characters wider than 16 bits + if (bValid) { + UINT nRead = Read(&buffer[1], nContinuationBytes * sizeof(buffer[1])); + nBytesRead += nContinuationBytes; + bValid = (nRead == nContinuationBytes * sizeof(buffer[1])); + + if (bValid) { + for (int i = 0; i < nContinuationBytes; i++) { + if (!Utf8::isContinuation(buffer[i + 1])) { + bValid = false; + } + } + + switch (nContinuationBytes) { + case 0: // 0xxxxxxx + c = buffer[0] & 0x7f; + break; + case 1: // 110xxxxx 10xxxxxx + c = (buffer[0] & 0x1f) << 6 | (buffer[1] & 0x3f); + break; + case 2: // 1110xxxx 10xxxxxx 10xxxxxx + c = (buffer[0] & 0x0f) << 12 | (buffer[1] & 0x3f) << 6 | (buffer[2] & 0x3f); + break; + } + } + } + } else { + bValid = false; + } + + if (bValid) { + if (c == '\r') { + continue; + } + if (c == '\n') { + break; + } + str += c; + } else { + // Switch to text and read again + m_encoding = ASCII; + // Rewind to the end of the line and save the position + Seek(-nBytesRead, current); + ULONGLONG currentPosition = GetPosition(); + + fEOF = !ReopenAsText(); + + if (!fEOF) { + // Seek back to the beginning of the line where we stopped + Seek(currentPosition, begin); + + fEOF = !ReadString(str); + } + + break; + } + } + } else if(m_encoding == LE16) { WCHAR wc; @@ -413,8 +544,9 @@ UINT CTextFile::Read( void* lpBuf, UINT nCount ) // CWebTextFile // -CWebTextFile::CWebTextFile(LONGLONG llMaxSize) - : m_llMaxSize(llMaxSize) +CWebTextFile::CWebTextFile(CTextFile::enc e, LONGLONG llMaxSize) + : CTextFile(e) + , m_llMaxSize(llMaxSize) { } @@ -515,9 +647,7 @@ CString WToT(const CStringW& str) #else CString ret; for(int i = 0, j = str.GetLength(); i < j; i++) - ret += (TCHAR)(WORD)str[i]; - return(ret); - ret = str; + ret += (TCHAR)(WORD)str[i]; return(ret); #endif } @@ -540,7 +670,7 @@ CStringW TToW(const CString& str) #ifdef UNICODE ret = str; #else - for(int i = 0, j = str.GetLength(); i < j; i++) + for(size_t i = 0, j = str.GetLength(); i < j; i++) ret += (WCHAR)(BYTE)str[i]; #endif return(ret); diff --git a/src/subtitles/TextFile.h b/src/subtitles/TextFile.h index ed00bb8..5812341 100644 --- a/src/subtitles/TextFile.h +++ b/src/subtitles/TextFile.h @@ -56,6 +56,9 @@ public: void WriteString(LPCWSTR lpsz/*CStringW str*/); BOOL ReadString(CStringA& str); BOOL ReadString(CStringW& str); + +protected: + virtual bool ReopenAsText(); }; class CWebTextFile : public CTextFile @@ -64,7 +67,7 @@ class CWebTextFile : public CTextFile CString m_tempfn; public: - CWebTextFile(LONGLONG llMaxSize = 1024*1024); + CWebTextFile(CTextFile::enc e = ASCII, LONGLONG llMaxSize = 1024 * 1024); bool Open(LPCTSTR lpszFileName); bool Save(LPCTSTR lpszFileName, enc e /*= ASCII*/); @@ -73,7 +76,7 @@ public: CStringW AToW(const CStringA& str); CStringA WToA(const CStringW& str); -CString AToT(const CStringA& str); -CString WToT(const CStringW& str); -CStringA TToA(const CString& str); -CStringW TToW(const CString& str); +CString AToT(const CStringA& str); +CString WToT(const CStringW& str); +CStringA TToA(const CString& str); +CStringW TToW(const CString& str); diff --git a/src/subtitles/subtitles_vs2010.vcxproj b/src/subtitles/subtitles_vs2010.vcxproj index 843b414..9101e97 100644 --- a/src/subtitles/subtitles_vs2010.vcxproj +++ b/src/subtitles/subtitles_vs2010.vcxproj @@ -4,7 +4,7 @@ Debug Win32 - + Release log Win32 @@ -12,7 +12,7 @@ Release Win32 - + Debug x64 @@ -38,14 +38,14 @@ Static Unicode true - + StaticLibrary Static Unicode true - - + + StaticLibrary Static Unicode @@ -59,8 +59,8 @@ - <_ProjectFileVersion>10.0.30319.1 - $(WindowsSdkDir)include;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(FrameworkSDKDir)\include;$(VCInstallDir)PlatformSDK\include;$(SolutionDir);$(SolutionDir)\..\..\..\log4cplus\include\;$(SolutionDir)\..\..\BaseClasses;$(SolutionDir)\..\..\..\thirdparty\boost_1_47_0\ + <_ProjectFileVersion>10.0.30319.1 + $(WindowsSdkDir)include;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(FrameworkSDKDir)\include;$(VCInstallDir)PlatformSDK\include;$(SolutionDir);$(SolutionDir)\..\..\..\log4cplus\include\;$(SolutionDir)\..\..\BaseClasses;$(SolutionDir)\..\..\..\thirdparty\boost_1_47_0\;$(SolutionDir)..\..\..\..\include\ $(ProjectName)RL @@ -86,8 +86,8 @@ $(OutDir)$(TargetName)$(TargetExt) MachineX64 - - + + true WIN32;NDEBUG;%(PreprocessorDefinitions) @@ -111,8 +111,8 @@ - - + + true WIN32;NDEBUG;__DO_LOG;%(PreprocessorDefinitions) @@ -195,16 +195,16 @@ - AssemblyAndSourceCode + AssemblyAndSourceCode - + AssemblyAndSourceCode AssemblyAndSourceCode - - AssemblyAndSourceCode + + AssemblyAndSourceCode Create @@ -216,13 +216,13 @@ - + - - + + NotUsing diff --git a/src/subtitles/subtitles_vs2010.vcxproj.filters b/src/subtitles/subtitles_vs2010.vcxproj.filters index ea0c3b2..66e9938 100644 --- a/src/subtitles/subtitles_vs2010.vcxproj.filters +++ b/src/subtitles/subtitles_vs2010.vcxproj.filters @@ -42,9 +42,6 @@ Source Files - - Source Files - Source Files @@ -99,6 +96,9 @@ Source Files + + Source Files + -- 2.11.4.GIT