Related: tdf#137748 "Update" should have use-underline
[LibreOffice.git] / vcl / osx / HtmlFmtFlt.cxx
blob3549ecd2104bdf96e97ffe29f5ae2cbedb021ae7
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "HtmlFmtFlt.hxx"
22 #include <rtl/string.h>
23 #include <osl/diagnose.h>
25 #include <string>
26 #include <sstream>
27 #include <vector>
28 #include <iomanip>
29 #include <cassert>
31 using namespace com::sun::star::uno;
33 // converts the openoffice text/html clipboard format to the HTML Format
34 // well known under MS Windows
35 // the MS HTML Format has a header before the real html data
// Version:1.0 Version number of the clipboard. Starting version is 0.9
38 // StartHTML: Byte count from the beginning of the clipboard to the start
39 // of the context, or -1 if no context
40 // EndHTML: Byte count from the beginning of the clipboard to the end
41 // of the context, or -1 if no context
42 // StartFragment: Byte count from the beginning of the clipboard to the
43 // start of the fragment
44 // EndFragment: Byte count from the beginning of the clipboard to the
45 // end of the fragment
46 // StartSelection: Byte count from the beginning of the clipboard to the
47 // start of the selection
48 // EndSelection: Byte count from the beginning of the clipboard to the
49 // end of the selection
51 // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between the !-- and
// the text).
namespace
{

// Builds the textual header that precedes the html data in the MS HTML
// clipboard format.
//
// The header consists of the "Version:1.0" line followed by the StartHTML,
// EndHTML, StartFragment and EndFragment lines, in that order. Every line is
// terminated by CR LF and every offset is written in decimal, zero-padded to
// a minimum width of ten digits.
//
// startHtml/endHtml:         values written to the StartHTML/EndHTML lines
// startFragment/endFragment: values written to the StartFragment/EndFragment lines
// returns:                   the complete header as a string
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::ostringstream header;

    // Fill character and number base stay in effect once set; only the field
    // width is reset after each insertion and has to be given per number.
    header << std::setfill('0') << std::dec;

    const auto appendOffsetLine = [&header](const char* key, size_t offset)
    {
        header << key << std::setw(10) << offset << '\r' << '\n';
    };

    header << "Version:1.0" << '\r' << '\n';
    appendOffsetLine("StartHTML:", startHtml);
    appendOffsetLine("EndHTML:", endHtml);
    appendOffsetLine("StartFragment:", startFragment);
    appendOffsetLine("EndFragment:", endFragment);

    return header.str();
}

}
// Start and end tag of the html document as they are searched for in the
// text/html data. They are looked up with std::string::find, i.e. as an exact
// match of the spelling given here, so neither tag may contain spaces or
// parameters.
const std::string TAG_HTML = "<html>";
const std::string TAG_END_HTML = "</html>";

// Only the beginning of the body tags is given: the body tag may have
// parameters, so the closing '>' has to be searched for separately,
// e.g. <BODY param> #92840#
const std::string TAG_BODY = "<body";
const std::string TAG_END_BODY = "</body";
81 Sequence<sal_Int8> TextHtmlToHTMLFormat(Sequence<sal_Int8> const & aTextHtml)
83 OSL_ASSERT(aTextHtml.getLength() > 0);
85 if (aTextHtml.getLength() <= 0)
86 return Sequence<sal_Int8>();
88 // fill the buffer with dummy values to calc the exact length
89 std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
90 size_t lHtmlFormatHeader = dummyHtmlHeader.length();
92 std::string textHtml(
93 reinterpret_cast<const char*>(aTextHtml.getConstArray()),
94 reinterpret_cast<const char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
96 std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
97 std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
99 // The body tag may have parameters so we need to search for the
100 // closing '>' manually e.g. <BODY param> #92840#
101 std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
102 std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
104 std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
105 htmlFormat += textHtml;
107 Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
108 memset(byteSequence.getArray(), 0, byteSequence.getLength());
110 memcpy(
111 static_cast<void*>(byteSequence.getArray()),
112 static_cast<const void*>(htmlFormat.c_str()),
113 htmlFormat.length());
115 return byteSequence;
118 const char* const HtmlStartTag = "<html";
120 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
122 assert(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
124 Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
125 char* dataStart = reinterpret_cast<char*>(nonconstHTMLFormatRef.getArray());
126 char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
127 const char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
129 assert(htmlStartTag && "Seems to be no HTML at all");
131 // It doesn't seem to be HTML? Well then simply return what has been
132 // provided in non-debug builds
133 if (htmlStartTag == nullptr)
135 return aHTMLFormat;
138 sal_Int32 len = dataEnd - htmlStartTag;
139 Sequence<sal_Int8> plainHtmlData(len);
141 memcpy(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
143 return plainHtmlData;
146 /* A simple format detection. We are just comparing the first few bytes
147 of the provided byte sequence to see whether or not it is the MS
148 Office Html format. If it shows that this is not reliable enough we
149 can improve this
151 const char HtmlFormatStart[] = "Version:";
152 int const HtmlFormatStartLen = sizeof(HtmlFormatStart) - 1;
154 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
156 if (aHtmlSequence.getLength() < HtmlFormatStartLen)
157 return false;
159 return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
160 HtmlFormatStartLen,
161 reinterpret_cast<const char*>(aHtmlSequence.getConstArray()),
162 HtmlFormatStartLen) == 0;
165 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */