1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "HtmlFmtFlt.hxx"
22 #include <rtl/string.h>
23 #include <osl/diagnose.h>
31 using namespace com::sun::star::uno
;
// Converts the OpenOffice text/html clipboard format to the "HTML Format"
// well known under MS Windows.
// The MS HTML Format has a header before the real html data:
//
// Version:1.0      Version number of the clipboard format. The starting
//                  version is 0.9
// StartHTML:       Byte count from the beginning of the clipboard to the start
//                  of the context, or -1 if no context
// EndHTML:         Byte count from the beginning of the clipboard to the end
//                  of the context, or -1 if no context
// StartFragment:   Byte count from the beginning of the clipboard to the
//                  start of the fragment
// EndFragment:     Byte count from the beginning of the clipboard to the
//                  end of the fragment
// StartSelection:  Byte count from the beginning of the clipboard to the
//                  start of the selection
// EndSelection:    Byte count from the beginning of the clipboard to the
//                  end of the selection
//
// StartSelection and EndSelection are optional.
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between "!--" and the
// text).
/** Builds the textual header that precedes the html data in the MS Windows
    "HTML Format" clipboard format.

    The header consists of a version line followed by one line per offset.
    Each offset is written as a decimal number, zero-padded to ten digits,
    and every line is terminated by CR LF. As long as the offsets fit into
    ten digits the header length therefore does not depend on their values.

    @param startHtml      byte offset of the start of the html context
    @param endHtml        byte offset of the end of the html context
    @param startFragment  byte offset of the start of the fragment
    @param endFragment    byte offset of the end of the fragment

    @return the complete header text
*/
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment,
                                size_t endFragment)
{
    std::ostringstream htmlHeader;

    // Fill character and number base are sticky stream state, so they are
    // set once; only the field width has to be repeated for every number.
    htmlHeader << std::setfill('0') << std::dec;

    // Writes one "Key:0000000000\r\n" line
    const auto appendOffset = [&htmlHeader](const char* key, size_t offset) {
        htmlHeader << key << std::setw(10) << offset << '\r' << '\n';
    };

    htmlHeader << "Version:1.0" << '\r' << '\n';
    appendOffset("StartHTML:", startHtml);
    appendOffset("EndHTML:", endHtml);
    appendOffset("StartFragment:", startFragment);
    appendOffset("EndFragment:", endFragment);

    return htmlHeader.str();
}
// Start and end html tag as they are searched for in the text/html data.
// They are matched exactly as spelled here: lower case, without spaces and
// without parameters.
const std::string TAG_HTML("<html>");
const std::string TAG_END_HTML("</html>");

// The body tag may have parameters, so only the beginning of the tag is
// matched here and the closing '>' has to be searched for manually,
// e.g. <BODY param> #92840#
const std::string TAG_BODY("<body");
const std::string TAG_END_BODY("</body");
85 Sequence
<sal_Int8
> SAL_CALL
TextHtmlToHTMLFormat(Sequence
<sal_Int8
> const& aTextHtml
)
87 OSL_ASSERT(aTextHtml
.getLength() > 0);
89 if (aTextHtml
.getLength() <= 0)
90 return Sequence
<sal_Int8
>();
92 // fill the buffer with dummy values to calc the exact length
93 std::string dummyHtmlHeader
= GetHtmlFormatHeader(0, 0, 0, 0);
94 size_t lHtmlFormatHeader
= dummyHtmlHeader
.length();
96 std::string
textHtml(reinterpret_cast<const char*>(aTextHtml
.getConstArray()),
97 reinterpret_cast<const char*>(aTextHtml
.getConstArray())
98 + aTextHtml
.getLength());
100 std::string::size_type nStartHtml
= textHtml
.find(TAG_HTML
) + lHtmlFormatHeader
101 - 1; // we start one before '<HTML>' Word 2000 does also so
102 std::string::size_type nEndHtml
= textHtml
.find(TAG_END_HTML
) + lHtmlFormatHeader
103 + TAG_END_HTML
.length()
104 + 1; // our SOffice 5.2 wants 2 behind </HTML>?
106 // The body tag may have parameters so we need to search for the
107 // closing '>' manually e.g. <BODY param> #92840#
108 std::string::size_type nStartFragment
109 = textHtml
.find(">", textHtml
.find(TAG_BODY
)) + lHtmlFormatHeader
+ 1;
110 std::string::size_type nEndFragment
= textHtml
.find(TAG_END_BODY
) + lHtmlFormatHeader
;
112 std::string htmlFormat
113 = GetHtmlFormatHeader(nStartHtml
, nEndHtml
, nStartFragment
, nEndFragment
);
114 htmlFormat
+= textHtml
;
116 Sequence
<sal_Int8
> byteSequence(htmlFormat
.length() + 1); // space the trailing '\0'
117 memset(byteSequence
.getArray(), 0, byteSequence
.getLength());
119 memcpy(static_cast<void*>(byteSequence
.getArray()),
120 static_cast<const void*>(htmlFormat
.c_str()), htmlFormat
.length());
125 const char* const HtmlStartTag
= "<html";
127 Sequence
<sal_Int8
> HTMLFormatToTextHtml(const Sequence
<sal_Int8
>& aHTMLFormat
)
129 assert(isHTMLFormat(aHTMLFormat
) && "No HTML Format provided");
131 Sequence
<sal_Int8
>& nonconstHTMLFormatRef
= const_cast<Sequence
<sal_Int8
>&>(aHTMLFormat
);
132 char* dataStart
= reinterpret_cast<char*>(nonconstHTMLFormatRef
.getArray());
133 char* dataEnd
= dataStart
+ nonconstHTMLFormatRef
.getLength() - 1;
134 const char* htmlStartTag
= strcasestr(dataStart
, HtmlStartTag
);
136 assert(htmlStartTag
&& "Seems to be no HTML at all");
138 // It doesn't seem to be HTML? Well then simply return what has been
139 // provided in non-debug builds
140 if (htmlStartTag
== nullptr)
145 sal_Int32 len
= dataEnd
- htmlStartTag
;
146 Sequence
<sal_Int8
> plainHtmlData(len
);
148 memcpy(static_cast<void*>(plainHtmlData
.getArray()), htmlStartTag
, len
);
150 return plainHtmlData
;
153 /* A simple format detection. We are just comparing the first few bytes
154 of the provided byte sequence to see whether or not it is the MS
155 Office Html format. If it shows that this is not reliable enough we
158 const char HtmlFormatStart
[] = "Version:";
159 int const HtmlFormatStartLen
= (sizeof(HtmlFormatStart
) - 1);
161 bool isHTMLFormat(const Sequence
<sal_Int8
>& aHtmlSequence
)
163 if (aHtmlSequence
.getLength() < HtmlFormatStartLen
)
166 return rtl_str_compareIgnoreAsciiCase_WithLength(
167 HtmlFormatStart
, HtmlFormatStartLen
,
168 reinterpret_cast<const char*>(aHtmlSequence
.getConstArray()), HtmlFormatStartLen
)
172 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */