vcl/osx/HtmlFmtFlt.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include "HtmlFmtFlt.hxx"
  21
  22 #include <rtl/string.h>
  23 #include <osl/diagnose.h>
  24
  25 #include <string>
  26 #include <sstream>
  27 #include <vector>
  28 #include <iomanip>
  29 #include <cassert>
  30
  31 using namespace com::sun::star::uno;
  32
  33 // converts the openoffice text/html clipboard format to the HTML Format
  34 // well known under MS Windows
  35 // the MS HTML Format has a header before the real html data
  36
// Version:1.0      Version number of the clipboard. Starting version is 0.9
  38 // StartHTML:       Byte count from the beginning of the clipboard to the start
  39 //                  of the context, or -1 if no context
  40 // EndHTML:         Byte count from the beginning of the clipboard to the end
  41 //                  of the context, or -1 if no context
  42 // StartFragment:   Byte count from the beginning of the clipboard to the
  43 //                  start of the fragment
  44 // EndFragment:     Byte count from the beginning of the clipboard to the
  45 //                  end of the fragment
  46 // StartSelection:  Byte count from the beginning of the clipboard to the
  47 //                  start of the selection
  48 // EndSelection:    Byte count from the beginning of the clipboard to the
  49 //                  end of the selection
  50
  51 // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
  55
  56 namespace
  57 {
  58 std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
  59 {
  60     std::ostringstream htmlHeader;
  61     htmlHeader << "Version:1.0" << '\r' << '\n';
  62     htmlHeader << "StartHTML:" << std::setw(10) << std::setfill('0') << std::dec << startHtml << '\r' << '\n';
  63     htmlHeader << "EndHTML:" << std::setw(10) << std::setfill('0') << std::dec << endHtml << '\r' << '\n';
  64     htmlHeader << "StartFragment:" << std::setw(10) << std::setfill('0') << std::dec << startFragment << '\r' << '\n';
  65     htmlHeader << "EndFragment:" << std::setw(10) << std::setfill('0') << std::dec << endFragment << '\r' << '\n';
  66     return htmlHeader.str();
  67 }
  68
  69 }
  70
  71 // the office always writes the start and end html tag in upper cases and
  72 // without spaces both tags don't allow parameters
  73 const std::string TAG_HTML("<html>");
  74 const std::string TAG_END_HTML("</html>");
  75
  76 // The body tag may have parameters so we need to search for the
  77 // closing '>' manually e.g. <BODY param> #92840#
  78 const std::string TAG_BODY("<body");
  79 const std::string TAG_END_BODY("</body");
  80
  81 Sequence<sal_Int8> TextHtmlToHTMLFormat(Sequence<sal_Int8> const & aTextHtml)
  82 {
  83     OSL_ASSERT(aTextHtml.getLength() > 0);
  84
  85     if (aTextHtml.getLength() <= 0)
  86         return Sequence<sal_Int8>();
  87
  88     // fill the buffer with dummy values to calc the exact length
  89     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
  90     size_t lHtmlFormatHeader = dummyHtmlHeader.length();
  91
  92     std::string textHtml(
  93         reinterpret_cast<const char*>(aTextHtml.getConstArray()),
  94         reinterpret_cast<const char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
  95
  96     std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
  97     std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
  98
  99     // The body tag may have parameters so we need to search for the
 100     // closing '>' manually e.g. <BODY param> #92840#
 101     std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
 102     std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
 103
 104     std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
 105     htmlFormat += textHtml;
 106
 107     Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
 108     memset(byteSequence.getArray(), 0, byteSequence.getLength());
 109
 110     memcpy(
 111         static_cast<void*>(byteSequence.getArray()),
 112         static_cast<const void*>(htmlFormat.c_str()),
 113         htmlFormat.length());
 114
 115     return byteSequence;
 116 }
 117
 118 const char* const HtmlStartTag = "<html";
 119
 120 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
 121 {
 122   assert(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
 123
 124   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
 125   char* dataStart = reinterpret_cast<char*>(nonconstHTMLFormatRef.getArray());
 126   char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
 127   const char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
 128
 129   assert(htmlStartTag && "Seems to be no HTML at all");
 130
 131   // It doesn't seem to be HTML? Well then simply return what has been
 132   // provided in non-debug builds
 133   if (htmlStartTag == nullptr)
 134   {
 135     return aHTMLFormat;
 136   }
 137
 138   sal_Int32 len = dataEnd - htmlStartTag;
 139   Sequence<sal_Int8> plainHtmlData(len);
 140
 141   memcpy(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
 142
 143   return plainHtmlData;
 144 }
 145
 146 /* A simple format detection. We are just comparing the first few bytes
 147    of the provided byte sequence to see whether or not it is the MS
 148    Office Html format. If it shows that this is not reliable enough we
 149    can improve this
 150 */
 151 const char HtmlFormatStart[] = "Version:";
 152 int const HtmlFormatStartLen = sizeof(HtmlFormatStart) - 1;
 153
 154 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
 155 {
 156   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
 157     return false;
 158
 159   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
 160                                                    HtmlFormatStartLen,
 161                                                    reinterpret_cast<const char*>(aHtmlSequence.getConstArray()),
 162                                                    HtmlFormatStartLen) == 0;
 163 }
 164
 165 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */