cid#1618310 COPY_INSTEAD_OF_MOVE
[LibreOffice.git] / filter / source / config / cache / typedetection.cxx
blob4fb9ab48257175fb1502f73b96198bfe2d199a99
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "typedetection.hxx"
21 #include "constant.hxx"
23 #include <com/sun/star/document/XExtendedFilterDetection.hpp>
24 #include <com/sun/star/frame/Desktop.hpp>
25 #include <com/sun/star/util/URLTransformer.hpp>
26 #include <com/sun/star/util/XURLTransformer.hpp>
28 #include <com/sun/star/beans/XPropertySet.hpp>
29 #include <com/sun/star/embed/StorageFormats.hpp>
30 #include <com/sun/star/io/XInputStream.hpp>
31 #include <com/sun/star/io/XSeekable.hpp>
32 #include <com/sun/star/packages/zip/ZipIOException.hpp>
33 #include <com/sun/star/task/XInteractionHandler.hpp>
35 #include <sfx2/brokenpackageint.hxx>
36 #include <o3tl/string_view.hxx>
37 #include <tools/wldcrd.hxx>
38 #include <sal/log.hxx>
39 #include <framework/interaction.hxx>
40 #include <comphelper/diagnose_ex.hxx>
41 #include <tools/urlobj.hxx>
42 #include <comphelper/fileurl.hxx>
43 #include <comphelper/lok.hxx>
44 #include <comphelper/sequence.hxx>
45 #include <comphelper/scopeguard.hxx>
46 #include <utility>
48 #define DEBUG_TYPE_DETECTION 0
50 #if DEBUG_TYPE_DETECTION
51 #include <iostream>
52 using std::cout;
53 using std::endl;
54 #endif
56 using namespace com::sun::star;
58 namespace filter::config{
60 TypeDetection::TypeDetection(const css::uno::Reference< css::uno::XComponentContext >& rxContext)
61 : m_xContext(rxContext)
62 , m_xTerminateListener(new TerminateDetection(this))
63 , m_bCancel(false)
65 css::frame::Desktop::create(m_xContext)->addTerminateListener(m_xTerminateListener);
66 BaseContainer::init(u"com.sun.star.comp.filter.config.TypeDetection"_ustr ,
67 { u"com.sun.star.document.TypeDetection"_ustr },
68 FilterCache::E_TYPE );
72 TypeDetection::~TypeDetection()
74 css::frame::Desktop::create(m_xContext)->removeTerminateListener(m_xTerminateListener);
78 OUString SAL_CALL TypeDetection::queryTypeByURL(const OUString& sURL)
80 OUString sType;
82 // SAFE ->
83 std::unique_lock aLock(m_aMutex);
85 css::util::URL aURL;
86 aURL.Complete = sURL;
87 css::uno::Reference< css::util::XURLTransformer > xParser( css::util::URLTransformer::create(m_xContext) );
88 xParser->parseStrict(aURL);
90 // set std types as minimum requirement first!
91 // Only in case no type was found for given URL,
92 // use optional types too ...
93 auto & cache = GetTheFilterCache();
94 FlatDetection lFlatTypes;
95 cache.detectFlatForURL(aURL, lFlatTypes);
97 if (
98 (lFlatTypes.empty() ) &&
99 (!cache.isFillState(FilterCache::E_CONTAINS_TYPES))
102 cache.load(FilterCache::E_CONTAINS_TYPES);
103 cache.detectFlatForURL(aURL, lFlatTypes);
106 // first item is guaranteed as "preferred" one!
107 if (!lFlatTypes.empty())
109 const FlatDetectionInfo& aMatch = *(lFlatTypes.begin());
110 sType = aMatch.sType;
113 return sType;
114 // <- SAFE
117 namespace {
120 * Rank format types in order of complexity. More complex formats are
121 * ranked higher so that they get tested sooner over simpler formats.
123 * Guidelines to determine how complex a format is (subject to change):
125 * 1) compressed text (XML, HTML, etc)
126 * 2) binary
127 * 3) non-compressed text
128 * 3.1) structured text
129 * 3.1.1) dialect of a structured text (e.g. docbook XML)
130 * 3.1.2) generic structured text (e.g. generic XML)
131 * 3.2) non-structured text
133 * In each category, rank them from strictly-structured to
134 * loosely-structured.
136 int getFlatTypeRank(std::u16string_view rType)
138 // List formats from more complex to less complex.
139 // TODO: Add more.
140 static const char* ranks[] = {
142 // Compressed XML (ODF XML zip formats)
143 "writer8_template",
144 "writer8",
145 "calc8_template",
146 "calc8",
147 "impress8_template",
148 "impress8",
149 "draw8_template",
150 "draw8",
151 "chart8",
152 "math8",
153 "writerglobal8_template",
154 "writerglobal8",
155 "writerweb8_writer_template",
156 "StarBase",
158 // Compressed XML (OOXML)
159 "writer_OOXML_Text_Template",
160 "writer_OOXML",
161 "writer_MS_Word_2007_Template",
162 "writer_MS_Word_2007",
163 "Office Open XML Spreadsheet Template",
164 "Office Open XML Spreadsheet",
165 "MS Excel 2007 XML Template",
166 "MS Excel 2007 XML",
167 "MS PowerPoint 2007 XML Template",
168 "MS PowerPoint 2007 XML AutoPlay",
169 "MS PowerPoint 2007 XML",
171 // Compressed XML (Uniform/Unified Office Format)
172 "Unified_Office_Format_text",
173 "Unified_Office_Format_spreadsheet",
174 "Unified_Office_Format_presentation",
176 // Compressed XML (StarOffice XML zip formats)
177 "calc_StarOffice_XML_Calc",
178 "calc_StarOffice_XML_Calc_Template",
179 "chart_StarOffice_XML_Chart",
180 "draw_StarOffice_XML_Draw",
181 "draw_StarOffice_XML_Draw_Template",
182 "impress_StarOffice_XML_Impress",
183 "impress_StarOffice_XML_Impress_Template",
184 "math_StarOffice_XML_Math",
185 "writer_StarOffice_XML_Writer",
186 "writer_StarOffice_XML_Writer_Template",
187 "writer_globaldocument_StarOffice_XML_Writer_GlobalDocument",
188 "writer_web_StarOffice_XML_Writer_Web_Template",
190 // Compressed text
191 "pdf_Portable_Document_Format",
193 // Binary
194 "writer_T602_Document",
195 "writer_WordPerfect_Document",
196 "writer_MS_Works_Document",
197 "writer_MS_Word_97_Vorlage",
198 "writer_MS_Word_97",
199 "writer_MS_Word_95_Vorlage",
200 "writer_MS_Word_95",
201 "writer_MS_WinWord_60",
202 "writer_MS_WinWord_5",
203 "MS Excel 2007 Binary",
204 "calc_MS_Excel_97_VorlageTemplate",
205 "calc_MS_Excel_97",
206 "calc_MS_Excel_95_VorlageTemplate",
207 "calc_MS_Excel_95",
208 "calc_MS_Excel_5095_VorlageTemplate",
209 "calc_MS_Excel_5095",
210 "calc_MS_Excel_40_VorlageTemplate",
211 "calc_MS_Excel_40",
212 "calc_Pocket_Excel_File",
213 "impress_MS_PowerPoint_97_Vorlage",
214 "impress_MS_PowerPoint_97_AutoPlay",
215 "impress_MS_PowerPoint_97",
216 "calc_Lotus",
217 "calc_QPro",
218 "calc_SYLK",
219 "calc_DIF",
220 "calc_dBase",
221 "Apache Parquet",
223 // Binary (raster and vector image files)
224 "emf_MS_Windows_Metafile",
225 "wmf_MS_Windows_Metafile",
226 "met_OS2_Metafile",
227 "svm_StarView_Metafile",
228 "sgv_StarDraw_20",
229 "tif_Tag_Image_File",
230 "tga_Truevision_TARGA",
231 "sgf_StarOffice_Writer_SGF",
232 "ras_Sun_Rasterfile",
233 "psd_Adobe_Photoshop",
234 "png_Portable_Network_Graphic",
235 "jpg_JPEG",
236 "mov_MOV",
237 "gif_Graphics_Interchange",
238 "bmp_MS_Windows",
239 "pcx_Zsoft_Paintbrush",
240 "pct_Mac_Pict",
241 "pcd_Photo_CD_Base",
242 "pcd_Photo_CD_Base4",
243 "pcd_Photo_CD_Base16",
244 "webp_WebP",
245 "impress_CGM_Computer_Graphics_Metafile", // There is binary and ascii variants ?
246 "draw_WordPerfect_Graphics",
247 "draw_Visio_Document",
248 "draw_Publisher_Document",
249 "draw_Corel_Presentation_Exchange",
250 "draw_CorelDraw_Document",
251 "writer_LotusWordPro_Document",
252 "writer_MIZI_Hwp_97", // Hanword (Hancom Office)
254 // Non-compressed XML
255 "writer_ODT_FlatXML",
256 "calc_ODS_FlatXML",
257 "impress_ODP_FlatXML",
258 "draw_ODG_FlatXML",
259 "calc_ADO_rowset_XML",
260 "calc_MS_Excel_2003_XML",
261 "writer_MS_Word_2003_XML",
262 "writer_DocBook_File",
263 "XHTML_File",
264 "svg_Scalable_Vector_Graphics",
265 "math_MathML_XML_Math",
267 // Non-compressed text
268 "dxf_AutoCAD_Interchange",
269 "eps_Encapsulated_PostScript",
270 "pbm_Portable_Bitmap", // There is 'raw' and 'ascii' variants.
271 "ppm_Portable_Pixelmap", // There is 'raw' and 'ascii' variants.
272 "pgm_Portable_Graymap", // There is 'raw' and 'ascii' variants.
273 "xpm_XPM",
274 "xbm_X_Consortium",
275 "writer_Rich_Text_Format",
276 "writer_web_HTML_help",
277 "generic_HTML",
279 "generic_Text", // Plain text (catch all)
281 // Anything ranked lower than generic_Text will never be used during
282 // type detection (since generic_Text catches all).
284 // Export only
285 "writer_layout_dump_xml",
286 "writer_indexing_export",
287 "graphic_HTML",
289 // Internal use only
290 "StarBaseReportChart",
291 "StarBaseReport",
292 "math_MathType_3x", // MathType equation embedded in Word doc.
295 size_t n = std::size(ranks);
297 for (size_t i = 0; i < n; ++i)
299 if (o3tl::equalsAscii(rType, ranks[i]))
300 return n - i - 1;
303 // Not ranked. Treat them equally. Unranked formats have higher priority
304 // than the ranked internal ones since they may be defined externally.
305 return n;
309 * Types with matching pattern first, then extension, then custom ranks by
310 * types, then types that are supported by the document service come next.
311 * Lastly, sort them alphabetically.
313 struct SortByPriority
315 bool operator() (const FlatDetectionInfo& r1, const FlatDetectionInfo& r2) const
317 if (r1.bMatchByPattern != r2.bMatchByPattern)
318 return r1.bMatchByPattern;
320 if (r1.bMatchByExtension != r2.bMatchByExtension)
321 return r1.bMatchByExtension;
323 int rank1 = getFlatTypeRank(r1.sType);
324 int rank2 = getFlatTypeRank(r2.sType);
326 if (rank1 != rank2)
327 return rank1 > rank2;
329 if (r1.bPreselectedByDocumentService != r2.bPreselectedByDocumentService)
330 return r1.bPreselectedByDocumentService;
332 // All things being equal, sort them alphabetically.
333 return r1.sType > r2.sType;
337 struct SortByType
339 bool operator() (const FlatDetectionInfo& r1, const FlatDetectionInfo& r2) const
341 return r1.sType > r2.sType;
345 struct EqualByType
347 bool operator() (const FlatDetectionInfo& r1, const FlatDetectionInfo& r2) const
349 return r1.sType == r2.sType;
353 class FindByType
355 OUString maType;
356 public:
357 explicit FindByType(OUString aType) : maType(std::move(aType)) {}
358 bool operator() (const FlatDetectionInfo& rInfo) const
360 return rInfo.sType == maType;
#if DEBUG_TYPE_DETECTION
/// Debug helper: dump a captioned list of flat detection entries to stdout.
void printFlatDetectionList(const char* caption, const FlatDetection& types)
{
    cout << "-- " << caption << " (size=" << types.size() << ")" << endl;
    for (auto const& rEntry : types)
    {
        cout << "  type='" << rEntry.sType << "'; match by extension ("
             << rEntry.bMatchByExtension << "); match by pattern ("
             << rEntry.bMatchByPattern << "); pre-selected by doc service ("
             << rEntry.bPreselectedByDocumentService << ")" << endl;
    }
    cout << "--" << endl;
}
#endif
380 OUString SAL_CALL TypeDetection::queryTypeByDescriptor(css::uno::Sequence< css::beans::PropertyValue >& lDescriptor,
381 sal_Bool bAllowDeep )
383 // make the descriptor more usable :-)
384 utl::MediaDescriptor stlDescriptor(lDescriptor);
385 OUString sType, sURL;
389 // SAFE -> ----------------------------------
390 std::unique_lock aLock(m_aMutex);
392 // parse given URL to split it into e.g. main and jump marks ...
393 sURL = stlDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_URL, OUString());
395 #if OSL_DEBUG_LEVEL > 0
396 if (stlDescriptor.find( u"FileName"_ustr ) != stlDescriptor.end())
397 OSL_FAIL("Detect using of deprecated and already unsupported MediaDescriptor property \"FileName\"!");
398 #endif
400 css::util::URL aURL;
401 aURL.Complete = sURL;
402 css::uno::Reference< css::util::XURLTransformer > xParser(css::util::URLTransformer::create(m_xContext));
403 xParser->parseStrict(aURL);
405 OUString aSelectedFilter = stlDescriptor.getUnpackedValueOrDefault(
406 utl::MediaDescriptor::PROP_FILTERNAME, OUString());
407 if (!aSelectedFilter.isEmpty())
409 // Caller specified the filter type. Honor it. Just get the default
410 // type for that filter, and bail out.
411 if (impl_validateAndSetFilterOnDescriptor(stlDescriptor, aSelectedFilter))
412 return stlDescriptor[utl::MediaDescriptor::PROP_TYPENAME].get<OUString>();
415 FlatDetection lFlatTypes;
416 impl_getAllFormatTypes(aLock, aURL, stlDescriptor, lFlatTypes);
418 aLock.unlock();
419 // <- SAFE ----------------------------------
421 // Properly prioritize all candidate types.
422 std::stable_sort(lFlatTypes.begin(), lFlatTypes.end(), SortByPriority());
423 auto last = std::unique(lFlatTypes.begin(), lFlatTypes.end(), EqualByType());
424 lFlatTypes.erase(last, lFlatTypes.end());
426 OUString sLastChance;
428 // verify every flat detected (or preselected!) type
429 // by calling its registered deep detection service.
430 // But break this loop if a type match to the given descriptor
431 // by a URL pattern(!) or if deep detection isn't allowed from
432 // outside (bAllowDeep=sal_False) or break the whole detection by
433 // throwing an exception if creation of the might needed input
434 // stream failed by e.g. an IO exception ...
435 if (!lFlatTypes.empty())
436 sType = impl_detectTypeFlatAndDeep(stlDescriptor, lFlatTypes, bAllowDeep, sLastChance);
438 // flat detection failed
439 // pure deep detection failed
440 // => ask might existing InteractionHandler
441 // means: ask user for its decision
442 if (sType.isEmpty() && !m_bCancel)
443 sType = impl_askUserForTypeAndFilterIfAllowed(stlDescriptor);
446 // no real detected type - but a might valid one.
447 // update descriptor and set last chance for return.
448 if (sType.isEmpty() && !sLastChance.isEmpty() && !m_bCancel)
450 OSL_FAIL("set first flat detected type without a registered deep detection service as \"last chance\" ... nevertheless some other deep detections said \"NO\". I TRY IT!");
451 sType = sLastChance;
454 catch(const css::uno::RuntimeException&)
456 throw;
458 catch(const css::uno::Exception&)
460 TOOLS_WARN_EXCEPTION("filter.config", "caught exception while querying type of " << sURL);
461 sType.clear();
464 // adapt media descriptor, so it contains the right values
465 // for type/filter name/document service/ etcpp.
466 impl_checkResultsAndAddBestFilter(stlDescriptor, sType); // Attention: sType is used as IN/OUT param here and will might be changed inside this method !!!
467 impl_validateAndSetTypeOnDescriptor(stlDescriptor, sType);
469 stlDescriptor >> lDescriptor;
470 return sType;
474 void TypeDetection::impl_checkResultsAndAddBestFilter(utl::MediaDescriptor& rDescriptor,
475 OUString& sType )
477 // a)
478 // Don't overwrite a might preselected filter!
479 OUString sFilter = rDescriptor.getUnpackedValueOrDefault(
480 utl::MediaDescriptor::PROP_FILTERNAME,
481 OUString());
482 if (!sFilter.isEmpty())
483 return;
485 auto & cache = GetTheFilterCache();
487 // b)
488 // check a preselected document service too.
489 // Then we have to search a suitable filter within this module.
490 OUString sDocumentService = rDescriptor.getUnpackedValueOrDefault(
491 utl::MediaDescriptor::PROP_DOCUMENTSERVICE,
492 OUString());
493 if (!sDocumentService.isEmpty())
497 OUString sRealType = sType;
499 // SAFE ->
500 std::unique_lock aLock(m_aMutex);
502 // Attention: For executing next lines of code, We must be sure that
503 // all filters already loaded :-(
504 // That can disturb our "load on demand feature". But we have no other chance!
505 cache.load(FilterCache::E_CONTAINS_FILTERS);
507 css::beans::NamedValue lIProps[] {
508 { PROPNAME_DOCUMENTSERVICE, uno::Any(sDocumentService) },
509 { PROPNAME_TYPE, uno::Any(sRealType) } };
510 std::vector<OUString> lFilters = cache.getMatchingItemsByProps(FilterCache::E_FILTER, lIProps);
512 aLock.unlock();
513 // <- SAFE
515 for (auto const& filter : lFilters)
517 // SAFE ->
518 aLock.lock();
521 CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, filter);
522 sal_Int32 nFlags = 0;
523 aFilter[PROPNAME_FLAGS] >>= nFlags;
525 if (static_cast<SfxFilterFlags>(nFlags) & SfxFilterFlags::IMPORT)
526 sFilter = filter;
527 if (static_cast<SfxFilterFlags>(nFlags) & SfxFilterFlags::PREFERED)
528 break;
530 catch(const css::uno::Exception&) {}
531 aLock.unlock();
532 // <- SAFE
535 if (!sFilter.isEmpty())
537 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME ] <<= sRealType;
538 rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME] <<= sFilter;
539 sType = sRealType;
540 return;
543 catch(const css::uno::Exception&)
547 // c)
548 // We can use the preferred filter for the specified type.
549 // Such preferred filter points:
550 // - to the default filter of the preferred application
551 // - or to any other filter if no preferred filter was set.
552 // Note: It's an optimization only!
553 // It's not guaranteed, that such preferred filter exists.
554 sFilter.clear();
557 CacheItem aType = cache.getItem(FilterCache::E_TYPE, sType);
558 aType[PROPNAME_PREFERREDFILTER] >>= sFilter;
559 cache.getItem(FilterCache::E_FILTER, sFilter);
561 // no exception => found valid type and filter => set it on the given descriptor
562 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME ] <<= sType ;
563 rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME] <<= sFilter;
564 return;
566 catch(const css::uno::Exception&)
569 // d)
570 // Search for any import(!) filter, which is registered for this type.
571 sFilter.clear();
574 // Attention: For executing next lines of code, We must be sure that
575 // all filters already loaded :-(
576 // That can disturb our "load on demand feature". But we have no other chance!
577 cache.load(FilterCache::E_CONTAINS_FILTERS);
579 css::beans::NamedValue lIProps[] {
580 { PROPNAME_TYPE, uno::Any(sType) } };
581 std::vector<OUString> lFilters = cache.getMatchingItemsByProps(FilterCache::E_FILTER, lIProps);
583 for (auto const& filter : lFilters)
585 sFilter = filter;
589 CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, sFilter);
590 sal_Int32 nFlags = 0;
591 aFilter[PROPNAME_FLAGS] >>= nFlags;
593 if (static_cast<SfxFilterFlags>(nFlags) & SfxFilterFlags::IMPORT)
594 break;
596 catch(const css::uno::Exception&)
597 { continue; }
599 sFilter.clear();
602 if (!sFilter.isEmpty())
604 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME ] <<= sType ;
605 rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME] <<= sFilter;
606 return;
609 catch(const css::uno::Exception&)
614 bool TypeDetection::impl_getPreselectionForType(
615 std::unique_lock<std::mutex>& /*rGuard*/,
616 const OUString& sPreSelType, const util::URL& aParsedURL, FlatDetection& rFlatTypes, bool bDocService)
618 // Can be used to suppress execution of some parts of this method
619 // if it's already clear that detected type is valid or not.
620 // It's necessary to use shared code at the end, which update
621 // all return parameters consistency!
622 bool bBreakDetection = false;
624 // Further we must know if it matches by pattern
625 // Every flat detected type by pattern won't be detected deep!
626 bool bMatchByPattern = false;
628 // And we must know if a preselection must be preferred, because
629 // it matches by its extension too.
630 bool bMatchByExtension = false;
632 // validate type
633 OUString sType(sPreSelType);
634 CacheItem aType;
637 aType = GetTheFilterCache().getItem(FilterCache::E_TYPE, sType);
639 catch(const css::container::NoSuchElementException&)
641 sType.clear();
642 bBreakDetection = true;
645 if (!bBreakDetection)
647 // We can't check a preselected type for a given stream!
648 // So we must believe, that it can work ...
649 if ( aParsedURL.Complete == "private:stream" )
650 bBreakDetection = true;
653 if (!bBreakDetection)
655 // extract extension from URL .. to check it case-insensitive !
656 INetURLObject aParser (aParsedURL.Main);
657 OUString sExtension = aParser.getExtension(INetURLObject::LAST_SEGMENT ,
658 true ,
659 INetURLObject::DecodeMechanism::WithCharset);
660 sExtension = sExtension.toAsciiLowerCase();
662 // otherwise we must know, if it matches to the given URL really.
663 // especially if it matches by its extension or pattern registration.
664 const css::uno::Sequence<OUString> lExtensions = aType[PROPNAME_EXTENSIONS].get<css::uno::Sequence<OUString> >();
665 const css::uno::Sequence<OUString> lURLPattern = aType[PROPNAME_URLPATTERN].get<css::uno::Sequence<OUString> >();
667 for (auto const& extension : lExtensions)
669 OUString sCheckExtension(extension.toAsciiLowerCase());
670 if (sCheckExtension == sExtension)
672 bBreakDetection = true;
673 bMatchByExtension = true;
674 break;
678 if (!bBreakDetection)
680 for (auto const& elem : lURLPattern)
682 WildCard aCheck(elem);
683 if (aCheck.Matches(aParsedURL.Main))
685 bMatchByPattern = true;
686 break;
692 // if it's a valid type - set it on all return values!
693 if (!sType.isEmpty())
695 FlatDetection::iterator it = std::find_if(rFlatTypes.begin(), rFlatTypes.end(), FindByType(sType));
696 if (it != rFlatTypes.end())
698 if (bMatchByExtension)
699 it->bMatchByExtension = true;
700 if (bMatchByPattern)
701 it->bMatchByPattern = true;
702 if (bDocService)
703 it->bPreselectedByDocumentService = true;
706 return true;
709 // not valid!
710 return false;
713 void TypeDetection::impl_getPreselectionForDocumentService(
714 std::unique_lock<std::mutex>& rGuard,
715 const OUString& sPreSelDocumentService, const util::URL& aParsedURL, FlatDetection& rFlatTypes)
717 // get all filters, which match to this doc service
718 std::vector<OUString> lFilters;
721 // Attention: For executing next lines of code, We must be sure that
722 // all filters already loaded :-(
723 // That can disturb our "load on demand feature". But we have no other chance!
724 auto & cache = GetTheFilterCache();
725 cache.load(FilterCache::E_CONTAINS_FILTERS);
727 css::beans::NamedValue lIProps[] {
728 { PROPNAME_DOCUMENTSERVICE, css::uno::Any(sPreSelDocumentService) } };
729 lFilters = cache.getMatchingItemsByProps(FilterCache::E_FILTER, lIProps);
731 catch (const css::container::NoSuchElementException&)
733 lFilters.clear();
736 // step over all filters, and check if its registered type
737 // match the given URL.
738 // But use temp. list of "preselected types" instead of incoming rFlatTypes list!
739 // The reason behind: we must filter the obtained results. And copying stl entries
740 // is an easier job than removing them .-)
741 for (auto const& filter : lFilters)
743 OUString aType = impl_getTypeFromFilter(rGuard, filter);
744 if (aType.isEmpty())
745 continue;
747 impl_getPreselectionForType(rGuard, aType, aParsedURL, rFlatTypes, true);
751 OUString TypeDetection::impl_getTypeFromFilter(std::unique_lock<std::mutex>& /*rGuard*/, const OUString& rFilterName)
753 CacheItem aFilter;
756 aFilter = GetTheFilterCache().getItem(FilterCache::E_FILTER, rFilterName);
758 catch (const container::NoSuchElementException&)
760 return OUString();
763 OUString aType;
764 aFilter[PROPNAME_TYPE] >>= aType;
765 return aType;
768 void TypeDetection::impl_getAllFormatTypes(
769 std::unique_lock<std::mutex>& rGuard,
770 const util::URL& aParsedURL, utl::MediaDescriptor const & rDescriptor, FlatDetection& rFlatTypes)
772 rFlatTypes.clear();
774 // Get all filters that we have.
775 std::vector<OUString> aFilterNames;
778 auto & cache = GetTheFilterCache();
779 cache.load(FilterCache::E_CONTAINS_FILTERS);
780 aFilterNames = cache.getItemNames(FilterCache::E_FILTER);
782 catch (const container::NoSuchElementException&)
784 return;
787 // Retrieve the default type for each of these filters, and store them.
788 for (auto const& filterName : aFilterNames)
790 OUString aType = impl_getTypeFromFilter(rGuard, filterName);
792 if (aType.isEmpty())
793 continue;
795 FlatDetectionInfo aInfo; // all flags set to false by default.
796 aInfo.sType = aType;
797 rFlatTypes.push_back(aInfo);
801 // Get all types that match the URL alone.
802 FlatDetection aFlatByURL;
803 GetTheFilterCache().detectFlatForURL(aParsedURL, aFlatByURL);
804 for (auto const& elem : aFlatByURL)
806 FlatDetection::iterator itPos = std::find_if(rFlatTypes.begin(), rFlatTypes.end(), FindByType(elem.sType));
807 if (itPos == rFlatTypes.end())
808 // Not in the list yet.
809 rFlatTypes.push_back(elem);
810 else
812 // Already in the list. Update the flags.
813 FlatDetectionInfo& rInfo = *itPos;
814 const FlatDetectionInfo& rThisInfo = elem;
815 if (rThisInfo.bMatchByExtension)
816 rInfo.bMatchByExtension = true;
817 if (rThisInfo.bMatchByPattern)
818 rInfo.bMatchByPattern = true;
819 if (rThisInfo.bPreselectedByDocumentService)
820 rInfo.bPreselectedByDocumentService = true;
825 // Remove duplicates.
826 std::stable_sort(rFlatTypes.begin(), rFlatTypes.end(), SortByType());
827 auto last = std::unique(rFlatTypes.begin(), rFlatTypes.end(), EqualByType());
828 rFlatTypes.erase(last, rFlatTypes.end());
830 // Mark pre-selected type (if any) to have it prioritized.
831 OUString sSelectedType = rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_TYPENAME, OUString());
832 if (!sSelectedType.isEmpty())
833 impl_getPreselectionForType(rGuard, sSelectedType, aParsedURL, rFlatTypes, false);
835 // Mark all types preferred by the current document service, to have it prioritized.
836 OUString sSelectedDoc = rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_DOCUMENTSERVICE, OUString());
837 if (!sSelectedDoc.isEmpty())
838 impl_getPreselectionForDocumentService(rGuard, sSelectedDoc, aParsedURL, rFlatTypes);
842 static bool isBrokenZIP(const css::uno::Reference<css::io::XInputStream>& xStream,
843 const css::uno::Reference<css::uno::XComponentContext>& xContext)
847 // Only consider seekable streams starting with "PK", to avoid false detections
848 css::uno::Reference<css::io::XSeekable> xSeek(xStream, css::uno::UNO_QUERY_THROW);
849 comphelper::ScopeGuard restorePos(
850 [xSeek, nPos = xSeek->getPosition()]
854 xSeek->seek(nPos);
856 catch (const css::uno::Exception&)
860 css::uno::Sequence<sal_Int8> magic(2);
861 xStream->readBytes(magic, 2);
862 if (magic.getLength() < 2 || magic[0] != 'P' || magic[1] != 'K')
863 return false;
865 catch (const css::uno::Exception&)
867 return false;
870 std::vector<css::uno::Any> aArguments{
871 css::uno::Any(xStream),
872 css::uno::Any(css::beans::NamedValue(u"AllowRemoveOnInsert"_ustr, css::uno::Any(false))),
873 css::uno::Any(css::beans::NamedValue(u"StorageFormat"_ustr,
874 css::uno::Any(css::embed::StorageFormats::ZIP))),
878 // If this is a broken ZIP package, or not a ZIP, this would throw ZipIOException
879 xContext->getServiceManager()->createInstanceWithArgumentsAndContext(
880 u"com.sun.star.packages.comp.ZipPackage"_ustr, comphelper::containerToSequence(aArguments),
881 xContext);
883 catch (const css::packages::zip::ZipIOException&)
885 // Now test if repair will succeed
886 aArguments.emplace_back(css::beans::NamedValue(u"RepairPackage"_ustr, css::uno::Any(true)));
889 // If this is a broken ZIP package that can be repaired, this would succeed,
890 // and the result will be not empty
891 if (css::uno::Reference<css::beans::XPropertySet> xPackage{
892 xContext->getServiceManager()->createInstanceWithArgumentsAndContext(
893 u"com.sun.star.packages.comp.ZipPackage"_ustr,
894 comphelper::containerToSequence(aArguments), xContext),
895 css::uno::UNO_QUERY })
896 if (bool bHasElements; xPackage->getPropertyValue(u"HasElements"_ustr) >>= bHasElements)
897 return bHasElements;
899 catch (const css::uno::Exception&)
903 catch (const css::uno::Exception&)
906 // The package is either not broken, or is not a repairable ZIP
907 return false;
911 OUString TypeDetection::impl_detectTypeFlatAndDeep( utl::MediaDescriptor& rDescriptor ,
912 const FlatDetection& lFlatTypes ,
913 bool bAllowDeep ,
914 OUString& rLastChance )
916 // reset it everytimes, so the outside code can distinguish between
917 // a set and a not set value.
918 rLastChance.clear();
920 // tdf#96401: First of all, check if this is a broken ZIP package. Not doing this here would
921 // make some filters silently not recognize their content in broken packages, and some filters
922 // show a warning and mistakenly claim own content based on user choice.
923 if (bAllowDeep && !rDescriptor.getUnpackedValueOrDefault(u"RepairPackage"_ustr, false)
924 && rDescriptor.getUnpackedValueOrDefault(u"RepairAllowed"_ustr, true)
925 && rDescriptor.contains(utl::MediaDescriptor::PROP_INTERACTIONHANDLER))
929 // tdf#161573: do not interact with the user about possible unrelated failures (e.g.,
930 // missing file). If needed, that will happen later, in the main detection phase.
931 auto aInteraction(rDescriptor[utl::MediaDescriptor::PROP_INTERACTIONHANDLER]);
932 rDescriptor.erase(utl::MediaDescriptor::PROP_INTERACTIONHANDLER);
933 comphelper::ScopeGuard interactionHelperGuard([&rDescriptor, &aInteraction]
934 { rDescriptor[utl::MediaDescriptor::PROP_INTERACTIONHANDLER] = aInteraction; });
935 impl_openStream(rDescriptor);
936 if (auto xStream = rDescriptor.getUnpackedValueOrDefault(
937 utl::MediaDescriptor::PROP_INPUTSTREAM,
938 css::uno::Reference<css::io::XInputStream>()))
940 css::uno::Reference<css::uno::XComponentContext> xContext;
942 // SAFE ->
944 std::unique_lock aLock(m_aMutex);
945 xContext = m_xContext;
947 // <- SAFE
949 if (isBrokenZIP(xStream, xContext))
951 if (css::uno::Reference<css::task::XInteractionHandler> xInteraction{
952 aInteraction,
953 css::uno::UNO_QUERY })
955 INetURLObject aURL(rDescriptor.getUnpackedValueOrDefault(
956 utl::MediaDescriptor::PROP_URL, OUString()));
957 OUString aDocumentTitle
958 = aURL.getName(INetURLObject::LAST_SEGMENT, true,
959 INetURLObject::DecodeMechanism::WithCharset);
961 // Ask the user whether they wants to try to repair
962 RequestPackageReparation aRequest(aDocumentTitle);
963 xInteraction->handle(aRequest.GetRequest());
965 if (aRequest.isApproved())
967 // lok: we want to overwrite file in jail, so don't use template flag
968 const bool bIsLOK = comphelper::LibreOfficeKit::isActive();
969 rDescriptor[utl::MediaDescriptor::PROP_DOCUMENTTITLE] <<= aDocumentTitle;
970 rDescriptor[utl::MediaDescriptor::PROP_ASTEMPLATE] <<= !bIsLOK;
971 rDescriptor[u"RepairPackage"_ustr] <<= true;
973 else
974 rDescriptor[u"RepairAllowed"_ustr] <<= false; // Do not ask again
979 catch (const css::uno::Exception&)
981 // No problem
985 // step over all possible types for this URL.
986 // solutions:
987 // a) no types => no detection
988 // b) deep detection not allowed => return first valid type of list (because it's the preferred or the first valid one)
989 // or(!) match by URLPattern => in such case a deep detection will be suppressed!
990 // c) type has no detect service => safe the first occurred type without a detect service
991 // as "last chance"(!). It will be used outside of this method
992 // if no further type could be detected.
993 // It must be the first one, because it can be a preferred type.
994 // Our types list was sorted by such criteria!
995 // d) detect service return a valid result => return its decision
996 // e) detect service return an invalid result
997 // or any needed information could not be
998 // obtained from the cache => ignore it, and continue with search
1000 for (auto const& flatTypeInfo : lFlatTypes)
1002 if (m_bCancel)
1003 break;
1004 OUString sFlatType = flatTypeInfo.sType;
1006 if (!impl_validateAndSetTypeOnDescriptor(rDescriptor, sFlatType))
1007 continue;
1009 // b)
1010 if (
1011 (!bAllowDeep ) ||
1012 (flatTypeInfo.bMatchByPattern)
1015 return sFlatType;
1020 // SAFE -> ----------------------------------
1021 std::unique_lock aLock(m_aMutex);
1022 CacheItem aType = GetTheFilterCache().getItem(FilterCache::E_TYPE, sFlatType);
1023 aLock.unlock();
1025 OUString sDetectService;
1026 aType[PROPNAME_DETECTSERVICE] >>= sDetectService;
1028 // c)
1029 if (sDetectService.isEmpty())
1031 // flat detected types without any registered deep detection service and not
1032 // preselected by the user can be used as LAST CHANCE in case no other type could
1033 // be detected. Of course only the first type without deep detector can be used.
1034 // Further ones has to be ignored.
1035 if (rLastChance.isEmpty())
1036 rLastChance = sFlatType;
1038 continue;
1041 OUString sDeepType = impl_askDetectService(sDetectService, rDescriptor);
1043 // d)
1044 if (!sDeepType.isEmpty())
1045 return sDeepType;
1047 catch(const css::container::NoSuchElementException&)
1049 // e)
1052 return OUString();
1053 // <- SAFE ----------------------------------
1056 void TypeDetection::impl_seekStreamToZero(utl::MediaDescriptor const & rDescriptor)
1058 // try to seek to 0 ...
1059 // But because XSeekable is an optional interface ... try it only .-)
1060 css::uno::Reference< css::io::XInputStream > xStream = rDescriptor.getUnpackedValueOrDefault(
1061 utl::MediaDescriptor::PROP_INPUTSTREAM,
1062 css::uno::Reference< css::io::XInputStream >());
1063 css::uno::Reference< css::io::XSeekable > xSeek(xStream, css::uno::UNO_QUERY);
1064 if (!xSeek.is())
1065 return;
1069 xSeek->seek(0);
1071 catch(const css::uno::RuntimeException&)
1073 throw;
1075 catch(const css::uno::Exception&)
1080 OUString TypeDetection::impl_askDetectService(const OUString& sDetectService,
1081 utl::MediaDescriptor& rDescriptor )
1083 // Open the stream and add it to the media descriptor if this method is called for the first time.
1084 // All following requests to this method will detect, that there already exists a stream .-)
1085 // Attention: This method throws an exception if the stream could not be opened.
1086 // It's important to break any further detection in such case.
1087 // Catch it on the highest detection level only !!!
1088 impl_openStream(rDescriptor);
1090 // seek to 0 is an optional feature to be more robust against
1091 // "simple implemented detect services" .-)
1092 impl_seekStreamToZero(rDescriptor);
1094 css::uno::Reference< css::document::XExtendedFilterDetection > xDetector;
1095 css::uno::Reference< css::uno::XComponentContext > xContext;
1097 // SAFE ->
1099 std::unique_lock aLock(m_aMutex);
1100 xContext = m_xContext;
1102 // <- SAFE
1106 // Attention! If e.g. an office module was not installed sometimes we
1107 // find a registered detect service, which is referred inside the
1108 // configuration ... but not really installed. On the other side we use
1109 // third party components here, which can make trouble anyway. So we
1110 // should handle errors during creation of such services more
1111 // gracefully .-)
1112 xDetector.set(
1113 xContext->getServiceManager()->createInstanceWithContext(sDetectService, xContext),
1114 css::uno::UNO_QUERY_THROW);
1116 catch (...)
1120 if ( ! xDetector.is())
1121 return OUString();
1123 OUString sDeepType;
1126 // start deep detection
1127 // Don't forget to convert stl descriptor to its uno representation.
1129 /* Attention!
1130 You have to use an explicit instance of this uno sequence...
1131 Because it's used as an in out parameter. And in case of a temp. used object
1132 we will run into memory corruptions!
1134 css::uno::Sequence< css::beans::PropertyValue > lDescriptor;
1135 rDescriptor >> lDescriptor;
1136 sDeepType = xDetector->detect(lDescriptor);
1137 rDescriptor << lDescriptor;
1139 catch (...)
1141 // We should ignore errors here.
1142 // Thrown exceptions mostly will end in crash recovery...
1143 // But might be we find another deep detection service which can detect the same
1144 // document without a problem .-)
1145 sDeepType.clear();
1148 // seek to 0 is an optional feature to be more robust against
1149 // "simple implemented detect services" .-)
1150 impl_seekStreamToZero(rDescriptor);
1152 // analyze the results
1153 // a) detect service returns "" => return "" too and remove TYPE/FILTER prop from descriptor
1154 // b) returned type is unknown => return "" too and remove TYPE/FILTER prop from descriptor
1155 // c) returned type is valid => check TYPE/FILTER props inside descriptor and return the type
1157 // this special helper checks for a valid type
1158 // and set right values on the descriptor!
1159 bool bValidType = impl_validateAndSetTypeOnDescriptor(rDescriptor, sDeepType);
1160 if (bValidType)
1161 return sDeepType;
1163 return OUString();
1167 OUString TypeDetection::impl_askUserForTypeAndFilterIfAllowed(utl::MediaDescriptor& rDescriptor)
1169 css::uno::Reference< css::task::XInteractionHandler > xInteraction =
1170 rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_INTERACTIONHANDLER,
1171 css::uno::Reference< css::task::XInteractionHandler >());
1173 if (!xInteraction.is())
1174 return OUString();
1176 OUString sURL =
1177 rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_URL,
1178 OUString());
1180 css::uno::Reference< css::io::XInputStream > xStream =
1181 rDescriptor.getUnpackedValueOrDefault(utl::MediaDescriptor::PROP_INPUTSTREAM,
1182 css::uno::Reference< css::io::XInputStream >());
1184 // Don't disturb the user for "non existing files - means empty URLs" or
1185 // if we were forced to detect a stream.
1186 // Reason behind: we must be sure to ask user for "unknown contents" only...
1187 // and not for "missing files". Especially if detection is done by a stream only
1188 // we can't check if the stream points to an "existing content"!
1189 if (
1190 (sURL.isEmpty() ) || // "non existing file" ?
1191 (!xStream.is() ) || // non existing file !
1192 (sURL.equalsIgnoreAsciiCase("private:stream")) // not a good idea .-)
1194 return OUString();
1198 // create a new request to ask user for its decision about the usable filter
1199 ::framework::RequestFilterSelect aRequest(sURL);
1200 xInteraction->handle(aRequest.GetRequest());
1202 // "Cancel" pressed? => return with error
1203 if (aRequest.isAbort())
1204 return OUString();
1206 // "OK" pressed => verify the selected filter, get its corresponding
1207 // type and return it. (BTW: We must update the media descriptor here ...)
1208 // The user selected explicitly a filter ... but normally we are interested on
1209 // a type here only. But we must be sure, that the selected filter is used
1210 // too and no ambiguous filter registration disturb us .-)
1212 OUString sFilter = aRequest.getFilter();
1213 if (!impl_validateAndSetFilterOnDescriptor(rDescriptor, sFilter))
1214 return OUString();
1215 OUString sType;
1216 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME] >>= sType;
1217 return sType;
1219 catch(const css::uno::Exception&)
1222 return OUString();
1226 void TypeDetection::impl_openStream(utl::MediaDescriptor& rDescriptor)
1228 bool bSuccess = false;
1229 OUString sURL = rDescriptor.getUnpackedValueOrDefault( utl::MediaDescriptor::PROP_URL, OUString() );
1230 bool bRequestedReadOnly = rDescriptor.getUnpackedValueOrDefault( utl::MediaDescriptor::PROP_READONLY, false );
1231 if ( comphelper::isFileUrl( sURL ) )
1233 // OOo uses own file locking mechanics in case of local file
1234 bSuccess = rDescriptor.addInputStreamOwnLock();
1236 else
1237 bSuccess = rDescriptor.addInputStream();
1239 if ( !bSuccess )
1240 throw css::uno::Exception(
1241 "Could not open stream for <" + sURL + ">",
1242 getXWeak());
1244 if ( !bRequestedReadOnly )
1246 // The MediaDescriptor implementation adds ReadOnly argument if the file can not be opened for writing
1247 // this argument should be either removed or an additional argument should be added so that application
1248 // can separate the case when the user explicitly requests readonly document.
1249 // The current solution is to remove it here.
1250 rDescriptor.erase( utl::MediaDescriptor::PROP_READONLY );
1255 void TypeDetection::impl_removeTypeFilterFromDescriptor(utl::MediaDescriptor& rDescriptor)
1257 utl::MediaDescriptor::iterator pItType = rDescriptor.find(utl::MediaDescriptor::PROP_TYPENAME );
1258 utl::MediaDescriptor::iterator pItFilter = rDescriptor.find(utl::MediaDescriptor::PROP_FILTERNAME);
1259 if (pItType != rDescriptor.end())
1260 rDescriptor.erase(pItType);
1261 if (pItFilter != rDescriptor.end())
1262 rDescriptor.erase(pItFilter);
1266 bool TypeDetection::impl_validateAndSetTypeOnDescriptor( utl::MediaDescriptor& rDescriptor,
1267 const OUString& sType )
1269 if (GetTheFilterCache().hasItem(FilterCache::E_TYPE, sType))
1271 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME] <<= sType;
1272 return true;
1275 // remove all related information from the descriptor
1276 impl_removeTypeFilterFromDescriptor(rDescriptor);
1277 return false;
1281 bool TypeDetection::impl_validateAndSetFilterOnDescriptor( utl::MediaDescriptor& rDescriptor,
1282 const OUString& sFilter )
1286 auto & cache = GetTheFilterCache();
1287 CacheItem aFilter = cache.getItem(FilterCache::E_FILTER, sFilter);
1288 OUString sType;
1289 aFilter[PROPNAME_TYPE] >>= sType;
1291 // found valid type and filter => set it on the given descriptor
1292 rDescriptor[utl::MediaDescriptor::PROP_TYPENAME ] <<= sType ;
1293 rDescriptor[utl::MediaDescriptor::PROP_FILTERNAME] <<= sFilter;
1294 return true;
1296 catch(const css::container::NoSuchElementException&){}
1298 // remove all related information from the descriptor
1299 impl_removeTypeFilterFromDescriptor(rDescriptor);
1300 return false;
1303 } // namespace filter
1305 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
1306 filter_TypeDetection_get_implementation(
1307 css::uno::XComponentContext* context, css::uno::Sequence<css::uno::Any> const&)
1309 return cppu::acquire(new filter::config::TypeDetection(context));
1312 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */