From 1e0ee8141207a425b56592c136ac5e94fc821173 Mon Sep 17 00:00:00 2001
From: Miklos Vajna <vmiklos@collabora.com>
Date: Wed, 12 May 2021 10:51:09 +0200
Subject: [PATCH] vcl PDF tokenizer: fix EOF position when \r is not followed
 by \n
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Otherwise this would break partial tokenize when we only read a trailer
in the middle of the file: m_aEOFs.back() is one byte larger than
rStream.Tell(), so we read past the end of the trailer, resulting in a
tokenize failure.

What's special about the bugdoc:

- it has 2 xrefs, the first is incomplete, and refers to a second which
is later in the file
- the object length is an indirect object, triggering an xref lookup
- the first EOF is followed by a \r, which is not followed by a \n

This results in reading past the end of the first trailer and then
triggering a lookup failure.

FWIW, pdfium does the same in
<https://pdfium.googlesource.com/pdfium/+/59d107323f6727bbd5f8a4d0843081790638a1dd/core/fpdfapi/parser/cpdf_syntax_parser.cpp#446>,
we're now in sync with it.

(cherry picked from commit 6b1d5bafdc722d07d3dc4980764275a6caa707ba)

Conflicts:
	vcl/qa/cppunit/filter/ipdf/ipdf.cxx

Change-Id: Ia556a25e333b5e4f1418d92a98d74358862120e2
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/115537
Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoffice@gmail.com>
Reviewed-by: Tomaž Vajngerl <quikee@gmail.com>
---
 vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf | 69 +++++++++++++++++++++++++
 vcl/qa/cppunit/filter/ipdf/ipdf.cxx             | 19 +++++++
 vcl/source/filter/ipdf/pdfdocument.cxx          |  7 ++-
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf

diff --git a/vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf b/vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf
new file mode 100644
index 000000000000..6f1ad86f5c99
--- /dev/null
+++ b/vcl/qa/cppunit/filter/ipdf/data/comment-end.pdf
@@ -0,0 +1,69 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [0 0 200 300]
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+>>
+endobj
+4 0 obj <<
+  /Length 4
+>>
+stream
+q
+Q
+endstream
+endobj
+xref
+0 5
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000157 00000 n 
+0000000226 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 5
+  /Prev 541
+>>
+startxref
+280
+%%EOF%%TEST
+4 0 obj <<
+  /Length 5 0 R
+>>
+stream
+q
+Q
+endstream
+endobj
+5 0 obj
+4
+endobj
+xref
+0 6
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000157 00000 n 
+0000000466 00000 n 
+0000000524 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 6
+>>
+startxref
+280
+%%EOF
diff --git a/vcl/qa/cppunit/filter/ipdf/ipdf.cxx b/vcl/qa/cppunit/filter/ipdf/ipdf.cxx
index 5055e36a922e..3307db5c9743 100644
--- a/vcl/qa/cppunit/filter/ipdf/ipdf.cxx
+++ b/vcl/qa/cppunit/filter/ipdf/ipdf.cxx
@@ -168,6 +168,25 @@ CPPUNIT_TEST_FIXTURE(VclFilterIpdfTest, testDictArrayDict)
     CPPUNIT_ASSERT(pKey);
 }
 
+CPPUNIT_TEST_FIXTURE(VclFilterIpdfTest, testCommentEnd)
+{
+    // Load the test document, which is special in the following ways:
+    // - it has two xrefs
+    // - the second xref has an updated page content object with an indirect length
+    // - the last startxref refers to the first xref
+    // - the trailer of the first xref has a /Prev pointing to the second xref
+    // - the first %%EOF comment is terminated by a \r, which is not followed by a \n
+    // This means that if reading doesn't stop at the end of the first trailer, then we'll try to
+    // look up the offset of the length object, which we don't yet have.
+    OUString aSourceURL = m_directories.getURLFromSrc(DATA_DIRECTORY) + "comment-end.pdf";
+    SvFileStream aFile(aSourceURL, StreamMode::READ);
+    vcl::filter::PDFDocument aDocument;
+
+    // Without the accompanying fix in place, this test would have failed, because Tokenize() read
+    // past the end of the first trailer instead of stopping there.
+    CPPUNIT_ASSERT(aDocument.Read(aFile));
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx
index 64cf9dc4ef90..8715000f1627 100644
--- a/vcl/source/filter/ipdf/pdfdocument.cxx
+++ b/vcl/source/filter/ipdf/pdfdocument.cxx
@@ -2145,9 +2145,14 @@ bool PDFCommentElement::Read(SvStream& rStream)
                 sal_uInt64 nPos = rStream.Tell();
                 if (ch == '\r')
                 {
+                    rStream.ReadChar(ch);
+                    rStream.SeekRel(-1);
                     // If the comment ends with a \r\n, count the \n as well to match Adobe Acrobat
                     // behavior.
-                    nPos += 1;
+                    if (ch == '\n')
+                    {
+                        nPos += 1;
+                    }
                 }
                 m_rDoc.PushBackEOF(nPos);
             }
-- 
2.11.4.GIT