1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf
{
// Documents no larger than this many bytes are downloaded in one chunk; only
// larger documents are considered for partial (byte-range) loading.
constexpr uint32_t kMinFileSize = 64 * 1024;
22 // If the headers have a byte-range response, writes the start and end
23 // positions and returns true if at least the start position was parsed.
24 // The end position will be set to 0 if it was not found or parsed from the
26 // Returns false if not even a start position could be parsed.
27 bool GetByteRange(const std::string
& headers
, uint32_t* start
, uint32_t* end
) {
28 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
29 while (it
.GetNext()) {
30 if (base::LowerCaseEqualsASCII(it
.name(), "content-range")) {
31 std::string range
= it
.values().c_str();
32 if (base::StartsWith(range
, "bytes",
33 base::CompareCase::INSENSITIVE_ASCII
)) {
34 range
= range
.substr(strlen("bytes"));
35 std::string::size_type pos
= range
.find('-');
36 std::string range_end
;
37 if (pos
!= std::string::npos
)
38 range_end
= range
.substr(pos
+ 1);
39 TrimWhitespaceASCII(range
, base::TRIM_LEADING
, &range
);
40 TrimWhitespaceASCII(range_end
, base::TRIM_LEADING
, &range_end
);
41 *start
= atoi(range
.c_str());
42 *end
= atoi(range_end
.c_str());
50 // If the headers have a multi-part response, returns the boundary name.
51 // Otherwise returns an empty string.
52 std::string
GetMultiPartBoundary(const std::string
& headers
) {
53 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
54 while (it
.GetNext()) {
55 if (base::LowerCaseEqualsASCII(it
.name(), "content-type")) {
56 std::string type
= base::ToLowerASCII(it
.values());
57 if (base::StartsWith(type
, "multipart/", base::CompareCase::SENSITIVE
)) {
58 const char* boundary
= strstr(type
.c_str(), "boundary=");
64 return std::string(boundary
+ 9);
71 bool IsValidContentType(const std::string
& type
) {
72 return (base::EndsWith(type
, "/pdf", base::CompareCase::INSENSITIVE_ASCII
) ||
73 base::EndsWith(type
, ".pdf", base::CompareCase::INSENSITIVE_ASCII
) ||
74 base::EndsWith(type
, "/x-pdf",
75 base::CompareCase::INSENSITIVE_ASCII
) ||
76 base::EndsWith(type
, "/*", base::CompareCase::INSENSITIVE_ASCII
) ||
77 base::EndsWith(type
, "/acrobat",
78 base::CompareCase::INSENSITIVE_ASCII
) ||
79 base::EndsWith(type
, "/unknown",
80 base::CompareCase::INSENSITIVE_ASCII
));
85 DocumentLoader::Client::~Client() {
88 DocumentLoader::DocumentLoader(Client
* client
)
89 : client_(client
), partial_document_(false), request_pending_(false),
90 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
91 document_size_(0), header_request_(true), is_multipart_(false) {
92 loader_factory_
.Initialize(this);
95 DocumentLoader::~DocumentLoader() {
// Prepares the loader for the document at |url|: gathers the response
// headers, extracts content length / range support / encoding / content type /
// disposition from them, sizes the chunk store, and decides whether the
// document is loaded partially.
//
// |headers|, when non-empty, is used as the response header block; the visible
// code also reads the header block from loader_.GetResponseInfo().
//
// NOTE(review): several statements of this function are absent from this
// excerpt (for example |type| is used without a visible declaration, the
// statements guarded by the two validity checks are not shown, and the final
// condition ends in a dangling "&&"). The comments below describe only what is
// visible; confirm against the complete file.
98 bool DocumentLoader::Init(const pp::URLLoader
& loader
,
99 const std::string
& url
,
100 const std::string
& headers
) {
// Init() is expected to run on a loader that has no URL yet.
101 DCHECK(url_
.empty());
105 std::string response_headers
;
// Caller-supplied headers are taken as-is.
106 if (!headers
.empty()) {
107 response_headers
= headers
;
// Headers reported by the URL loader; used only when they are a string var.
109 pp::URLResponseInfo response
= loader_
.GetResponseInfo();
110 pp::Var headers_var
= response
.GetHeaders();
112 if (headers_var
.is_string()) {
113 response_headers
= headers_var
.AsString();
117 bool accept_ranges_bytes
= false;
118 bool content_encoded
= false;
119 uint32_t content_length
= 0;
121 std::string disposition
;
123 // This happens for PDFs not loaded from http(s) sources.
// A lone "Content-Type: text/plain" header on a non-http(s) URL is treated as
// application/pdf.
124 if (response_headers
== "Content-Type: text/plain") {
125 if (!base::StartsWith(url
, "http://",
126 base::CompareCase::INSENSITIVE_ASCII
) &&
127 !base::StartsWith(url
, "https://",
128 base::CompareCase::INSENSITIVE_ASCII
)) {
129 type
= "application/pdf";
// Otherwise walk the "\n"-separated header block and pick out the fields of
// interest. Header names are compared case-insensitively.
132 if (type
.empty() && !response_headers
.empty()) {
133 net::HttpUtil::HeadersIterator
it(response_headers
.begin(),
134 response_headers
.end(), "\n");
135 while (it
.GetNext()) {
136 if (base::LowerCaseEqualsASCII(it
.name(), "content-length")) {
137 content_length
= atoi(it
.values().c_str());
138 } else if (base::LowerCaseEqualsASCII(it
.name(), "accept-ranges")) {
139 accept_ranges_bytes
= base::LowerCaseEqualsASCII(it
.values(), "bytes");
140 } else if (base::LowerCaseEqualsASCII(it
.name(), "content-encoding")) {
// Any Content-Encoding header at all marks the response as encoded.
141 content_encoded
= true;
142 } else if (base::LowerCaseEqualsASCII(it
.name(), "content-type")) {
// Keep only the part of the content type before the first ';', then trim
// surrounding whitespace.
144 size_t semi_colon_pos
= type
.find(';');
145 if (semi_colon_pos
!= std::string::npos
) {
146 type
= type
.substr(0, semi_colon_pos
);
148 TrimWhitespace(type
, base::TRIM_ALL
, &type
);
149 } else if (base::LowerCaseEqualsASCII(it
.name(), "content-disposition")) {
150 disposition
= it
.values();
// NOTE(review): the statements guarded by the next two checks (a known but
// unaccepted content type, and an "attachment" disposition) are not visible.
154 if (!type
.empty() && !IsValidContentType(type
))
156 if (base::StartsWith(disposition
, "attachment",
157 base::CompareCase::INSENSITIVE_ASCII
))
// With a known length the chunk store can be sized up front.
160 if (content_length
> 0)
161 chunk_stream_
.Preallocate(content_length
);
// A document_size_ of 0 means the size is unknown.
163 document_size_
= content_length
;
166 // Enable partial loading only if file size is above the threshold.
167 // It will allow avoiding latency for multiple requests.
168 if (content_length
> kMinFileSize
&&
169 accept_ranges_bytes
&&
171 LoadPartialDocument();
178 void DocumentLoader::LoadPartialDocument() {
179 partial_document_
= true;
180 // Force the main request to be cancelled, since if we're a full-frame plugin
181 // there could be other references to the loader.
183 loader_
= pp::URLLoader();
184 // Download file header.
185 header_request_
= true;
186 RequestData(0, std::min(GetRequestSize(), document_size_
));
// Switches the loader to whole-document mode and empties chunk_buffer_.
// NOTE(review): this excerpt ends right after the clear() call (no closing
// brace is visible); whatever actually starts the download is not shown here.
// Confirm against the complete file.
189 void DocumentLoader::LoadFullDocument() {
190 partial_document_
= false;
191 chunk_buffer_
.clear();
195 bool DocumentLoader::IsDocumentComplete() const {
196 if (document_size_
== 0) // Document size unknown.
198 return IsDataAvailable(0, document_size_
);
// Computes how much of the document is available: starts from document_size_
// and subtracts the |second| member of every range that
// chunk_stream_.GetMissedRanges(0, document_size_, ...) reports.
// NOTE(review): |second| is presumably the length of a missed range (the
// pairs queued in RequestData() are (position, size)); confirm against the
// chunk stream's interface. The body of the unknown-size branch and the final
// return statement are not visible in this excerpt.
201 uint32_t DocumentLoader::GetAvailableData() const {
202 if (document_size_
== 0) { // If document size is unknown.
206 std::vector
<std::pair
<size_t, size_t> > ranges
;
207 chunk_stream_
.GetMissedRanges(0, document_size_
, &ranges
);
208 uint32_t available
= document_size_
;
209 for (const auto& range
: ranges
)
210 available
-= range
.second
;
214 void DocumentLoader::ClearPendingRequests() {
215 // The first item in the queue is pending (need to keep it in the queue).
216 if (pending_requests_
.size() > 1) {
217 // Remove all elements except the first one.
218 pending_requests_
.erase(++pending_requests_
.begin(),
219 pending_requests_
.end());
// Forwards to chunk_stream_.ReadData(position, size, buf) and returns its
// result.
// NOTE(review): the rest of the parameter list (the declarations of |size| and
// |buf|) and the closing brace are not visible in this excerpt.
223 bool DocumentLoader::GetBlock(uint32_t position
,
226 return chunk_stream_
.ReadData(position
, size
, buf
);
229 bool DocumentLoader::IsDataAvailable(uint32_t position
, uint32_t size
) const {
230 return chunk_stream_
.IsRangeAvailable(position
, size
);
233 void DocumentLoader::RequestData(uint32_t position
, uint32_t size
) {
234 DCHECK(partial_document_
);
236 // We have some artefact request from
237 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
238 // document is complete.
239 // We need this fix in PDFIum. Adding this as a work around.
240 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
242 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
243 if (IsDocumentComplete())
246 pending_requests_
.push_back(std::pair
<size_t, size_t>(position
, size
));
247 DownloadPendingRequests();
// Starts the download for the request at the front of pending_requests_:
// prunes requests whose data is already available, splits a partially
// downloaded request into its missing ranges, adjusts the range relative to
// the current request size and to neighbouring data, then opens a new URL
// loader for the resulting range with DidOpen() as the completion callback.
//
// NOTE(review): many lines of this function are absent from this excerpt
// (statements guarded by several conditions, else-branches and closing
// braces). The comments describe only what is visible; confirm against the
// complete file.
250 void DocumentLoader::DownloadPendingRequests() {
// NOTE(review): the statement guarded by this check (a request is already in
// flight, or the queue is empty) is not visible.
251 if (request_pending_
|| pending_requests_
.empty())
254 // Remove already completed requests.
255 // By design DownloadPendingRequests() should have at least 1 request in the
256 // queue. ReadComplete() will remove the last pending request from the queue.
257 while (pending_requests_
.size() > 1) {
258 if (IsDataAvailable(pending_requests_
.front().first
,
259 pending_requests_
.front().second
)) {
260 pending_requests_
.pop_front();
// Each queued pair is (position, size) of the requested range.
266 uint32_t pos
= pending_requests_
.front().first
;
267 uint32_t size
= pending_requests_
.front().second
;
// NOTE(review): what happens when the front request is already satisfied is
// not visible in this excerpt.
268 if (IsDataAvailable(pos
, size
)) {
273 // If current request has been partially downloaded already, split it into
274 // a few smaller requests.
275 std::vector
<std::pair
<size_t, size_t> > ranges
;
276 chunk_stream_
.GetMissedRanges(pos
, size
, &ranges
);
277 if (!ranges
.empty()) {
// Replace the front request with its missing pieces, keeping them at the
// front of the queue.
278 pending_requests_
.pop_front();
279 pending_requests_
.insert(pending_requests_
.begin(),
280 ranges
.begin(), ranges
.end());
281 pos
= pending_requests_
.front().first
;
282 size
= pending_requests_
.front().second
;
285 uint32_t cur_request_size
= GetRequestSize();
286 // If size is less than default request, try to expand download range for
287 // more optimal download.
288 if (size
< cur_request_size
&& partial_document_
) {
289 // First, try to expand block towards the end of the file.
290 uint32_t new_pos
= pos
;
291 uint32_t new_size
= cur_request_size
;
// Do not reach past the end of the document.
292 if (pos
+ new_size
> document_size_
)
293 new_size
= document_size_
- pos
;
295 std::vector
<std::pair
<size_t, size_t> > ranges
;
296 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
297 new_pos
= ranges
[0].first
;
298 new_size
= ranges
[0].second
;
301 // Second, try to expand block towards the beginning of the file.
302 if (new_size
< cur_request_size
) {
303 uint32_t block_end
= new_pos
+ new_size
;
// NOTE(review): the branch for block_end <= cur_request_size is not visible.
304 if (block_end
> cur_request_size
) {
305 new_pos
= block_end
- cur_request_size
;
309 new_size
= block_end
- new_pos
;
311 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
312 new_pos
= ranges
.back().first
;
313 new_size
= ranges
.back().second
;
// Look at the data surrounding the range.
// NOTE(review): the exact meaning of the two positions returned below depends
// on the chunk stream's interface, which is not part of this excerpt.
320 size_t last_byte_before
= chunk_stream_
.GetLastByteBefore(pos
);
321 size_t first_byte_after
= chunk_stream_
.GetFirstByteAfter(pos
+ size
- 1);
// If the gap in front of the range is smaller than one request, start the
// request at |last_byte_before| instead.
322 if (pos
- last_byte_before
< cur_request_size
) {
323 size
= pos
+ size
- last_byte_before
;
324 pos
= last_byte_before
;
// If the gap behind the range is at most one request long, extend the request
// up to |first_byte_after|.
327 if ((pos
+ size
< first_byte_after
) &&
328 (pos
+ size
+ cur_request_size
>= first_byte_after
))
329 size
= first_byte_after
- pos
;
331 request_pending_
= true;
333 // Start downloading first pending request.
335 loader_
= client_
->CreateURLLoader();
336 pp::CompletionCallback callback
=
337 loader_factory_
.NewCallback(&DocumentLoader::DidOpen
);
338 pp::URLRequestInfo request
= GetRequest(pos
, size
);
340 int rv
= loader_
.Open(request
, callback
);
// NOTE(review): the statement guarded by this check (Open() did not report
// PP_OK_COMPLETIONPENDING) is not visible in this excerpt.
341 if (rv
!= PP_OK_COMPLETIONPENDING
)
345 pp::URLRequestInfo
DocumentLoader::GetRequest(uint32_t position
,
346 uint32_t size
) const {
347 pp::URLRequestInfo
request(client_
->GetPluginInstance());
348 request
.SetURL(url_
);
349 request
.SetMethod("GET");
350 request
.SetFollowRedirects(true);
351 request
.SetCustomReferrerURL(url_
);
353 const size_t kBufSize
= 100;
355 // According to rfc2616, byte range specifies position of the first and last
356 // bytes in the requested range inclusively. Therefore we should subtract 1
357 // from the position + size, to get index of the last byte that needs to be
359 base::snprintf(buf
, kBufSize
, "Range: bytes=%d-%d", position
,
360 position
+ size
- 1);
362 request
.SetHeaders(header
);
// Completion callback for loader_.Open(). Resets the per-response chunk
// bookkeeping, then inspects the response headers: a multi-part response is
// flagged and its boundary remembered, otherwise a Content-Range header
// positions current_pos_ and, when an end position is given, sets the expected
// chunk size.
//
// NOTE(review): several lines of this function are absent from this excerpt
// (the handling of a failed open and of 4xx responses, the declaration of
// |headers|, else-branches and closing braces). The comments describe only
// what is visible; confirm against the complete file.
367 void DocumentLoader::DidOpen(int32_t result
) {
368 if (result
!= PP_OK
) {
373 int32_t http_code
= loader_
.GetResponseInfo().GetStatusCode();
374 if (http_code
>= 400 && http_code
< 500) {
375 // Error accessing resource. 4xx error indicate subsequent requests
377 // E.g. resource has been removed from the server while loading it.
378 // https://code.google.com/p/chromium/issues/detail?id=414827
// Start with a clean slate for this response.
382 is_multipart_
= false;
383 current_chunk_size_
= 0;
384 current_chunk_read_
= 0;
386 pp::Var headers_var
= loader_
.GetResponseInfo().GetHeaders();
388 if (headers_var
.is_string())
389 headers
= headers_var
.AsString();
391 std::string boundary
= GetMultiPartBoundary(headers
);
392 if (!boundary
.empty()) {
393 // Leave position untouched for now, when we read the data we'll get it.
394 is_multipart_
= true;
395 multipart_boundary_
= boundary
;
397 // Need to make sure that the server returned a byte-range, since it's
398 // possible for a server to just ignore our byte-range request and just
399 // return the entire document even if it supports byte-range requests.
400 // i.e. sniff response to
401 // http://www.act.org/compass/sample/pdf/geometry.pdf
403 uint32_t start_pos
, end_pos
;
404 if (GetByteRange(headers
, &start_pos
, &end_pos
)) {
405 current_pos_
= start_pos
;
// Content-Range positions are inclusive, hence the + 1. A missing (zero) or
// inconsistent end position leaves current_chunk_size_ at 0.
406 if (end_pos
&& end_pos
> start_pos
)
407 current_chunk_size_
= end_pos
- start_pos
+ 1;
// Asks loader_ for up to sizeof(buffer_) bytes of the response body, to be
// stored in buffer_, with DidRead() as the completion callback.
// NOTE(review): the statement guarded by the final check (the call did not
// report PP_OK_COMPLETIONPENDING) and the closing brace are not visible in
// this excerpt.
414 void DocumentLoader::ReadMore() {
415 pp::CompletionCallback callback
=
416 loader_factory_
.NewCallback(&DocumentLoader::DidRead
);
417 int rv
= loader_
.ReadResponseBody(buffer_
, sizeof(buffer_
), callback
);
418 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Completion callback for loader_.ReadResponseBody(); |result| is used as the
// number of bytes now in buffer_. For a multi-part response the part headers
// at the start of the buffer are scanned for a byte range first. The data is
// then clipped to the expected chunk size and either written to chunk_stream_
// (document size known) or appended to chunk_buffer_ (size unknown), after
// which the client is notified.
//
// NOTE(review): several lines of this function are absent from this excerpt
// (the opening condition on |result|, adjustments of |start| / |length| after
// the part headers, else-branches, the body of the PP_OK branch and closing
// braces). The comments describe only what is visible; confirm against the
// complete file.
422 void DocumentLoader::DidRead(int32_t result
) {
424 char* start
= buffer_
;
425 size_t length
= result
;
426 if (is_multipart_
&& result
> 2) {
// Look for the blank line that ends the part headers: "\n\n" or "\r\n\r\n".
// NOTE(review): a line of this condition is missing here (the parentheses are
// unbalanced as shown); presumably it guards the i - 3 / i - 4 accesses, which
// would otherwise read before the start of buffer_ for small i. Confirm.
427 for (int i
= 2; i
< result
; ++i
) {
428 if ((buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\n') ||
430 buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\r' &&
431 buffer_
[i
- 3] == '\n' && buffer_
[i
- 4] == '\r')) {
// The first i bytes of the buffer are the part headers; take the byte range
// from them.
432 uint32_t start_pos
, end_pos
;
433 if (GetByteRange(std::string(buffer_
, i
), &start_pos
, &end_pos
)) {
434 current_pos_
= start_pos
;
437 if (end_pos
&& end_pos
> start_pos
)
438 current_chunk_size_
= end_pos
- start_pos
+ 1;
444 // Reset this flag so we don't look inside the buffer in future calls of
445 // DidRead for this response. Note that this code DOES NOT handle multi-
446 // part responses with more than one part (we don't issue them at the
447 // moment, so they shouldn't arrive).
448 is_multipart_
= false;
// Never take more than what is left of the expected chunk.
451 if (current_chunk_size_
&&
452 current_chunk_read_
+ length
> current_chunk_size_
)
453 length
= current_chunk_size_
- current_chunk_read_
;
456 if (document_size_
> 0) {
457 chunk_stream_
.WriteData(current_pos_
, start
, length
);
459 // If we did not get content-length in the response, we can't
460 // preallocate buffer for the entire document. Resizing array causing
461 // memory fragmentation issues on the large files and OOM exceptions.
462 // To fix this, we collect all chunks of the file to the list and
463 // concatenate them together after request is complete.
464 chunk_buffer_
.push_back(std::vector
<unsigned char>());
465 chunk_buffer_
.back().resize(length
);
466 memcpy(&(chunk_buffer_
.back()[0]), start
, length
);
// Advance the write position and the per-chunk byte count, then tell the
// client that new data arrived.
468 current_pos_
+= length
;
469 current_chunk_read_
+= length
;
470 client_
->OnNewDataAvailable();
473 } else if (result
== PP_OK
) {
// Handles the end of a request. In full-document mode with an unknown size,
// the chunks gathered in chunk_buffer_ are copied into chunk_stream_ and the
// final size is recorded before the client is told the document is complete.
// In partial mode the finished request is removed from the queue and the
// client is notified; then either queued requests are resumed or the next
// missing range is requested.
//
// NOTE(review): many lines of this function are absent from this excerpt
// (the declaration and advancement of |pos| in the copy loop, return
// statements, else-branches and closing braces), so the exact branch
// structure cannot be read from here. Confirm against the complete file.
480 void DocumentLoader::ReadComplete() {
481 if (!partial_document_
) {
482 if (document_size_
== 0) {
483 // For the document with no "content-length" specified we've collected all
484 // the chunks already. Let's allocate final document buffer and copy them
// current_pos_ has been advanced by every chunk read, so it is the total
// number of bytes received.
486 chunk_stream_
.Preallocate(current_pos_
);
488 for (auto& chunk
: chunk_buffer_
) {
489 chunk_stream_
.WriteData(pos
, &(chunk
[0]), chunk
.size());
492 chunk_buffer_
.clear();
494 document_size_
= current_pos_
;
495 client_
->OnDocumentComplete();
// The request at the front of the queue is the one that just finished.
499 request_pending_
= false;
500 pending_requests_
.pop_front();
502 // If there are more pending requests - continue downloading.
503 if (!pending_requests_
.empty()) {
504 DownloadPendingRequests();
508 if (IsDocumentComplete()) {
509 client_
->OnDocumentComplete();
// NOTE(review): the condition guarding the next notification is not visible.
514 client_
->OnPartialDocumentLoaded();
516 client_
->OnPendingRequestComplete();
517 header_request_
= false;
519 // The OnPendingRequestComplete could have added more requests.
520 if (!pending_requests_
.empty()) {
521 DownloadPendingRequests();
523 // Document is not complete and we have no outstanding requests.
524 // Let's keep downloading PDF file in small chunks.
525 uint32_t pos
= chunk_stream_
.GetFirstMissingByte();
526 std::vector
<std::pair
<size_t, size_t> > ranges
;
527 chunk_stream_
.GetMissedRanges(pos
, GetRequestSize(), &ranges
);
528 DCHECK(!ranges
.empty());
529 RequestData(ranges
[0].first
, ranges
[0].second
);
533 uint32_t DocumentLoader::GetRequestSize() const {
534 // Document loading strategy:
535 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
536 // double the size (64k), and so on, until we cap max request size at 2M for
537 // 71 or more requests.
538 uint32_t limited_count
= std::min(std::max(requests_count_
, 10u), 70u);
539 return 32 * 1024 * (1 << ((limited_count
- 1) / 10u));
542 } // namespace chrome_pdf