1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf
{
17 // Document below size will be downloaded in one chunk.
18 const uint32 kMinFileSize
= 64*1024;
20 DocumentLoader::DocumentLoader(Client
* client
)
21 : client_(client
), partial_document_(false), request_pending_(false),
22 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
23 document_size_(0), header_request_(true), is_multipart_(false) {
24 loader_factory_
.Initialize(this);
27 DocumentLoader::~DocumentLoader() {
// Sets up loading state from the HTTP response headers of the main request.
//
// Response headers are taken from |headers| when it is non-empty; otherwise
// they are read from loader_.GetResponseInfo().GetHeaders() (only when that
// Var is a string). Header names are matched with LowerCaseEqualsASCII:
//   content-length      -> content_length (parsed with atoi)
//   accept-ranges       -> accept_ranges_bytes (value matched against "bytes")
//   content-encoding    -> content_encoded (set when the header is present)
//   content-type        -> type, cut at the first ';' and whitespace-trimmed
//   content-disposition -> disposition
// Afterwards the type is tested against a list of PDF-like suffixes (together
// with a "blob:" prefix test on |url|) and the disposition is tested for an
// "attachment" prefix.
// NOTE(review): what those two checks do when they match is not visible here;
// presumably the response is rejected -- confirm.
// When content_length is positive the chunk stream is preallocated, and
// document_size_ is set to content_length. LoadPartialDocument() is reached
// only when content_length > kMinFileSize and accept_ranges_bytes hold.
// NOTE(review): atoi() yields an int that is stored in a uint32; a negative or
// out-of-range Content-Length would not be detected here -- confirm whether
// that needs handling.
30 bool DocumentLoader::Init(const pp::URLLoader
& loader
,
31 const std::string
& url
,
32 const std::string
& headers
) {
37 std::string response_headers
;
38 if (!headers
.empty()) {
39 response_headers
= headers
;
41 pp::URLResponseInfo response
= loader_
.GetResponseInfo();
42 pp::Var headers_var
= response
.GetHeaders();
44 if (headers_var
.is_string()) {
45 response_headers
= headers_var
.AsString();
49 bool accept_ranges_bytes
= false;
50 bool content_encoded
= false;
51 uint32 content_length
= 0;
53 std::string disposition
;
54 if (!response_headers
.empty()) {
55 net::HttpUtil::HeadersIterator
it(response_headers
.begin(),
56 response_headers
.end(), "\n");
57 while (it
.GetNext()) {
58 if (LowerCaseEqualsASCII(it
.name(), "content-length")) {
59 content_length
= atoi(it
.values().c_str());
60 } else if (LowerCaseEqualsASCII(it
.name(), "accept-ranges")) {
61 accept_ranges_bytes
= LowerCaseEqualsASCII(it
.values(), "bytes");
62 } else if (LowerCaseEqualsASCII(it
.name(), "content-encoding")) {
63 content_encoded
= true;
64 } else if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
66 size_t semi_colon_pos
= type
.find(';');
67 if (semi_colon_pos
!= std::string::npos
) {
68 type
= type
.substr(0, semi_colon_pos
);
70 TrimWhitespace(type
, base::TRIM_ALL
, &type
);
71 } else if (LowerCaseEqualsASCII(it
.name(), "content-disposition")) {
72 disposition
= it
.values();
77 !EndsWith(type
, "/pdf", false) &&
78 !EndsWith(type
, ".pdf", false) &&
79 !EndsWith(type
, "/x-pdf", false) &&
80 !EndsWith(type
, "/*", false) &&
81 !EndsWith(type
, "/acrobat", false) &&
82 !EndsWith(type
, "/unknown", false) &&
83 !StartsWithASCII(url
, "blob:", false)) {
86 if (StartsWithASCII(disposition
, "attachment", false)) {
90 if (content_length
> 0)
91 chunk_stream_
.Preallocate(content_length
);
93 document_size_
= content_length
;
96 // Enable partial loading only if file size is above the threshold.
97 // It will allow avoiding latency for multiple requests.
98 if (content_length
> kMinFileSize
&&
99 accept_ranges_bytes
&&
101 LoadPartialDocument();
// Switches the loader into partial-document mode: sets partial_document_,
// replaces loader_ with a default-constructed pp::URLLoader, marks the next
// request as the header request, and requests the first
// min(GetRequestSize(), document_size_) bytes starting at offset 0.
108 void DocumentLoader::LoadPartialDocument() {
109 partial_document_
= true;
110 // Force the main request to be cancelled, since if we're a full-frame plugin
111 // there could be other references to the loader.
113 loader_
= pp::URLLoader();
114 // Download file header.
115 header_request_
= true;
116 RequestData(0, std::min(GetRequestSize(), document_size_
));
// Switches the loader into full-document mode: clears partial_document_ and
// discards any chunks previously collected in chunk_buffer_.
119 void DocumentLoader::LoadFullDocument() {
120 partial_document_
= false;
121 chunk_buffer_
.clear();
// Reports whether the whole document has been downloaded, i.e. whether the
// range of document_size_ bytes starting at offset 0 is available. A
// document_size_ of 0 means the size is unknown and is checked for first.
125 bool DocumentLoader::IsDocumentComplete() const {
126 if (document_size_
== 0) // Document size unknown.
128 return IsDataAvailable(0, document_size_
);
// Computes the number of document bytes downloaded so far. For a known
// document size it starts from document_size_ and subtracts the size (the
// pair's second member) of every range that
// chunk_stream_.GetMissedRanges(0, document_size_, ...) reports as missing.
// An unknown size (document_size_ == 0) is handled separately first.
131 uint32
DocumentLoader::GetAvailableData() const {
132 if (document_size_
== 0) { // If document size is unknown.
136 std::vector
<std::pair
<size_t, size_t> > ranges
;
137 chunk_stream_
.GetMissedRanges(0, document_size_
, &ranges
);
138 uint32 available
= document_size_
;
139 std::vector
<std::pair
<size_t, size_t> >::iterator it
;
140 for (it
= ranges
.begin(); it
!= ranges
.end(); ++it
) {
141 available
-= it
->second
;
146 void DocumentLoader::ClearPendingRequests() {
147 // The first item in the queue is pending (need to keep it in the queue).
148 if (pending_requests_
.size() > 1) {
149 // Remove all elements except the first one.
150 pending_requests_
.erase(++pending_requests_
.begin(),
151 pending_requests_
.end());
155 bool DocumentLoader::GetBlock(uint32 position
, uint32 size
, void* buf
) const {
156 return chunk_stream_
.ReadData(position
, size
, buf
);
159 bool DocumentLoader::IsDataAvailable(uint32 position
, uint32 size
) const {
160 return chunk_stream_
.IsRangeAvailable(position
, size
);
// Queues a request for |size| bytes starting at |position| and then calls
// DownloadPendingRequests(). Must only be called in partial-document mode
// (enforced by the DCHECK). Requests arriving after the document is already
// complete are special-cased first; see the workaround note below.
163 void DocumentLoader::RequestData(uint32 position
, uint32 size
) {
164 DCHECK(partial_document_
);
166 // We have some artefact request from
167 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
168 // document is complete.
169 // We need this fix in PDFIum. Adding this as a work around.
170 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
172 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
173 if (IsDocumentComplete())
176 pending_requests_
.push_back(std::pair
<size_t, size_t>(position
, size
));
177 DownloadPendingRequests();
// Issues the network request for the entry at the head of pending_requests_.
//
// Visible steps, in order:
//  1. Guard on a request already being in flight (request_pending_) or an
//     empty queue.
//  2. While more than one entry is queued, pop head entries whose data is
//     already available.
//  3. Read (pos, size) from the head entry and check whether it is already
//     available.
//  4. If part of the head request is still missing, replace it at the front of
//     the queue with its missing ranges and re-read (pos, size) from the new
//     head.
//  5. If the request is smaller than GetRequestSize() and partial loading is
//     on, compute an expanded range (new_pos, new_size): first towards the end
//     of the file (clamped to document_size_), then towards the beginning,
//     each time narrowed to a range that is still missing.
//  6. Using GetLastByteBefore / GetFirstByteAfter: move the start back to
//     last_byte_before when it is less than cur_request_size away, and extend
//     the end up to first_byte_after when the remaining gap is at most
//     cur_request_size.
//  7. Set request_pending_, create a fresh URL loader through the client and
//     open the range request with DidOpen as completion callback.
180 void DocumentLoader::DownloadPendingRequests() {
181 if (request_pending_
|| pending_requests_
.empty())
184 // Remove already completed requests.
185 // By design DownloadPendingRequests() should have at least 1 request in the
186 // queue. ReadComplete() will remove the last pending request from the queue.
187 while (pending_requests_
.size() > 1) {
188 if (IsDataAvailable(pending_requests_
.front().first
,
189 pending_requests_
.front().second
)) {
190 pending_requests_
.pop_front();
196 uint32 pos
= pending_requests_
.front().first
;
197 uint32 size
= pending_requests_
.front().second
;
198 if (IsDataAvailable(pos
, size
)) {
203 // If current request has been partially downloaded already, split it into
204 // a few smaller requests.
205 std::vector
<std::pair
<size_t, size_t> > ranges
;
206 chunk_stream_
.GetMissedRanges(pos
, size
, &ranges
);
207 if (ranges
.size() > 0) {
208 pending_requests_
.pop_front();
209 pending_requests_
.insert(pending_requests_
.begin(),
210 ranges
.begin(), ranges
.end());
211 pos
= pending_requests_
.front().first
;
212 size
= pending_requests_
.front().second
;
215 uint32 cur_request_size
= GetRequestSize();
216 // If size is less than default request, try to expand download range for
217 // more optimal download.
218 if (size
< cur_request_size
&& partial_document_
) {
219 // First, try to expand block towards the end of the file.
220 uint32 new_pos
= pos
;
221 uint32 new_size
= cur_request_size
;
222 if (pos
+ new_size
> document_size_
)
223 new_size
= document_size_
- pos
;
225 std::vector
<std::pair
<size_t, size_t> > ranges
;
226 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
227 new_pos
= ranges
[0].first
;
228 new_size
= ranges
[0].second
;
231 // Second, try to expand block towards the beginning of the file.
232 if (new_size
< cur_request_size
) {
233 uint32 block_end
= new_pos
+ new_size
;
234 if (block_end
> cur_request_size
) {
235 new_pos
= block_end
- cur_request_size
;
239 new_size
= block_end
- new_pos
;
241 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
242 new_pos
= ranges
.back().first
;
243 new_size
= ranges
.back().second
;
250 size_t last_byte_before
= chunk_stream_
.GetLastByteBefore(pos
);
251 size_t first_byte_after
= chunk_stream_
.GetFirstByteAfter(pos
+ size
- 1);
252 if (pos
- last_byte_before
< cur_request_size
) {
253 size
= pos
+ size
- last_byte_before
;
254 pos
= last_byte_before
;
257 if ((pos
+ size
< first_byte_after
) &&
258 (pos
+ size
+ cur_request_size
>= first_byte_after
))
259 size
= first_byte_after
- pos
;
261 request_pending_
= true;
263 // Start downloading first pending request.
265 loader_
= client_
->CreateURLLoader();
266 pp::CompletionCallback callback
=
267 loader_factory_
.NewCallback(&DocumentLoader::DidOpen
);
268 pp::URLRequestInfo request
= GetRequest(pos
, size
);
270 int rv
= loader_
.Open(request
, callback
);
271 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Builds the pp::URLRequestInfo for a byte-range download: a GET of url_ that
// follows redirects and carries a "Range: bytes=<first>-<last>" header, where
// <first> is |position| and <last> is position + size - 1 (inclusive).
// NOTE(review): |position| is a uint32 but is formatted with "%d"; offsets
// above INT_MAX would presumably be printed as negative numbers -- consider
// "%u" and confirm.
275 pp::URLRequestInfo
DocumentLoader::GetRequest(uint32 position
,
277 pp::URLRequestInfo
request(client_
->GetPluginInstance());
278 request
.SetURL(url_
.c_str());
279 request
.SetMethod("GET");
280 request
.SetFollowRedirects(true);
282 const size_t kBufSize
= 100;
284 // According to rfc2616, byte range specifies position of the first and last
285 // bytes in the requested range inclusively. Therefore we should subtract 1
286 // from the position + size, to get index of the last byte that needs to be
288 base::snprintf(buf
, kBufSize
, "Range: bytes=%d-%d", position
,
289 position
+ size
- 1);
291 request
.SetHeaders(header
);
// Completion callback for loader_.Open(). |result| is first compared against
// PP_OK. The per-response state (is_multipart_, current_chunk_size_,
// current_chunk_read_) is then reset and the response headers are inspected:
//  - If the Content-Type carries a multipart boundary, the response is marked
//    as multipart and the boundary is remembered; the position is left for
//    DidRead to extract from the part headers.
//  - Otherwise a byte range is looked up in the headers: when found,
//    current_pos_ becomes the range start, and current_chunk_size_ becomes
//    end - start + 1 when end is non-zero and greater than start.
296 void DocumentLoader::DidOpen(int32_t result
) {
297 if (result
!= PP_OK
) {
302 is_multipart_
= false;
303 current_chunk_size_
= 0;
304 current_chunk_read_
= 0;
306 pp::Var headers_var
= loader_
.GetResponseInfo().GetHeaders();
308 if (headers_var
.is_string())
309 headers
= headers_var
.AsString();
311 std::string boundary
= GetMultiPartBoundary(headers
);
312 if (boundary
.size()) {
313 // Leave position untouched for now, when we read the data we'll get it.
314 is_multipart_
= true;
315 multipart_boundary_
= boundary
;
317 // Need to make sure that the server returned a byte-range, since it's
318 // possible for a server to just ignore our byte-range request and just
319 // return the entire document even if it supports byte-range requests.
320 // i.e. sniff response to
321 // http://www.act.org/compass/sample/pdf/geometry.pdf
323 uint32 start_pos
, end_pos
;
324 if (GetByteRange(headers
, &start_pos
, &end_pos
)) {
325 current_pos_
= start_pos
;
326 if (end_pos
&& end_pos
> start_pos
)
327 current_chunk_size_
= end_pos
- start_pos
+ 1;
// Extracts the byte range from a "Content-Range" header found in |headers|.
//
// For a value that starts with "bytes", the text after that prefix is split at
// the first '-': the part before it is parsed into |*start| and the part after
// it into |*end|, both with atoi after leading whitespace is trimmed. When no
// '-' is present, range_end stays empty and atoi("") is stored in |*end|.
// NOTE(review): atoi() performs no error reporting, and its int result is
// stored into uint32 outputs; malformed or very large values are not detected
// here -- confirm whether callers rely on that.
334 bool DocumentLoader::GetByteRange(const std::string
& headers
, uint32
* start
,
336 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
337 while (it
.GetNext()) {
338 if (LowerCaseEqualsASCII(it
.name(), "content-range")) {
339 std::string range
= it
.values().c_str();
340 if (StartsWithASCII(range
, "bytes", false)) {
341 range
= range
.substr(strlen("bytes"));
342 std::string::size_type pos
= range
.find('-');
343 std::string range_end
;
344 if (pos
!= std::string::npos
)
345 range_end
= range
.substr(pos
+ 1);
346 TrimWhitespaceASCII(range
, base::TRIM_LEADING
, &range
);
347 TrimWhitespaceASCII(range_end
, base::TRIM_LEADING
, &range_end
);
348 *start
= atoi(range
.c_str());
349 *end
= atoi(range_end
.c_str());
// Looks for a "Content-Type" header in |headers| whose value starts with
// "multipart/" and returns the text following "boundary=" in that value
// (the "+ 9" skips the nine characters of "boundary="). Returns an empty
// string when no boundary is returned from the header loop.
// Note that the header value is lower-cased before the boundary is extracted,
// so the returned boundary is lower-case as well.
357 std::string
DocumentLoader::GetMultiPartBoundary(const std::string
& headers
) {
358 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
359 while (it
.GetNext()) {
360 if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
361 std::string type
= StringToLowerASCII(it
.values());
362 if (StartsWithASCII(type
, "multipart/", true)) {
363 const char* boundary
= strstr(type
.c_str(), "boundary=");
369 return std::string(boundary
+ 9);
373 return std::string();
// Starts reading the next piece of the response body into buffer_ (up to
// sizeof(buffer_) bytes), with DidRead as the completion callback. A return
// value other than PP_OK_COMPLETIONPENDING is special-cased right after the
// call.
376 void DocumentLoader::ReadMore() {
377 pp::CompletionCallback callback
=
378 loader_factory_
.NewCallback(&DocumentLoader::DidRead
);
379 int rv
= loader_
.ReadResponseBody(buffer_
, sizeof(buffer_
), callback
);
380 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Completion callback for loader_.ReadResponseBody(); |result| is used as the
// number of bytes now in buffer_ and is compared against PP_OK at the end.
//
// For a multipart response the buffer is first scanned for the blank line
// ("\n\n" or "\r\n\r\n") that ends the part headers; the byte range found in
// those headers sets current_pos_ and, when valid, current_chunk_size_. The
// multipart flag is then cleared so later reads of the same response are not
// scanned again.
// The data length is clamped so that no more than current_chunk_size_ bytes
// are consumed for this chunk. The data is written into chunk_stream_ at
// current_pos_ when the document size is known, or appended to chunk_buffer_
// otherwise. Finally current_pos_ / current_chunk_read_ advance by the
// consumed length and the client is notified via OnNewDataAvailable().
384 void DocumentLoader::DidRead(int32_t result
) {
386 char* start
= buffer_
;
387 size_t length
= result
;
388 if (is_multipart_
&& result
> 2) {
389 for (int i
= 2; i
< result
; ++i
) {
390 if ((buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\n') ||
392 buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\r' &&
393 buffer_
[i
- 3] == '\n' && buffer_
[i
- 4] == '\r')) {
394 uint32 start_pos
, end_pos
;
395 if (GetByteRange(std::string(buffer_
, i
), &start_pos
, &end_pos
)) {
396 current_pos_
= start_pos
;
399 if (end_pos
&& end_pos
> start_pos
)
400 current_chunk_size_
= end_pos
- start_pos
+ 1;
406 // Reset this flag so we don't look inside the buffer in future calls of
407 // DidRead for this response. Note that this code DOES NOT handle multi-
408 // part responses with more than one part (we don't issue them at the
409 // moment, so they shouldn't arrive).
410 is_multipart_
= false;
413 if (current_chunk_size_
&&
414 current_chunk_read_
+ length
> current_chunk_size_
)
415 length
= current_chunk_size_
- current_chunk_read_
;
418 if (document_size_
> 0) {
419 chunk_stream_
.WriteData(current_pos_
, start
, length
);
421 // If we did not get content-length in the response, we can't
422 // preallocate buffer for the entire document. Resizing array causing
423 // memory fragmentation issues on the large files and OOM exceptions.
424 // To fix this, we collect all chunks of the file to the list and
425 // concatenate them together after request is complete.
426 chunk_buffer_
.push_back(std::vector
<unsigned char>());
427 chunk_buffer_
.back().resize(length
);
428 memcpy(&(chunk_buffer_
.back()[0]), start
, length
);
430 current_pos_
+= length
;
431 current_chunk_read_
+= length
;
432 client_
->OnNewDataAvailable();
435 } else if (result
== PP_OK
) {
// Handles the end of a response body.
//
// Full-document mode (!partial_document_): when the size was unknown, the
// chunks collected in chunk_buffer_ are copied into a chunk stream preallocated
// to current_pos_ bytes and the buffer list is cleared; document_size_ is then
// set to current_pos_ and the client gets OnDocumentComplete().
//
// Partial mode: the finished request is popped from the queue and
// request_pending_ is cleared. If more requests are queued, downloading
// continues. Otherwise the client is told about completion
// (OnDocumentComplete when everything is available, OnPartialDocumentLoaded /
// OnPendingRequestComplete on the visible paths), and if that produced no new
// requests the next missing range -- at most GetRequestSize() bytes from the
// first missing byte -- is requested so the download keeps progressing.
442 void DocumentLoader::ReadComplete() {
443 if (!partial_document_
) {
444 if (document_size_
== 0) {
445 // For the document with no 'content-length' specified we've collected all
446 // the chunks already. Let's allocate final document buffer and copy them
448 chunk_stream_
.Preallocate(current_pos_
);
450 std::list
<std::vector
<unsigned char> >::iterator it
;
451 for (it
= chunk_buffer_
.begin(); it
!= chunk_buffer_
.end(); ++it
) {
452 chunk_stream_
.WriteData(pos
, &((*it
)[0]), it
->size());
455 chunk_buffer_
.clear();
457 document_size_
= current_pos_
;
458 client_
->OnDocumentComplete();
462 request_pending_
= false;
463 pending_requests_
.pop_front();
465 // If there are more pending requests - continue downloading.
466 if (!pending_requests_
.empty()) {
467 DownloadPendingRequests();
471 if (IsDocumentComplete()) {
472 client_
->OnDocumentComplete();
477 client_
->OnPartialDocumentLoaded();
479 client_
->OnPendingRequestComplete();
480 header_request_
= false;
482 // The OnPendingRequestComplete could have added more requests.
483 if (!pending_requests_
.empty()) {
484 DownloadPendingRequests();
486 // Document is not complete and we have no outstanding requests.
487 // Let's keep downloading PDF file in small chunks.
488 uint32 pos
= chunk_stream_
.GetFirstMissingByte();
489 std::vector
<std::pair
<size_t, size_t> > ranges
;
490 chunk_stream_
.GetMissedRanges(pos
, GetRequestSize(), &ranges
);
491 DCHECK(ranges
.size() > 0);
492 RequestData(ranges
[0].first
, ranges
[0].second
);
496 uint32
DocumentLoader::GetRequestSize() const {
497 // Document loading strategy:
498 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
499 // double the size (64k), and so on, until we cap max request size at 2M for
500 // 71 or more requests.
501 uint32 limited_count
= std::min(std::max(requests_count_
, 10u), 70u);
502 return 32*1024 * (1 << ((limited_count
- 1) / 10u));
505 } // namespace chrome_pdf