1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/document_loader.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/http/http_util.h"
10 #include "ppapi/c/pp_errors.h"
11 #include "ppapi/cpp/url_loader.h"
12 #include "ppapi/cpp/url_request_info.h"
13 #include "ppapi/cpp/url_response_info.h"
15 namespace chrome_pdf
{
17 // Documents at or below this size (in bytes) are downloaded in one chunk;
// partial loading is only considered for documents larger than this.
18 const uint32 kMinFileSize
= 64*1024;
// Constructs a loader bound to |client|. The position/size counters start at
// zero, the partial/pending/multipart flags start cleared, header_request_
// starts set, and loader_factory_ (used to create completion callbacks) is
// initialized with this object.
20 DocumentLoader::DocumentLoader(Client
* client
)
21 : client_(client
), partial_document_(false), request_pending_(false),
22 current_pos_(0), current_chunk_size_(0), current_chunk_read_(0),
23 document_size_(0), header_request_(true), is_multipart_(false) {
24 loader_factory_
.Initialize(this);
// Destructor.
27 DocumentLoader::~DocumentLoader() {
// Inspects the response headers of the initial document request and decides
// how the document will be loaded.
//
// The header text comes from |headers| when it is non-empty; the headers
// reported by loader_'s response info are the other source. From those headers
// the function extracts Content-Length, Accept-Ranges, Content-Encoding,
// Content-Type and Content-Disposition, preallocates the chunk stream when the
// length is known, records the length in document_size_, and considers partial
// loading for documents larger than kMinFileSize that accept byte ranges.
30 bool DocumentLoader::Init(const pp::URLLoader
& loader
,
31 const std::string
& url
,
32 const std::string
& headers
) {
37 std::string response_headers
;
// Caller-supplied headers are used when present.
38 if (!headers
.empty()) {
39 response_headers
= headers
;
// Headers as reported by the loader's own response info.
41 pp::URLResponseInfo response
= loader_
.GetResponseInfo();
42 pp::Var headers_var
= response
.GetHeaders();
44 if (headers_var
.is_string()) {
45 response_headers
= headers_var
.AsString();
49 bool accept_ranges_bytes
= false;
50 bool content_encoded
= false;
51 uint32 content_length
= 0;
53 std::string disposition
;
54 if (!response_headers
.empty()) {
// Walk the header lines, which are separated by "\n".
55 net::HttpUtil::HeadersIterator
it(response_headers
.begin(),
56 response_headers
.end(), "\n");
57 while (it
.GetNext()) {
58 if (LowerCaseEqualsASCII(it
.name(), "content-length")) {
// NOTE(review): atoi() returns a signed int that is stored in a uint32
// here, so a negative or out-of-range Content-Length would wrap —
// confirm whether that needs guarding.
59 content_length
= atoi(it
.values().c_str());
60 } else if (LowerCaseEqualsASCII(it
.name(), "accept-ranges")) {
61 accept_ranges_bytes
= LowerCaseEqualsASCII(it
.values(), "bytes");
62 } else if (LowerCaseEqualsASCII(it
.name(), "content-encoding")) {
// Any Content-Encoding header at all marks the response as encoded.
63 content_encoded
= true;
64 } else if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
// Drop any parameters after ';' and trim whitespace from the type.
66 size_t semi_colon_pos
= type
.find(';');
67 if (semi_colon_pos
!= std::string::npos
) {
68 type
= type
.substr(0, semi_colon_pos
);
70 TrimWhitespace(type
, base::TRIM_ALL
, &type
);
71 } else if (LowerCaseEqualsASCII(it
.name(), "content-disposition")) {
72 disposition
= it
.values();
// Content-type suffix checks: this part of the condition holds only when the
// type ends with none of the suffixes listed below.
77 !EndsWith(type
, "/pdf", false) &&
78 !EndsWith(type
, ".pdf", false) &&
79 !EndsWith(type
, "/x-pdf", false) &&
80 !EndsWith(type
, "/*", false) &&
81 !EndsWith(type
, "/acrobat", false) &&
82 !EndsWith(type
, "/unknown", false)) {
// Responses whose Content-Disposition starts with "attachment" are singled
// out here.
85 if (StartsWithASCII(disposition
, "attachment", false)) {
// Reserve storage up front when the server told us the length.
89 if (content_length
> 0)
90 chunk_stream_
.Preallocate(content_length
);
92 document_size_
= content_length
;
95 // Enable partial loading only if file size is above the threshold.
96 // It will allow avoiding latency for multiple requests.
97 if (content_length
> kMinFileSize
&&
98 accept_ranges_bytes
&&
100 LoadPartialDocument();
// Switches to partial (byte-range) loading: marks the document as partial,
// replaces loader_ with a fresh pp::URLLoader, and requests the beginning of
// the file, limited to the smaller of GetRequestSize() and document_size_.
107 void DocumentLoader::LoadPartialDocument() {
108 partial_document_
= true;
109 // Force the main request to be cancelled, since if we're a full-frame plugin
110 // there could be other references to the loader.
112 loader_
= pp::URLLoader();
113 // Download file header.
114 header_request_
= true;
115 RequestData(0, std::min(GetRequestSize(), document_size_
));
// Switches to full-document loading: clears the partial flag and discards any
// chunks accumulated in chunk_buffer_.
118 void DocumentLoader::LoadFullDocument() {
119 partial_document_
= false;
120 chunk_buffer_
.clear();
// Reports whether the whole document is available, i.e. whether the range
// starting at 0 with length document_size_ is present in the chunk stream.
// A document_size_ of 0 means the size is unknown and is checked first.
124 bool DocumentLoader::IsDocumentComplete() const {
125 if (document_size_
== 0) // Document size unknown.
127 return IsDataAvailable(0, document_size_
);
// Computes how many bytes of the document are available: starts from
// document_size_ and subtracts the second member (the length) of every missed
// range reported by the chunk stream. The unknown-size case
// (document_size_ == 0) is checked first.
130 uint32
DocumentLoader::GetAvailableData() const {
131 if (document_size_
== 0) { // If document size is unknown.
135 std::vector
<std::pair
<size_t, size_t> > ranges
;
136 chunk_stream_
.GetMissedRanges(0, document_size_
, &ranges
);
137 uint32 available
= document_size_
;
138 std::vector
<std::pair
<size_t, size_t> >::iterator it
;
139 for (it
= ranges
.begin(); it
!= ranges
.end(); ++it
) {
140 available
-= it
->second
;
// Drops every queued request except the one at the front of
// pending_requests_, which is treated as the request currently in flight.
145 void DocumentLoader::ClearPendingRequests() {
146 // The first item in the queue is pending (need to keep it in the queue).
147 if (pending_requests_
.size() > 1) {
148 // Remove all elements except the first one.
149 pending_requests_
.erase(++pending_requests_
.begin(),
150 pending_requests_
.end());
// Reads |size| bytes starting at |position| from the chunk stream into |buf|.
// Returns whatever chunk_stream_.ReadData() returns.
154 bool DocumentLoader::GetBlock(uint32 position
, uint32 size
, void* buf
) const {
155 return chunk_stream_
.ReadData(position
, size
, buf
);
// Reports whether the |size| bytes starting at |position| are available in
// the chunk stream (forwards to chunk_stream_.IsRangeAvailable()).
158 bool DocumentLoader::IsDataAvailable(uint32 position
, uint32 size
) const {
159 return chunk_stream_
.IsRangeAvailable(position
, size
);
// Queues a request for |size| bytes starting at |position| and then tries to
// start downloading. Only valid in partial-document mode (DCHECKed). Requests
// arriving after the document is already complete are special-cased first.
162 void DocumentLoader::RequestData(uint32 position
, uint32 size
) {
163 DCHECK(partial_document_
);
165 // We have some artefact request from
166 // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after
167 // document is complete.
168 // We need this fix in PDFium. Adding this as a workaround.
169 // Bug: http://code.google.com/p/chromium/issues/detail?id=79996
171 // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf
172 if (IsDocumentComplete())
175 pending_requests_
.push_back(std::pair
<size_t, size_t>(position
, size
));
176 DownloadPendingRequests();
// Starts the download for the request at the front of pending_requests_.
//
// Nothing is started while another request is in flight or the queue is empty.
// Before issuing the request, requests whose data is already available are
// dropped, a partially-downloaded request is replaced by its missing ranges,
// and a request smaller than GetRequestSize() may be widened. The request is
// then opened on a new URL loader obtained from the client, with DidOpen() as
// the completion callback.
179 void DocumentLoader::DownloadPendingRequests() {
180 if (request_pending_
|| pending_requests_
.empty())
183 // Remove already completed requests.
184 // By design DownloadPendingRequests() should have at least 1 request in the
185 // queue. ReadComplete() will remove the last pending request from the queue.
186 while (pending_requests_
.size() > 1) {
187 if (IsDataAvailable(pending_requests_
.front().first
,
188 pending_requests_
.front().second
)) {
189 pending_requests_
.pop_front();
// Each queue entry is a (position, size) pair.
195 uint32 pos
= pending_requests_
.front().first
;
196 uint32 size
= pending_requests_
.front().second
;
197 if (IsDataAvailable(pos
, size
)) {
202 // If current request has been partially downloaded already, split it into
203 // a few smaller requests.
204 std::vector
<std::pair
<size_t, size_t> > ranges
;
205 chunk_stream_
.GetMissedRanges(pos
, size
, &ranges
);
206 if (ranges
.size() > 0) {
// Replace the front request with its missing sub-ranges and continue with
// the first of them.
207 pending_requests_
.pop_front();
208 pending_requests_
.insert(pending_requests_
.begin(),
209 ranges
.begin(), ranges
.end());
210 pos
= pending_requests_
.front().first
;
211 size
= pending_requests_
.front().second
;
214 uint32 cur_request_size
= GetRequestSize();
215 // If size is less than default request, try to expand download range for
216 // more optimal download.
217 if (size
< cur_request_size
&& partial_document_
) {
218 // First, try to expand block towards the end of the file.
219 uint32 new_pos
= pos
;
220 uint32 new_size
= cur_request_size
;
// Do not let the expanded block run past the end of the document.
221 if (pos
+ new_size
> document_size_
)
222 new_size
= document_size_
- pos
;
224 std::vector
<std::pair
<size_t, size_t> > ranges
;
225 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
// Keep only the first missing range of the expanded block.
226 new_pos
= ranges
[0].first
;
227 new_size
= ranges
[0].second
;
230 // Second, try to expand block towards the beginning of the file.
231 if (new_size
< cur_request_size
) {
232 uint32 block_end
= new_pos
+ new_size
;
233 if (block_end
> cur_request_size
) {
234 new_pos
= block_end
- cur_request_size
;
238 new_size
= block_end
- new_pos
;
240 if (chunk_stream_
.GetMissedRanges(new_pos
, new_size
, &ranges
)) {
// Keep only the last missing range, i.e. the one ending at block_end.
241 new_pos
= ranges
.back().first
;
242 new_size
= ranges
.back().second
;
// Pull the start of the request back to last_byte_before when it is less than
// one request size away, and stretch the end up to first_byte_after when that
// lies within one request size of the current end.
249 size_t last_byte_before
= chunk_stream_
.GetLastByteBefore(pos
);
250 size_t first_byte_after
= chunk_stream_
.GetFirstByteAfter(pos
+ size
- 1);
251 if (pos
- last_byte_before
< cur_request_size
) {
252 size
= pos
+ size
- last_byte_before
;
253 pos
= last_byte_before
;
256 if ((pos
+ size
< first_byte_after
) &&
257 (pos
+ size
+ cur_request_size
>= first_byte_after
))
258 size
= first_byte_after
- pos
;
260 request_pending_
= true;
262 // Start downloading first pending request.
264 loader_
= client_
->CreateURLLoader();
265 pp::CompletionCallback callback
=
266 loader_factory_
.NewCallback(&DocumentLoader::DidOpen
);
267 pp::URLRequestInfo request
= GetRequest(pos
, size
);
269 int rv
= loader_
.Open(request
, callback
);
// Any result other than "completion pending" is handled immediately.
270 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Builds the URL request used to fetch part of the document: a GET for url_
// that follows redirects and carries a "Range: bytes=first-last" header
// derived from |position| and the requested size.
274 pp::URLRequestInfo
DocumentLoader::GetRequest(uint32 position
,
276 pp::URLRequestInfo
request(client_
->GetPluginInstance());
277 request
.SetURL(url_
.c_str());
278 request
.SetMethod("GET");
279 request
.SetFollowRedirects(true);
281 const size_t kBufSize
= 100;
283 // According to rfc2616, byte range specifies position of the first and last
284 // bytes in the requested range inclusively. Therefore we should subtract 1
285 // from the position + size, to get index of the last byte that needs to be
// NOTE(review): |position| is a uint32 but is formatted with %d, so offsets
// of 2^31 and above would presumably be printed as negative numbers — confirm
// whether %u is intended.
287 base::snprintf(buf
, kBufSize
, "Range: bytes=%d-%d", position
,
288 position
+ size
- 1);
290 request
.SetHeaders(header
);
// Completion callback for the loader_.Open() call issued by
// DownloadPendingRequests(). |result| is the open result; anything other than
// PP_OK and HTTP 4xx statuses are special-cased first.
//
// For a usable response the per-chunk state is reset and the response headers
// are examined: a multipart boundary marks the response as multipart, and a
// Content-Range header supplies the position and size of the chunk that is
// about to be read.
295 void DocumentLoader::DidOpen(int32_t result
) {
296 if (result
!= PP_OK
) {
301 int32_t http_code
= loader_
.GetResponseInfo().GetStatusCode();
302 if (http_code
>= 400 && http_code
< 500) {
303 // Error accessing resource. 4xx error indicate subsequent requests
305 // E.g. resource has been removed from the server while loading it.
306 // https://code.google.com/p/chromium/issues/detail?id=414827
// Reset the per-response chunk bookkeeping.
310 is_multipart_
= false;
311 current_chunk_size_
= 0;
312 current_chunk_read_
= 0;
314 pp::Var headers_var
= loader_
.GetResponseInfo().GetHeaders();
316 if (headers_var
.is_string())
317 headers
= headers_var
.AsString();
319 std::string boundary
= GetMultiPartBoundary(headers
);
320 if (boundary
.size()) {
321 // Leave position untouched for now, when we read the data we'll get it.
322 is_multipart_
= true;
323 multipart_boundary_
= boundary
;
325 // Need to make sure that the server returned a byte-range, since it's
326 // possible for a server to just ignore our byte-range request and just
327 // return the entire document even if it supports byte-range requests.
328 // i.e. sniff response to
329 // http://www.act.org/compass/sample/pdf/geometry.pdf
331 uint32 start_pos
, end_pos
;
332 if (GetByteRange(headers
, &start_pos
, &end_pos
)) {
333 current_pos_
= start_pos
;
// Both range ends are inclusive, hence the +1 for the chunk size.
334 if (end_pos
&& end_pos
> start_pos
)
335 current_chunk_size_
= end_pos
- start_pos
+ 1;
// Looks through |headers| (lines separated by "\n") for a Content-Range
// header whose value starts with "bytes" and stores the parsed first and last
// byte positions in |*start| and |*end|.
//
// NOTE(review): both numbers are parsed with atoi() and stored into uint32,
// so negative or very large values would wrap — confirm whether that matters.
342 bool DocumentLoader::GetByteRange(const std::string
& headers
, uint32
* start
,
344 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
345 while (it
.GetNext()) {
346 if (LowerCaseEqualsASCII(it
.name(), "content-range")) {
347 std::string range
= it
.values().c_str();
348 if (StartsWithASCII(range
, "bytes", false)) {
// Strip the leading "bytes" unit; what follows the '-' is the end position.
349 range
= range
.substr(strlen("bytes"));
350 std::string::size_type pos
= range
.find('-');
351 std::string range_end
;
352 if (pos
!= std::string::npos
)
353 range_end
= range
.substr(pos
+ 1);
354 TrimWhitespaceASCII(range
, base::TRIM_LEADING
, &range
);
355 TrimWhitespaceASCII(range_end
, base::TRIM_LEADING
, &range_end
);
// atoi() stops at the first non-digit, so |range| yields the start value even
// though it still contains "-end".
356 *start
= atoi(range
.c_str());
357 *end
= atoi(range_end
.c_str());
// Extracts the multipart boundary from the Content-Type header in |headers|.
// Returns the text following "boundary=" when the content type starts with
// "multipart/", and an empty string when the end of the function is reached
// without finding one.
//
// NOTE(review): the boundary is taken from the lower-cased header value, so
// the original letter case of the boundary is not preserved — confirm that no
// caller compares it against the response body.
365 std::string
DocumentLoader::GetMultiPartBoundary(const std::string
& headers
) {
366 net::HttpUtil::HeadersIterator
it(headers
.begin(), headers
.end(), "\n");
367 while (it
.GetNext()) {
368 if (LowerCaseEqualsASCII(it
.name(), "content-type")) {
369 std::string type
= base::StringToLowerASCII(it
.values());
370 if (StartsWithASCII(type
, "multipart/", true)) {
371 const char* boundary
= strstr(type
.c_str(), "boundary=");
// 9 == strlen("boundary="): skip past the key to the value.
377 return std::string(boundary
+ 9);
381 return std::string();
// Issues a read of the response body into buffer_ (up to sizeof(buffer_)
// bytes) with DidRead() as the completion callback. A return value other than
// PP_OK_COMPLETIONPENDING is handled right away.
384 void DocumentLoader::ReadMore() {
385 pp::CompletionCallback callback
=
386 loader_factory_
.NewCallback(&DocumentLoader::DidRead
);
387 int rv
= loader_
.ReadResponseBody(buffer_
, sizeof(buffer_
), callback
);
388 if (rv
!= PP_OK_COMPLETIONPENDING
)
// Completion callback for ReadResponseBody(). |result| is used as the number
// of bytes now present in buffer_; PP_OK (zero) is handled in a separate
// branch.
//
// For a multipart response the buffer is scanned for the blank line that ends
// the part headers, and the byte range is parsed from those headers. The data
// length is clamped to what is left of the current chunk. The data is then
// either written into chunk_stream_ at current_pos_ or appended to
// chunk_buffer_, the position counters are advanced, and the client is told
// that new data is available.
392 void DocumentLoader::DidRead(int32_t result
) {
394 char* start
= buffer_
;
395 size_t length
= result
;
396 if (is_multipart_
&& result
> 2) {
// Look for "\n\n" or "\r\n\r\n" ending just before index i.
397 for (int i
= 2; i
< result
; ++i
) {
398 if ((buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\n') ||
400 buffer_
[i
- 1] == '\n' && buffer_
[i
- 2] == '\r' &&
401 buffer_
[i
- 3] == '\n' && buffer_
[i
- 4] == '\r')) {
402 uint32 start_pos
, end_pos
;
// The first i bytes of the buffer are the part headers.
403 if (GetByteRange(std::string(buffer_
, i
), &start_pos
, &end_pos
)) {
404 current_pos_
= start_pos
;
407 if (end_pos
&& end_pos
> start_pos
)
408 current_chunk_size_
= end_pos
- start_pos
+ 1;
414 // Reset this flag so we don't look inside the buffer in future calls of
415 // DidRead for this response. Note that this code DOES NOT handle multi-
416 // part responses with more than one part (we don't issue them at the
417 // moment, so they shouldn't arrive).
418 is_multipart_
= false;
// Never take more bytes than remain in the current chunk.
421 if (current_chunk_size_
&&
422 current_chunk_read_
+ length
> current_chunk_size_
)
423 length
= current_chunk_size_
- current_chunk_read_
;
426 if (document_size_
> 0) {
427 chunk_stream_
.WriteData(current_pos_
, start
, length
);
429 // If we did not get content-length in the response, we can't
430 // preallocate buffer for the entire document. Resizing array causing
431 // memory fragmentation issues on the large files and OOM exceptions.
432 // To fix this, we collect all chunks of the file to the list and
433 // concatenate them together after request is complete.
434 chunk_buffer_
.push_back(std::vector
<unsigned char>());
435 chunk_buffer_
.back().resize(length
);
436 memcpy(&(chunk_buffer_
.back()[0]), start
, length
);
438 current_pos_
+= length
;
439 current_chunk_read_
+= length
;
440 client_
->OnNewDataAvailable();
443 } else if (result
== PP_OK
) {
// Handles the end of a response.
//
// Full-document mode: when the size was unknown, the chunks collected in
// chunk_buffer_ are copied into chunk_stream_ (preallocated to current_pos_
// bytes) and the buffer is cleared; document_size_ is set to current_pos_ and
// the client is told the document is complete.
//
// Partial mode: the finished request is removed from the queue and the client
// is notified. Downloading continues with the next queued request or, when
// the queue is empty and the document is still incomplete, with a new request
// starting at the first missing byte.
450 void DocumentLoader::ReadComplete() {
451 if (!partial_document_
) {
452 if (document_size_
== 0) {
453 // For the document with no 'content-length' specified we've collected all
454 // the chunks already. Let's allocate final document buffer and copy them
456 chunk_stream_
.Preallocate(current_pos_
);
458 std::list
<std::vector
<unsigned char> >::iterator it
;
459 for (it
= chunk_buffer_
.begin(); it
!= chunk_buffer_
.end(); ++it
) {
460 chunk_stream_
.WriteData(pos
, &((*it
)[0]), it
->size());
463 chunk_buffer_
.clear();
465 document_size_
= current_pos_
;
466 client_
->OnDocumentComplete();
// The request at the front of the queue has finished.
470 request_pending_
= false;
471 pending_requests_
.pop_front();
473 // If there are more pending request - continue downloading.
474 if (!pending_requests_
.empty()) {
475 DownloadPendingRequests();
479 if (IsDocumentComplete()) {
480 client_
->OnDocumentComplete();
485 client_
->OnPartialDocumentLoaded();
487 client_
->OnPendingRequestComplete();
488 header_request_
= false;
490 // The OnPendingRequestComplete could have added more requests.
491 if (!pending_requests_
.empty()) {
492 DownloadPendingRequests();
494 // Document is not complete and we have no outstanding requests.
495 // Let's keep downloading PDF file in small chunks.
496 uint32 pos
= chunk_stream_
.GetFirstMissingByte();
497 std::vector
<std::pair
<size_t, size_t> > ranges
;
498 chunk_stream_
.GetMissedRanges(pos
, GetRequestSize(), &ranges
);
499 DCHECK(ranges
.size() > 0);
500 RequestData(ranges
[0].first
, ranges
[0].second
);
// Returns the size in bytes to use for the next range request, derived from
// requests_count_ clamped to the range [10, 70].
504 uint32
DocumentLoader::GetRequestSize() const {
505 // Document loading strategy:
506 // For first 10 requests, we use 32k chunk sizes, for the next 10 requests we
507 // double the size (64k), and so on, until we cap max request size at 2M for
508 // 61 or more requests (the shift below reaches 6 at a count of 61, and the
// count is clamped at 70).
509 uint32 limited_count
= std::min(std::max(requests_count_
, 10u), 70u);
510 return 32*1024 * (1 << ((limited_count
- 1) / 10u));
513 } // namespace chrome_pdf