2 * Copyright (c) 2014 Sebastian Freundt
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 #include "archive_platform.h"
27 __FBSDID("$FreeBSD$");
30 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
32 * For the purposes of this file we used the final draft from:
33 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
36 * [ ] real-world warcs can contain resources at endpoints ending in /
37 * e.g. http://bibnum.bnf.fr/warc/
38 * if you're lucky their response contains a Content-Location: header
39 * pointing to a unix-compliant filename, in the example above it's
40 * Content-Location: http://bibnum.bnf.fr/warc/index.html
41 * however, that's not mandated and github for example doesn't follow
43 * We need a set of archive options to control what to do with
44 * entries like these, at the moment care is taken to skip them.
48 #ifdef HAVE_SYS_STAT_H
71 #include "archive_entry.h"
72 #include "archive_private.h"
73 #include "archive_read_private.h"
83 /* request, unsupported */
85 /* response, unsupported */
87 /* revisit, unsupported */
89 /* conversion, unsupported */
91 /* continuation, unsupported at the moment */
108 /* content length ahead */
110 /* and how much we've processed so far */
112 /* and how much we need to consume between calls */
117 /* previous version */
119 /* stringified format name */
120 struct archive_string sver
;
123 static int _warc_bid(struct archive_read
*a
, int);
124 static int _warc_cleanup(struct archive_read
*a
);
125 static int _warc_read(struct archive_read
*, const void**, size_t*, int64_t*);
126 static int _warc_skip(struct archive_read
*a
);
127 static int _warc_rdhdr(struct archive_read
*a
, struct archive_entry
*e
);
129 /* private routines */
130 static unsigned int _warc_rdver(const char buf
[10], size_t bsz
);
131 static unsigned int _warc_rdtyp(const char *buf
, size_t bsz
);
132 static warc_string_t
_warc_rduri(const char *buf
, size_t bsz
);
133 static ssize_t
_warc_rdlen(const char *buf
, size_t bsz
);
134 static time_t _warc_rdrtm(const char *buf
, size_t bsz
);
135 static time_t _warc_rdmtm(const char *buf
, size_t bsz
);
136 static const char *_warc_find_eoh(const char *buf
, size_t bsz
);
140 archive_read_support_format_warc(struct archive
*_a
)
142 struct archive_read
*a
= (struct archive_read
*)_a
;
146 archive_check_magic(_a
, ARCHIVE_READ_MAGIC
,
147 ARCHIVE_STATE_NEW
, "archive_read_support_format_warc");
149 if ((w
= malloc(sizeof(*w
))) == NULL
) {
150 archive_set_error(&a
->archive
, ENOMEM
,
151 "Can't allocate warc data");
152 return (ARCHIVE_FATAL
);
154 memset(w
, 0, sizeof(*w
));
156 r
= __archive_read_register_format(
158 _warc_bid
, NULL
, _warc_rdhdr
, _warc_read
,
159 _warc_skip
, NULL
, _warc_cleanup
, NULL
, NULL
);
161 if (r
!= ARCHIVE_OK
) {
169 _warc_cleanup(struct archive_read
*a
)
171 struct warc_s
*w
= a
->format
->data
;
173 if (w
->pool
.len
> 0U) {
176 archive_string_free(&w
->sver
);
178 a
->format
->data
= NULL
;
183 _warc_bid(struct archive_read
*a
, int best_bid
)
189 (void)best_bid
; /* UNUSED */
191 /* check first line of file, it should be a record already */
192 if ((hdr
= __archive_read_ahead(a
, 12U, &nrd
)) == NULL
) {
193 /* no idea what to do */
195 } else if (nrd
< 12) {
196 /* nah, not for us, our magic cookie is at least 12 bytes */
200 /* otherwise snarf the record's version number */
201 ver
= _warc_rdver(hdr
, nrd
);
202 if (ver
== 0U || ver
> 10000U) {
203 /* oh oh oh, best not to wager ... */
207 /* otherwise be confident */
212 _warc_rdhdr(struct archive_read
*a
, struct archive_entry
*entry
)
214 #define HDR_PROBE_LEN (12U)
215 struct warc_s
*w
= a
->format
->data
;
220 /* for the file name, saves some strndup()'ing */
222 /* warc record type, not that we really use it a lot */
224 /* content-length+error monad */
226 /* record time is the WARC-Date time; we reinterpret it as ctime */
228 /* mtime is the Last-Modified time which will be the entry's mtime */
232 /* just use read_ahead() they keep track of unconsumed
233 * bits and bobs for us; no need to put an extra shift in
234 * and reproduce that functionality here */
235 buf
= __archive_read_ahead(a
, HDR_PROBE_LEN
, &nrd
);
240 &a
->archive
, ARCHIVE_ERRNO_MISC
,
241 "Bad record header");
242 return (ARCHIVE_FATAL
);
243 } else if (buf
== NULL
) {
244 /* there should be room for at least WARC/bla\r\n
245 * must be EOF therefore */
246 return (ARCHIVE_EOF
);
248 /* looks good so far, try and find the end of the header now */
249 eoh
= _warc_find_eoh(buf
, nrd
);
251 /* still no good, the header end might be beyond the
252 * probe we've requested, but then again who'd cram
253 * so much stuff into the header *and* be 28500-compliant */
255 &a
->archive
, ARCHIVE_ERRNO_MISC
,
256 "Bad record header");
257 return (ARCHIVE_FATAL
);
258 } else if ((ver
= _warc_rdver(buf
, eoh
- buf
)) > 10000U) {
259 /* nawww, I wish they promised backward compatibility
260 * anyhoo, in their infinite wisdom the 28500 guys might
261 * come up with something we can't possibly handle so
262 * best end things here */
264 &a
->archive
, ARCHIVE_ERRNO_MISC
,
265 "Unsupported record version");
266 return (ARCHIVE_FATAL
);
267 } else if ((cntlen
= _warc_rdlen(buf
, eoh
- buf
)) < 0) {
268 /* nightmare! the specs say content-length is mandatory
269 * so I don't feel overly bad stopping the reader here */
272 "Bad content length");
273 return (ARCHIVE_FATAL
);
274 } else if ((rtime
= _warc_rdrtm(buf
, eoh
- buf
)) == (time_t)-1) {
275 /* record time is mandatory as per WARC/1.0,
276 * so just barf here, fast and loud */
280 return (ARCHIVE_FATAL
);
283 /* let the world know we're a WARC archive */
284 a
->archive
.archive_format
= ARCHIVE_FORMAT_WARC
;
285 if (ver
!= w
->pver
) {
286 /* stringify this entry's version */
287 archive_string_sprintf(&w
->sver
,
288 "WARC/%u.%u", ver
/ 10000, ver
% 10000);
289 /* remember the version */
292 /* start off with the type */
293 ftyp
= _warc_rdtyp(buf
, eoh
- buf
);
294 /* and let future calls know about the content */
297 mtime
= 0;/* Avoid compiling error on some platform. */
302 /* only try and read the filename in the cases that are
303 * guaranteed to have one */
304 fnam
= _warc_rduri(buf
, eoh
- buf
);
305 /* check the last character in the URI to avoid creating
306 * directory endpoints as files, see Todo above */
307 if (fnam
.len
== 0 || fnam
.str
[fnam
.len
- 1] == '/') {
308 /* break here for now */
313 /* bang to our string pool, so we save a
314 * malloc()+free() roundtrip */
315 if (fnam
.len
+ 1U > w
->pool
.len
) {
316 w
->pool
.len
= ((fnam
.len
+ 64U) / 64U) * 64U;
317 w
->pool
.str
= realloc(w
->pool
.str
, w
->pool
.len
);
319 memcpy(w
->pool
.str
, fnam
.str
, fnam
.len
);
320 w
->pool
.str
[fnam
.len
] = '\0';
321 /* let no one else know about the pool, it's a secret, shhh */
322 fnam
.str
= w
->pool
.str
;
324 /* snarf mtime or deduce from rtime
325 * this is a custom header added by our writer, it's quite
326 * hard to believe anyone else would go through with it
327 * (apart from being part of some http responses of course) */
328 if ((mtime
= _warc_rdmtm(buf
, eoh
- buf
)) == (time_t)-1) {
338 /* now eat some of those delicious buffer bits */
339 __archive_read_consume(a
, eoh
- buf
);
345 /* populate entry object */
346 archive_entry_set_filetype(entry
, AE_IFREG
);
347 archive_entry_copy_pathname(entry
, fnam
.str
);
348 archive_entry_set_size(entry
, cntlen
);
349 archive_entry_set_perm(entry
, 0644);
350 /* rtime is the new ctime, mtime stays mtime */
351 archive_entry_set_ctime(entry
, rtime
, 0L);
352 archive_entry_set_mtime(entry
, mtime
, 0L);
357 /* consume the content and start over */
365 _warc_read(struct archive_read
*a
, const void **buf
, size_t *bsz
, int64_t *off
)
367 struct warc_s
*w
= a
->format
->data
;
371 if (w
->cntoff
>= w
->cntlen
) {
373 /* it's our lucky day, no work, we can leave early */
376 *off
= w
->cntoff
+ 4U/*for \r\n\r\n separator*/;
378 return (ARCHIVE_EOF
);
381 rab
= __archive_read_ahead(a
, 1U, &nrd
);
384 /* big catastrophe */
386 } else if (nrd
== 0) {
388 } else if ((size_t)nrd
> w
->cntlen
- w
->cntoff
) {
389 /* clamp to content-length */
390 nrd
= w
->cntlen
- w
->cntoff
;
397 w
->unconsumed
= (size_t)nrd
;
402 _warc_skip(struct archive_read
*a
)
404 struct warc_s
*w
= a
->format
->data
;
406 __archive_read_consume(a
, w
->cntlen
+ 4U/*\r\n\r\n separator*/);
413 /* private routines */
415 deconst(const void *c
)
417 return (char *)0x1 + (((const char *)c
) - (const char *)0x1);
421 xmemmem(const char *hay
, const size_t haysize
,
422 const char *needle
, const size_t needlesize
)
424 const char *const eoh
= hay
+ haysize
;
425 const char *const eon
= needle
+ needlesize
;
433 /* trivial checks first
434 * a 0-sized needle is defined to be found anywhere in haystack
435 * then run memchr() to find a candidate in HAYSTACK (i.e. a portion
436 * that happens to begin with *NEEDLE) */
437 if (needlesize
== 0UL) {
439 } else if ((hay
= memchr(hay
, *needle
, haysize
)) == NULL
) {
444 /* First characters of haystack and needle are the same now. Both are
445 * guaranteed to be at least one character long. Now compute the XOR "sum"
446 * of the character values of needle together with that of the first
447 * needlesize characters of haystack. */
448 for (hp
= hay
+ 1U, np
= needle
+ 1U, hsum
= *hay
, nsum
= *hay
, eqp
= 1U;
449 hp
< eoh
&& np
< eon
;
450 hsum
^= *hp
, nsum
^= *np
, eqp
&= *hp
== *np
, hp
++, np
++);
452 /* HP now references the (NEEDLESIZE + 1)-th character. */
454 /* haystack is smaller than needle, :O */
461 /* now loop through the rest of haystack,
462 * updating the sum iteratively */
463 for (cand
= hay
; hp
< eoh
; hp
++) {
467 /* Since the sum of the characters is already known to be
468 * equal at that point, it is enough to check just NEEDLESIZE - 1
469 * characters for equality,
470 * also CAND is by design < HP, so no need for range checks */
471 if (hsum
== nsum
&& memcmp(cand
, needle
, needlesize
- 1U) == 0) {
472 return deconst(cand
);
479 strtoi_lim(const char *str
, const char **ep
, int llim
, int ulim
)
483 /* we keep track of the number of digits via rulim */
486 for (sp
= str
, rulim
= ulim
> 10 ? ulim
: 10;
487 res
* 10 <= ulim
&& rulim
&& *sp
>= '0' && *sp
<= '9';
494 } else if (res
< llim
|| res
> ulim
) {
497 *ep
= (const char*)sp
;
502 time_from_tm(struct tm
*t
)
505 /* Use platform timegm() if available. */
507 #elif HAVE__MKGMTIME64
508 return (_mkgmtime64(t
));
510 /* Else use direct calculation using POSIX assumptions. */
511 /* First, fix up tm_yday based on the year/month/day. */
512 if (mktime(t
) == (time_t)-1)
514 /* Then we can compute timegm() from first principles. */
519 + (t
->tm_year
- 70) * 31536000
520 + ((t
->tm_year
- 69) / 4) * 86400
521 - ((t
->tm_year
- 1) / 100) * 86400
522 + ((t
->tm_year
+ 299) / 400) * 86400);
527 xstrpisotime(const char *s
, char **endptr
)
529 /** like strptime() but strictly for ISO 8601 Zulu strings */
531 time_t res
= (time_t)-1;
533 /* make sure tm is clean */
534 memset(&tm
, 0, sizeof(tm
));
536 /* as a courtesy to our callers, and since this is a non-standard
537 * routine, we skip leading whitespace */
538 for (; isspace(*s
); s
++);
541 if ((tm
.tm_year
= strtoi_lim(s
, &s
, 1583, 4095)) < 0 || *s
++ != '-') {
545 if ((tm
.tm_mon
= strtoi_lim(s
, &s
, 1, 12)) < 0 || *s
++ != '-') {
548 /* read day-of-month */
549 if ((tm
.tm_mday
= strtoi_lim(s
, &s
, 1, 31)) < 0 || *s
++ != 'T') {
553 if ((tm
.tm_hour
= strtoi_lim(s
, &s
, 0, 23)) < 0 || *s
++ != ':') {
557 if ((tm
.tm_min
= strtoi_lim(s
, &s
, 0, 59)) < 0 || *s
++ != ':') {
561 if ((tm
.tm_sec
= strtoi_lim(s
, &s
, 0, 60)) < 0 || *s
++ != 'Z') {
565 /* massage TM to fulfill some of POSIX' constraints */
569 /* now convert our custom tm struct to a unix stamp using UTC */
570 res
= time_from_tm(&tm
);
573 if (endptr
!= NULL
) {
574 *endptr
= deconst(s
);
580 _warc_rdver(const char buf
[10], size_t bsz
)
582 static const char magic
[] = "WARC/";
585 (void)bsz
; /* UNUSED */
587 if (memcmp(buf
, magic
, sizeof(magic
) - 1U) != 0) {
591 /* looks good so far, read the version number for a laugh */
592 buf
+= sizeof(magic
) - 1U;
593 /* most common case gets a quick-check here */
594 if (memcmp(buf
, "1.0\r\n", 5U) == 0) {
607 if (buf
[1U] == '.') {
610 /* set up major version */
611 ver
= (buf
[0U] - '0') * 10000U;
612 /* minor version, anyone? */
613 ver
+= (strtol(buf
+ 2U, &on
, 10)) * 100U;
614 /* don't parse anything else */
622 /* just make the version ridiculously high */
631 _warc_rdtyp(const char *buf
, size_t bsz
)
633 static const char _key
[] = "\r\nWARC-Type:";
634 const char *const eob
= buf
+ bsz
;
637 if ((val
= xmemmem(buf
, bsz
, _key
, sizeof(_key
) - 1U)) == NULL
) {
641 /* overread whitespace */
642 for (val
+= sizeof(_key
) - 1U; val
< eob
&& isspace(*val
); val
++);
644 if (val
+ 8U > eob
) {
646 } else if (memcmp(val
, "resource", 8U) == 0) {
648 } else if (memcmp(val
, "warcinfo", 8U) == 0) {
650 } else if (memcmp(val
, "metadata", 8U) == 0) {
652 } else if (memcmp(val
, "request", 7U) == 0) {
654 } else if (memcmp(val
, "response", 8U) == 0) {
656 } else if (memcmp(val
, "conversi", 8U) == 0) {
658 } else if (memcmp(val
, "continua", 8U) == 0) {
665 _warc_rduri(const char *buf
, size_t bsz
)
667 static const char _key
[] = "\r\nWARC-Target-URI:";
668 const char *const eob
= buf
+ bsz
;
672 warc_string_t res
= {0U, NULL
};
674 if ((val
= xmemmem(buf
, bsz
, _key
, sizeof(_key
) - 1U)) == NULL
) {
678 /* overread whitespace */
679 for (val
+= sizeof(_key
) - 1U; val
< eob
&& isspace(*val
); val
++);
681 /* overread URL designators */
682 if ((uri
= xmemmem(val
, eob
- val
, "://", 3U)) == NULL
) {
683 /* not touching that! */
685 } else if ((eol
= memchr(uri
, '\n', eob
- uri
)) == NULL
) {
686 /* no end of line? :O */
690 /* massage uri to point to after :// */
692 /* also massage eol to point to the first whitespace
693 * after the last non-whitespace character before
694 * the end of the line */
695 for (; eol
> uri
&& isspace(eol
[-1]); eol
--);
697 /* now then, inspect the URI */
698 if (memcmp(val
, "file", 4U) == 0) {
699 /* perfect, nothing left to do here */
701 } else if (memcmp(val
, "http", 4U) == 0 ||
702 memcmp(val
, "ftp", 3U) == 0) {
703 /* overread domain, and the first / */
704 while (uri
< eol
&& *uri
++ != '/');
706 /* not sure what to do? best to bugger off */
715 _warc_rdlen(const char *buf
, size_t bsz
)
717 static const char _key
[] = "\r\nContent-Length:";
722 if ((val
= xmemmem(buf
, bsz
, _key
, sizeof(_key
) - 1U)) == NULL
) {
727 /* strtol kindly overreads whitespace for us, so use that */
728 val
+= sizeof(_key
) - 1U;
729 len
= strtol(val
, &on
, 10);
730 if (on
== NULL
|| !isspace(*on
)) {
731 /* hm, can we trust that number? Best not. */
738 _warc_rdrtm(const char *buf
, size_t bsz
)
740 static const char _key
[] = "\r\nWARC-Date:";
745 if ((val
= xmemmem(buf
, bsz
, _key
, sizeof(_key
) - 1U)) == NULL
) {
750 /* xstrpisotime() kindly overreads whitespace for us, so use that */
751 val
+= sizeof(_key
) - 1U;
752 res
= xstrpisotime(val
, &on
);
753 if (on
== NULL
|| !isspace(*on
)) {
754 /* hm, can we trust that number? Best not. */
761 _warc_rdmtm(const char *buf
, size_t bsz
)
763 static const char _key
[] = "\r\nLast-Modified:";
768 if ((val
= xmemmem(buf
, bsz
, _key
, sizeof(_key
) - 1U)) == NULL
) {
773 /* xstrpisotime() kindly overreads whitespace for us, so use that */
774 val
+= sizeof(_key
) - 1U;
775 res
= xstrpisotime(val
, &on
);
776 if (on
== NULL
|| !isspace(*on
)) {
777 /* hm, can we trust that number? Best not. */
784 _warc_find_eoh(const char *buf
, size_t bsz
)
786 static const char _marker
[] = "\r\n\r\n";
787 const char *hit
= xmemmem(buf
, bsz
, _marker
, sizeof(_marker
) - 1U);
790 hit
+= sizeof(_marker
) - 1U;
795 /* archive_read_support_format_warc.c ends here */