contrib/libarchive/libarchive/archive_read_support_format_warc.c

   1 /*-
   2  * Copyright (c) 2014 Sebastian Freundt
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "archive_platform.h"
  27 __FBSDID("$FreeBSD$");
  28
  29 /**
  30  * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
  31  * ISO 28500:2009.
  32  * For the purposes of this file we used the final draft from:
  33  * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
  34  *
  35  * Todo:
  36  * [ ] real-world warcs can contain resources at endpoints ending in /
  37  *     e.g. http://bibnum.bnf.fr/warc/
  38  *     if you're lucky their response contains a Content-Location: header
  39  *     pointing to a unix-compliant filename, in the example above it's
  40  *     Content-Location: http://bibnum.bnf.fr/warc/index.html
  41  *     however, that's not mandated and github for example doesn't follow
  42  *     this convention.
  43  *     We need a set of archive options to control what to do with
  44  *     entries like these, at the moment care is taken to skip them.
  45  *
  46  **/
  47
  48 #ifdef HAVE_SYS_STAT_H
  49 #include <sys/stat.h>
  50 #endif
  51 #ifdef HAVE_ERRNO_H
  52 #include <errno.h>
  53 #endif
  54 #ifdef HAVE_STDLIB_H
  55 #include <stdlib.h>
  56 #endif
  57 #ifdef HAVE_STRING_H
  58 #include <string.h>
  59 #endif
  60 #ifdef HAVE_LIMITS_H
  61 #include <limits.h>
  62 #endif
  63 #ifdef HAVE_CTYPE_H
  64 #include <ctype.h>
  65 #endif
  66 #ifdef HAVE_TIME_H
  67 #include <time.h>
  68 #endif
  69
  70 #include "archive.h"
  71 #include "archive_entry.h"
  72 #include "archive_private.h"
  73 #include "archive_read_private.h"
  74
  75 typedef enum {
  76         WT_NONE,
  77         /* warcinfo */
  78         WT_INFO,
  79         /* metadata */
  80         WT_META,
  81         /* resource */
  82         WT_RSRC,
  83         /* request, unsupported */
  84         WT_REQ,
  85         /* response, unsupported */
  86         WT_RSP,
  87         /* revisit, unsupported */
  88         WT_RVIS,
  89         /* conversion, unsupported */
  90         WT_CONV,
  91         /* continutation, unsupported at the moment */
  92         WT_CONT,
  93         /* invalid type */
  94         LAST_WT
  95 } warc_type_t;
  96
  97 typedef struct {
  98         size_t len;
  99         const char *str;
 100 } warc_string_t;
 101
 102 typedef struct {
 103         size_t len;
 104         char *str;
 105 } warc_strbuf_t;
 106
 107 struct warc_s {
 108         /* content length ahead */
 109         size_t cntlen;
 110         /* and how much we've processed so far */
 111         size_t cntoff;
 112         /* and how much we need to consume between calls */
 113         size_t unconsumed;
 114
 115         /* string pool */
 116         warc_strbuf_t pool;
 117         /* previous version */
 118         unsigned int pver;
 119         /* stringified format name */
 120         struct archive_string sver;
 121 };
 122
 123 static int _warc_bid(struct archive_read *a, int);
 124 static int _warc_cleanup(struct archive_read *a);
 125 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
 126 static int _warc_skip(struct archive_read *a);
 127 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
 128
 129 /* private routines */
 130 static unsigned int _warc_rdver(const char buf[10], size_t bsz);
 131 static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
 132 static warc_string_t _warc_rduri(const char *buf, size_t bsz);
 133 static ssize_t _warc_rdlen(const char *buf, size_t bsz);
 134 static time_t _warc_rdrtm(const char *buf, size_t bsz);
 135 static time_t _warc_rdmtm(const char *buf, size_t bsz);
 136 static const char *_warc_find_eoh(const char *buf, size_t bsz);
 137
 138 \f
 139 int
 140 archive_read_support_format_warc(struct archive *_a)
 141 {
 142         struct archive_read *a = (struct archive_read *)_a;
 143         struct warc_s *w;
 144         int r;
 145
 146         archive_check_magic(_a, ARCHIVE_READ_MAGIC,
 147             ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
 148
 149         if ((w = malloc(sizeof(*w))) == NULL) {
 150                 archive_set_error(&a->archive, ENOMEM,
 151                     "Can't allocate warc data");
 152                 return (ARCHIVE_FATAL);
 153         }
 154         memset(w, 0, sizeof(*w));
 155
 156         r = __archive_read_register_format(
 157                 a, w, "warc",
 158                 _warc_bid, NULL, _warc_rdhdr, _warc_read,
 159                 _warc_skip, NULL, _warc_cleanup, NULL, NULL);
 160
 161         if (r != ARCHIVE_OK) {
 162                 free(w);
 163                 return (r);
 164         }
 165         return (ARCHIVE_OK);
 166 }
 167
 168 static int
 169 _warc_cleanup(struct archive_read *a)
 170 {
 171         struct warc_s *w = a->format->data;
 172
 173         if (w->pool.len > 0U) {
 174                 free(w->pool.str);
 175         }
 176         archive_string_free(&w->sver);
 177         free(w);
 178         a->format->data = NULL;
 179         return (ARCHIVE_OK);
 180 }
 181
 182 static int
 183 _warc_bid(struct archive_read *a, int best_bid)
 184 {
 185         const char *hdr;
 186         ssize_t nrd;
 187         unsigned int ver;
 188
 189         (void)best_bid; /* UNUSED */
 190
 191         /* check first line of file, it should be a record already */
 192         if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) {
 193                 /* no idea what to do */
 194                 return -1;
 195         } else if (nrd < 12) {
 196                 /* nah, not for us, our magic cookie is at least 12 bytes */
 197                 return -1;
 198         }
 199
 200         /* otherwise snarf the record's version number */
 201         ver = _warc_rdver(hdr, nrd);
 202         if (ver == 0U || ver > 10000U) {
 203                 /* oh oh oh, best not to wager ... */
 204                 return -1;
 205         }
 206
 207         /* otherwise be confident */
 208         return (64);
 209 }
 210
 211 static int
 212 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
 213 {
 214 #define HDR_PROBE_LEN           (12U)
 215         struct warc_s *w = a->format->data;
 216         unsigned int ver;
 217         const char *buf;
 218         ssize_t nrd;
 219         const char *eoh;
 220         /* for the file name, saves some strndup()'ing */
 221         warc_string_t fnam;
 222         /* warc record type, not that we really use it a lot */
 223         warc_type_t ftyp;
 224         /* content-length+error monad */
 225         ssize_t cntlen;
 226         /* record time is the WARC-Date time we reinterpret it as ctime */
 227         time_t rtime;
 228         /* mtime is the Last-Modified time which will be the entry's mtime */
 229         time_t mtime;
 230
 231 start_over:
 232         /* just use read_ahead() they keep track of unconsumed
 233          * bits and bobs for us; no need to put an extra shift in
 234          * and reproduce that functionality here */
 235         buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
 236
 237         if (nrd < 0) {
 238                 /* no good */
 239                 archive_set_error(
 240                         &a->archive, ARCHIVE_ERRNO_MISC,
 241                         "Bad record header");
 242                 return (ARCHIVE_FATAL);
 243         } else if (buf == NULL) {
 244                 /* there should be room for at least WARC/bla\r\n
 245                  * must be EOF therefore */
 246                 return (ARCHIVE_EOF);
 247         }
 248         /* looks good so far, try and find the end of the header now */
 249         eoh = _warc_find_eoh(buf, nrd);
 250         if (eoh == NULL) {
 251                 /* still no good, the header end might be beyond the
 252                  * probe we've requested, but then again who'd cram
 253                  * so much stuff into the header *and* be 28500-compliant */
 254                 archive_set_error(
 255                         &a->archive, ARCHIVE_ERRNO_MISC,
 256                         "Bad record header");
 257                 return (ARCHIVE_FATAL);
 258         } else if ((ver = _warc_rdver(buf, eoh - buf)) > 10000U) {
 259                 /* nawww, I wish they promised backward compatibility
 260                  * anyhoo, in their infinite wisdom the 28500 guys might
 261                  * come up with something we can't possibly handle so
 262                  * best end things here */
 263                 archive_set_error(
 264                         &a->archive, ARCHIVE_ERRNO_MISC,
 265                         "Unsupported record version");
 266                 return (ARCHIVE_FATAL);
 267         } else if ((cntlen = _warc_rdlen(buf, eoh - buf)) < 0) {
 268                 /* nightmare!  the specs say content-length is mandatory
 269                  * so I don't feel overly bad stopping the reader here */
 270                 archive_set_error(
 271                         &a->archive, EINVAL,
 272                         "Bad content length");
 273                 return (ARCHIVE_FATAL);
 274         } else if ((rtime = _warc_rdrtm(buf, eoh - buf)) == (time_t)-1) {
 275                 /* record time is mandatory as per WARC/1.0,
 276                  * so just barf here, fast and loud */
 277                 archive_set_error(
 278                         &a->archive, EINVAL,
 279                         "Bad record time");
 280                 return (ARCHIVE_FATAL);
 281         }
 282
 283         /* let the world know we're a WARC archive */
 284         a->archive.archive_format = ARCHIVE_FORMAT_WARC;
 285         if (ver != w->pver) {
 286                 /* stringify this entry's version */
 287                 archive_string_sprintf(&w->sver,
 288                         "WARC/%u.%u", ver / 10000, ver % 10000);
 289                 /* remember the version */
 290                 w->pver = ver;
 291         }
 292         /* start off with the type */
 293         ftyp = _warc_rdtyp(buf, eoh - buf);
 294         /* and let future calls know about the content */
 295         w->cntlen = cntlen;
 296         w->cntoff = 0U;
 297         mtime = 0;/* Avoid compiling error on some platform. */
 298
 299         switch (ftyp) {
 300         case WT_RSRC:
 301         case WT_RSP:
 302                 /* only try and read the filename in the cases that are
 303                  * guaranteed to have one */
 304                 fnam = _warc_rduri(buf, eoh - buf);
 305                 /* check the last character in the URI to avoid creating
 306                  * directory endpoints as files, see Todo above */
 307                 if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
 308                         /* break here for now */
 309                         fnam.len = 0U;
 310                         fnam.str = NULL;
 311                         break;
 312                 }
 313                 /* bang to our string pool, so we save a
 314                  * malloc()+free() roundtrip */
 315                 if (fnam.len + 1U > w->pool.len) {
 316                         w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
 317                         w->pool.str = realloc(w->pool.str, w->pool.len);
 318                 }
 319                 memcpy(w->pool.str, fnam.str, fnam.len);
 320                 w->pool.str[fnam.len] = '\0';
 321                 /* let noone else know about the pool, it's a secret, shhh */
 322                 fnam.str = w->pool.str;
 323
 324                 /* snarf mtime or deduce from rtime
 325                  * this is a custom header added by our writer, it's quite
 326                  * hard to believe anyone else would go through with it
 327                  * (apart from being part of some http responses of course) */
 328                 if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
 329                         mtime = rtime;
 330                 }
 331                 break;
 332         default:
 333                 fnam.len = 0U;
 334                 fnam.str = NULL;
 335                 break;
 336         }
 337
 338         /* now eat some of those delicious buffer bits */
 339         __archive_read_consume(a, eoh - buf);
 340
 341         switch (ftyp) {
 342         case WT_RSRC:
 343         case WT_RSP:
 344                 if (fnam.len > 0U) {
 345                         /* populate entry object */
 346                         archive_entry_set_filetype(entry, AE_IFREG);
 347                         archive_entry_copy_pathname(entry, fnam.str);
 348                         archive_entry_set_size(entry, cntlen);
 349                         archive_entry_set_perm(entry, 0644);
 350                         /* rtime is the new ctime, mtime stays mtime */
 351                         archive_entry_set_ctime(entry, rtime, 0L);
 352                         archive_entry_set_mtime(entry, mtime, 0L);
 353                         break;
 354                 }
 355                 /* FALLTHROUGH */
 356         default:
 357                 /* consume the content and start over */
 358                 _warc_skip(a);
 359                 goto start_over;
 360         }
 361         return (ARCHIVE_OK);
 362 }
 363
 364 static int
 365 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
 366 {
 367         struct warc_s *w = a->format->data;
 368         const char *rab;
 369         ssize_t nrd;
 370
 371         if (w->cntoff >= w->cntlen) {
 372         eof:
 373                 /* it's our lucky day, no work, we can leave early */
 374                 *buf = NULL;
 375                 *bsz = 0U;
 376                 *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
 377                 w->unconsumed = 0U;
 378                 return (ARCHIVE_EOF);
 379         }
 380
 381         rab = __archive_read_ahead(a, 1U, &nrd);
 382         if (nrd < 0) {
 383                 *bsz = 0U;
 384                 /* big catastrophe */
 385                 return (int)nrd;
 386         } else if (nrd == 0) {
 387                 goto eof;
 388         } else if ((size_t)nrd > w->cntlen - w->cntoff) {
 389                 /* clamp to content-length */
 390                 nrd = w->cntlen - w->cntoff;
 391         }
 392         *off = w->cntoff;
 393         *bsz = nrd;
 394         *buf = rab;
 395
 396         w->cntoff += nrd;
 397         w->unconsumed = (size_t)nrd;
 398         return (ARCHIVE_OK);
 399 }
 400
 401 static int
 402 _warc_skip(struct archive_read *a)
 403 {
 404         struct warc_s *w = a->format->data;
 405
 406         __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
 407         w->cntlen = 0U;
 408         w->cntoff = 0U;
 409         return (ARCHIVE_OK);
 410 }
 411
 412 \f
 413 /* private routines */
 414 static void*
 415 deconst(const void *c)
 416 {
 417         return (char *)0x1 + (((const char *)c) - (const char *)0x1);
 418 }
 419
 420 static char*
 421 xmemmem(const char *hay, const size_t haysize,
 422         const char *needle, const size_t needlesize)
 423 {
 424         const char *const eoh = hay + haysize;
 425         const char *const eon = needle + needlesize;
 426         const char *hp;
 427         const char *np;
 428         const char *cand;
 429         unsigned int hsum;
 430         unsigned int nsum;
 431         unsigned int eqp;
 432
 433         /* trivial checks first
 434          * a 0-sized needle is defined to be found anywhere in haystack
 435          * then run strchr() to find a candidate in HAYSTACK (i.e. a portion
 436          * that happens to begin with *NEEDLE) */
 437         if (needlesize == 0UL) {
 438                 return deconst(hay);
 439         } else if ((hay = memchr(hay, *needle, haysize)) == NULL) {
 440                 /* trivial */
 441                 return NULL;
 442         }
 443
 444         /* First characters of haystack and needle are the same now. Both are
 445          * guaranteed to be at least one character long.  Now computes the sum
 446          * of characters values of needle together with the sum of the first
 447          * needle_len characters of haystack. */
 448         for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U;
 449              hp < eoh && np < eon;
 450              hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++);
 451
 452         /* HP now references the (NEEDLESIZE + 1)-th character. */
 453         if (np < eon) {
 454                 /* haystack is smaller than needle, :O */
 455                 return NULL;
 456         } else if (eqp) {
 457                 /* found a match */
 458                 return deconst(hay);
 459         }
 460
 461         /* now loop through the rest of haystack,
 462          * updating the sum iteratively */
 463         for (cand = hay; hp < eoh; hp++) {
 464                 hsum ^= *cand++;
 465                 hsum ^= *hp;
 466
 467                 /* Since the sum of the characters is already known to be
 468                  * equal at that point, it is enough to check just NEEDLESIZE - 1
 469                  * characters for equality,
 470                  * also CAND is by design < HP, so no need for range checks */
 471                 if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) {
 472                         return deconst(cand);
 473                 }
 474         }
 475         return NULL;
 476 }
 477
 478 static int
 479 strtoi_lim(const char *str, const char **ep, int llim, int ulim)
 480 {
 481         int res = 0;
 482         const char *sp;
 483         /* we keep track of the number of digits via rulim */
 484         int rulim;
 485
 486         for (sp = str, rulim = ulim > 10 ? ulim : 10;
 487              res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9';
 488              sp++, rulim /= 10) {
 489                 res *= 10;
 490                 res += *sp - '0';
 491         }
 492         if (sp == str) {
 493                 res = -1;
 494         } else if (res < llim || res > ulim) {
 495                 res = -2;
 496         }
 497         *ep = (const char*)sp;
 498         return res;
 499 }
 500
 501 static time_t
 502 time_from_tm(struct tm *t)
 503 {
 504 #if HAVE_TIMEGM
 505         /* Use platform timegm() if available. */
 506         return (timegm(t));
 507 #elif HAVE__MKGMTIME64
 508         return (_mkgmtime64(t));
 509 #else
 510         /* Else use direct calculation using POSIX assumptions. */
 511         /* First, fix up tm_yday based on the year/month/day. */
 512         if (mktime(t) == (time_t)-1)
 513                 return ((time_t)-1);
 514         /* Then we can compute timegm() from first principles. */
 515         return (t->tm_sec
 516             + t->tm_min * 60
 517             + t->tm_hour * 3600
 518             + t->tm_yday * 86400
 519             + (t->tm_year - 70) * 31536000
 520             + ((t->tm_year - 69) / 4) * 86400
 521             - ((t->tm_year - 1) / 100) * 86400
 522             + ((t->tm_year + 299) / 400) * 86400);
 523 #endif
 524 }
 525
 526 static time_t
 527 xstrpisotime(const char *s, char **endptr)
 528 {
 529 /** like strptime() but strictly for ISO 8601 Zulu strings */
 530         struct tm tm;
 531         time_t res = (time_t)-1;
 532
 533         /* make sure tm is clean */
 534         memset(&tm, 0, sizeof(tm));
 535
 536         /* as a courtesy to our callers, and since this is a non-standard
 537          * routine, we skip leading whitespace */
 538         for (; isspace(*s); s++);
 539
 540         /* read year */
 541         if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') {
 542                 goto out;
 543         }
 544         /* read month */
 545         if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') {
 546                 goto out;
 547         }
 548         /* read day-of-month */
 549         if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') {
 550                 goto out;
 551         }
 552         /* read hour */
 553         if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') {
 554                 goto out;
 555         }
 556         /* read minute */
 557         if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') {
 558                 goto out;
 559         }
 560         /* read second */
 561         if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') {
 562                 goto out;
 563         }
 564
 565         /* massage TM to fulfill some of POSIX' contraints */
 566         tm.tm_year -= 1900;
 567         tm.tm_mon--;
 568
 569         /* now convert our custom tm struct to a unix stamp using UTC */
 570         res = time_from_tm(&tm);
 571
 572 out:
 573         if (endptr != NULL) {
 574                 *endptr = deconst(s);
 575         }
 576         return res;
 577 }
 578
 579 static unsigned int
 580 _warc_rdver(const char buf[10], size_t bsz)
 581 {
 582         static const char magic[] = "WARC/";
 583         unsigned int ver;
 584
 585         (void)bsz; /* UNUSED */
 586
 587         if (memcmp(buf, magic, sizeof(magic) - 1U) != 0) {
 588                 /* nope */
 589                 return 99999U;
 590         }
 591         /* looks good so far, read the version number for a laugh */
 592         buf += sizeof(magic) - 1U;
 593         /* most common case gets a quick-check here */
 594         if (memcmp(buf, "1.0\r\n", 5U) == 0) {
 595                 ver = 10000U;
 596         } else {
 597                 switch (*buf) {
 598                 case '0':
 599                 case '1':
 600                 case '2':
 601                 case '3':
 602                 case '4':
 603                 case '5':
 604                 case '6':
 605                 case '7':
 606                 case '8':
 607                         if (buf[1U] == '.') {
 608                                 char *on;
 609
 610                                 /* set up major version */
 611                                 ver = (buf[0U] - '0') * 10000U;
 612                                 /* minor version, anyone? */
 613                                 ver += (strtol(buf + 2U, &on, 10)) * 100U;
 614                                 /* don't parse anything else */
 615                                 if (on > buf + 2U) {
 616                                         break;
 617                                 }
 618                         }
 619                         /* FALLTHROUGH */
 620                 case '9':
 621                 default:
 622                         /* just make the version ridiculously high */
 623                         ver = 999999U;
 624                         break;
 625                 }
 626         }
 627         return ver;
 628 }
 629
 630 static unsigned int
 631 _warc_rdtyp(const char *buf, size_t bsz)
 632 {
 633         static const char _key[] = "\r\nWARC-Type:";
 634         const char *const eob = buf + bsz;
 635         const char *val;
 636
 637         if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
 638                 /* no bother */
 639                 return WT_NONE;
 640         }
 641         /* overread whitespace */
 642         for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++);
 643
 644         if (val + 8U > eob) {
 645                 ;
 646         } else if (memcmp(val, "resource", 8U) == 0) {
 647                 return WT_RSRC;
 648         } else if (memcmp(val, "warcinfo", 8U) == 0) {
 649                 return WT_INFO;
 650         } else if (memcmp(val, "metadata", 8U) == 0) {
 651                 return WT_META;
 652         } else if (memcmp(val, "request", 7U) == 0) {
 653                 return WT_REQ;
 654         } else if (memcmp(val, "response", 8U) == 0) {
 655                 return WT_RSP;
 656         } else if (memcmp(val, "conversi", 8U) == 0) {
 657                 return WT_CONV;
 658         } else if (memcmp(val, "continua", 8U) == 0) {
 659                 return WT_CONT;
 660         }
 661         return WT_NONE;
 662 }
 663
 664 static warc_string_t
 665 _warc_rduri(const char *buf, size_t bsz)
 666 {
 667         static const char _key[] = "\r\nWARC-Target-URI:";
 668         const char *const eob = buf + bsz;
 669         const char *val;
 670         const char *uri;
 671         const char *eol;
 672         warc_string_t res = {0U, NULL};
 673
 674         if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
 675                 /* no bother */
 676                 return res;
 677         }
 678         /* overread whitespace */
 679         for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++);
 680
 681         /* overread URL designators */
 682         if ((uri = xmemmem(val, eob - val, "://", 3U)) == NULL) {
 683                 /* not touching that! */
 684                 return res;
 685         } else if ((eol = memchr(uri, '\n', eob - uri)) == NULL) {
 686                 /* no end of line? :O */
 687                 return res;
 688         }
 689
 690         /* massage uri to point to after :// */
 691         uri += 3U;
 692         /* also massage eol to point to the first whitespace
 693          * after the last non-whitespace character before
 694          * the end of the line */
 695         for (; eol > uri && isspace(eol[-1]); eol--);
 696
 697         /* now then, inspect the URI */
 698         if (memcmp(val, "file", 4U) == 0) {
 699                 /* perfect, nothing left to do here */
 700
 701         } else if (memcmp(val, "http", 4U) == 0 ||
 702                    memcmp(val, "ftp", 3U) == 0) {
 703                 /* overread domain, and the first / */
 704                 while (uri < eol && *uri++ != '/');
 705         } else {
 706                 /* not sure what to do? best to bugger off */
 707                 return res;
 708         }
 709         res.str = uri;
 710         res.len = eol - uri;
 711         return res;
 712 }
 713
 714 static ssize_t
 715 _warc_rdlen(const char *buf, size_t bsz)
 716 {
 717         static const char _key[] = "\r\nContent-Length:";
 718         const char *val;
 719         char *on = NULL;
 720         long int len;
 721
 722         if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
 723                 /* no bother */
 724                 return -1;
 725         }
 726
 727         /* strtol kindly overreads whitespace for us, so use that */
 728         val += sizeof(_key) - 1U;
 729         len = strtol(val, &on, 10);
 730         if (on == NULL || !isspace(*on)) {
 731                 /* hm, can we trust that number?  Best not. */
 732                 return -1;
 733         }
 734         return (size_t)len;
 735 }
 736
 737 static time_t
 738 _warc_rdrtm(const char *buf, size_t bsz)
 739 {
 740         static const char _key[] = "\r\nWARC-Date:";
 741         const char *val;
 742         char *on = NULL;
 743         time_t res;
 744
 745         if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
 746                 /* no bother */
 747                 return (time_t)-1;
 748         }
 749
 750         /* xstrpisotime() kindly overreads whitespace for us, so use that */
 751         val += sizeof(_key) - 1U;
 752         res = xstrpisotime(val, &on);
 753         if (on == NULL || !isspace(*on)) {
 754                 /* hm, can we trust that number?  Best not. */
 755                 return (time_t)-1;
 756         }
 757         return res;
 758 }
 759
 760 static time_t
 761 _warc_rdmtm(const char *buf, size_t bsz)
 762 {
 763         static const char _key[] = "\r\nLast-Modified:";
 764         const char *val;
 765         char *on = NULL;
 766         time_t res;
 767
 768         if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
 769                 /* no bother */
 770                 return (time_t)-1;
 771         }
 772
 773         /* xstrpisotime() kindly overreads whitespace for us, so use that */
 774         val += sizeof(_key) - 1U;
 775         res = xstrpisotime(val, &on);
 776         if (on == NULL || !isspace(*on)) {
 777                 /* hm, can we trust that number?  Best not. */
 778                 return (time_t)-1;
 779         }
 780         return res;
 781 }
 782
 783 static const char*
 784 _warc_find_eoh(const char *buf, size_t bsz)
 785 {
 786         static const char _marker[] = "\r\n\r\n";
 787         const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
 788
 789         if (hit != NULL) {
 790                 hit += sizeof(_marker) - 1U;
 791         }
 792         return hit;
 793 }
 794
 795 /* archive_read_support_format_warc.c ends here */