From 98c1dbcd75bd2b7452771ff051b6bd564db64775 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Fri, 2 Nov 2018 16:48:04 +1300 Subject: [PATCH] Refactor handling of input files We now try much harder to avoid opening input file more than once by reusing the file descriptor in more cases. We no longer call posix_fadvise() with POSIX_FADV_NOREUSE under Linux, since it's still not implemented there. We now only call posix_fadvise() with POSIX_FADV_DONTNEED under Linux, and we do it right before we close the file descriptor. (cherry picked from commit 67717d56f346af445a7df3781438e11a42ab504e) --- xapian-applications/omega/diritor.cc | 70 ++++++++++++++++++++++-- xapian-applications/omega/diritor.h | 56 +++++++++++-------- xapian-applications/omega/index_file.cc | 2 +- xapian-applications/omega/loadfile.cc | 96 ++++++++++++++++++++------------- xapian-applications/omega/loadfile.h | 35 +++++++----- xapian-applications/omega/md5wrap.cc | 54 ++++--------------- xapian-applications/omega/md5wrap.h | 15 +++--- 7 files changed, 201 insertions(+), 127 deletions(-) diff --git a/xapian-applications/omega/diritor.cc b/xapian-applications/omega/diritor.cc index 2a472bf65..e08c7ea0c 100644 --- a/xapian-applications/omega/diritor.cc +++ b/xapian-applications/omega/diritor.cc @@ -1,6 +1,7 @@ -/* diritor.cc: Iterator through entries in a directory. - * - * Copyright (C) 2007,2008,2010,2011,2012,2013,2014 Olly Betts +/** @file diritor.cc + * @brief Iterator through entries in a directory. + */ +/* Copyright (C) 2007,2008,2010,2011,2012,2013,2014,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -96,6 +97,69 @@ DirectoryIterator::build_path() } void +DirectoryIterator::open_fd() +{ + build_path(); + mode_t mode = O_BINARY | O_RDONLY; +# if defined O_NOATIME && O_NOATIME != 0 + if (try_noatime()) mode |= O_NOATIME; +# endif + fd = open(path.c_str(), mode); +# if defined O_NOATIME && O_NOATIME != 0 + if (fd < 0 && (mode & O_NOATIME)) { + mode &= ~O_NOATIME; + fd = open(path.c_str(), mode); + } +# endif + + if (fd < 0) { + switch (errno) { + case ENOENT: + case ENOTDIR: + throw FileNotFound(); + case EACCES: { + string m("Failed to open file: "); + m += strerror(errno); + throw m; + } + } + // Commit changes to files processed so far. + throw CommitAndExit("Can't open file", path, errno); + } + +#ifdef HAVE_POSIX_FADVISE + // On Linux, POSIX_FADV_NOREUSE has been a no-op since 2.6.18 (released + // 2006) and before that it was incorrectly implemented as an alias for + // POSIX_FADV_WILLNEED. There have been a few attempts to make + // POSIX_FADV_NOREUSE actually work on Linux but nothing has been merged so + // for now let's not waste effort making a syscall we know to currently be + // a no-op. We can revise this conditional if it gets usefully + // implemented. +# ifndef __linux__ + posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE); +# endif +#endif +} + +void +DirectoryIterator::close_fd() +{ +#ifdef HAVE_POSIX_FADVISE +# ifdef __linux__ + // Linux doesn't implement POSIX_FADV_NOREUSE so instead we use + // POSIX_FADV_DONTNEED just before closing the fd. This is a bit more + // aggressive than we ideally want - really we just want to stop our + // reads from pushing other pages out of the OS cache, but if the + // pages we read are already cached it would probably be better to leave + // them cached after the read. + posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +# endif +#endif + close(fd); + fd = -1; +} + +void DirectoryIterator::start(const std::string & path_) { if (dir) closedir(dir); diff --git a/xapian-applications/omega/diritor.h b/xapian-applications/omega/diritor.h index e94049311..e15bc41ef 100644 --- a/xapian-applications/omega/diritor.h +++ b/xapian-applications/omega/diritor.h @@ -1,6 +1,7 @@ -/* diritor.h: Iterator through entries in a directory. - * - * Copyright (C) 2007,2008,2010,2011,2012,2013,2014,2015 Olly Betts +/** @file diritor.h + * @brief Iterator through entries in a directory. + */ +/* Copyright (C) 2007,2008,2010,2011,2012,2013,2014,2015,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -41,6 +42,7 @@ #include "common/noreturn.h" #include "loadfile.h" +#include "md5wrap.h" #include "runfilter.h" // For class ReadError. struct FileNotFound { }; @@ -85,6 +87,10 @@ class DirectoryIterator { void build_path(); + void open_fd(); + + void close_fd(); + public: explicit DirectoryIterator(bool follow_symlinks_) @@ -92,7 +98,7 @@ class DirectoryIterator { ~DirectoryIterator() { if (dir) closedir(dir); - if (fd >= 0) close(fd); + if (fd >= 0) close_fd(); } /// Start iterating through entries in @a path. @@ -106,10 +112,7 @@ class DirectoryIterator { // // @return false if there are no more entries. bool next() { - if (fd >= 0) { - close(fd); - fd = -1; - } + if (fd >= 0) close_fd(); path.resize(path_len); errno = 0; do { @@ -233,28 +236,23 @@ class DirectoryIterator { std::string get_magic_mimetype(); std::string file_to_string() { - build_path(); std::string out; - int flags = NOCACHE; - if (try_noatime()) flags |= NOATIME; - fd = load_file_fd(path, out, flags); - if (fd < 0) { - if (errno == ENOENT || errno == ENOTDIR) throw FileNotFound(); - throw ReadError("load_file failed"); + if (!load_file_from_fd(get_fd(), out)) { + throw ReadError("loading file failed"); } return out; } std::string gzfile_to_string() { - build_path(); - std::string out; - gzFile zfh = gzopen(path.c_str(), "rb"); + int dup_fd = dup(get_fd()); + if (fd < 0) { + throw ReadError("dup() failed"); + } + gzFile zfh = gzdopen(dup_fd, "rb"); if (zfh == NULL) { - if (errno == ENOENT || errno == ENOTDIR) { - throw FileNotFound(); - } - throw ReadError("gzopen() failed"); + throw ReadError("gzdopen() failed"); } + std::string out; char buf[8192]; while (true) { int r = gzread(zfh, buf, sizeof(buf)); @@ -268,6 +266,20 @@ class DirectoryIterator { gzclose(zfh); return out; } + + int get_fd() { + if (fd < 0) { + open_fd(); + } else { + if (lseek(fd, 0, SEEK_SET) < 0) + throw CommitAndExit("Can't rewind file descriptor", path, errno); + } + return fd; + } + + bool md5(std::string& md5) { + return md5_fd(get_fd(), md5); + } }; #endif // OMEGA_INCLUDED_DIRITOR_H diff --git a/xapian-applications/omega/index_file.cc b/xapian-applications/omega/index_file.cc index 87660dd5f..bc6bc5a59 100644 --- a/xapian-applications/omega/index_file.cc +++ b/xapian-applications/omega/index_file.cc @@ -1012,7 +1012,7 @@ index_mimetype(const string & file, const string & urlterm, const string & url, } // Compute the MD5 of the file if we haven't already. - if (md5.empty() && md5_file(file, md5, d.try_noatime()) == 0) { + if (md5.empty() && !d.md5(md5)) { if (errno == ENOENT || errno == ENOTDIR) { skip(urlterm, context, "File removed during indexing", d.get_size(), d.get_mtime(), diff --git a/xapian-applications/omega/loadfile.cc b/xapian-applications/omega/loadfile.cc index e9abddd5b..7ea4a7f9f 100644 --- a/xapian-applications/omega/loadfile.cc +++ b/xapian-applications/omega/loadfile.cc @@ -1,6 +1,7 @@ -/* loadfile.cc: load a file into a std::string. - * - * Copyright (C) 2006,2010,2012,2015 Olly Betts +/** @file loadfile.cc + * @brief load a file into a std::string. + */ +/* Copyright (C) 2006,2010,2012,2015,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,14 +20,6 @@ #include -#ifdef HAVE_POSIX_FADVISE -# ifdef __linux__ -# define _POSIX_C_SOURCE 200112L // for posix_fadvise from fcntl.h -# define _DEFAULT_SOURCE 1 // Needed to get lstat() for glibc >= 2.20 -# define _BSD_SOURCE 1 // Needed to get lstat() for glibc < 2.20 -# endif -#endif - #include "loadfile.h" #include @@ -40,12 +33,29 @@ using namespace std; -int -load_file_fd(const string &file_name, size_t max_to_read, int flags, - string &output, bool &truncated) +bool +load_file_from_fd(int fd, string& output) { - (void)flags; // Avoid possible "unused" warning. - mode_t mode = O_RDONLY; + output.resize(0); + char blk[4096]; + while (true) { + ssize_t c = read(fd, blk, sizeof(blk)); + if (c <= 0) { + if (c == 0) break; + if (errno == EINTR) continue; + return false; + } + output.append(blk, c); + } + + return true; +} + +bool +load_file(const string& file_name, size_t max_to_read, int flags, + string& output, bool* truncated) +{ + mode_t mode = O_BINARY | O_RDONLY; #if defined O_NOATIME && O_NOATIME != 0 if (flags & NOATIME) mode |= O_NOATIME; #endif @@ -57,11 +67,20 @@ load_file_fd(const string &file_name, size_t max_to_read, int flags, fd = open(file_name.c_str(), mode); } #endif - if (fd < 0) return -1; + if (fd < 0) return false; #ifdef HAVE_POSIX_FADVISE +# ifndef __linux__ + // On Linux, POSIX_FADV_NOREUSE has been a no-op since 2.6.18 (released + // 2006) and before that it was incorrectly implemented as an alias for + // POSIX_FADV_WILLNEED. There have been a few attempts to make + // POSIX_FADV_NOREUSE actually work on Linux but nothing has been merged so + // for now let's not waste effort making a syscall we know to currently be + // a no-op. We can revise this conditional if it gets usefully + // implemented. if (flags & NOCACHE) - posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE); // or POSIX_FADV_SEQUENTIAL + posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE); +# endif #endif struct stat st; @@ -69,48 +88,51 @@ load_file_fd(const string &file_name, size_t max_to_read, int flags, int errno_save = errno; close(fd); errno = errno_save; - return -1; + return false; } if (!S_ISREG(st.st_mode)) { close(fd); errno = EINVAL; - return -1; + return false; } - char blk[4096]; size_t n = st.st_size; - truncated = (max_to_read && max_to_read < n); - if (truncated) { + if (max_to_read && max_to_read < n) { n = max_to_read; - truncated = true; + if (truncated) *truncated = true; + } else { + if (truncated) *truncated = false; } output.resize(0); output.reserve(n); while (n) { + char blk[4096]; int c = read(fd, blk, min(n, sizeof(blk))); if (c <= 0) { - if (c < 0 && errno == EINTR) continue; - break; + if (c == 0) break; + if (errno == EINTR) continue; + return false; } output.append(blk, c); n -= c; } + if (flags & NOCACHE) { #ifdef HAVE_POSIX_FADVISE - if (flags & NOCACHE) +# ifdef __linux__ + // Linux doesn't implement POSIX_FADV_NOREUSE so instead we use + // POSIX_FADV_DONTNEED just before closing the fd. This is a bit more + // aggressive than we ideally want - really we just want to stop our + // reads from pushing other pages out of the OS cache, but if the + // pages we read are already cached it would probably be better to + // leave them cached after the read. posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); +# endif #endif + } + close(fd); - return fd; -} - -bool -load_file(const string &file_name, size_t max_to_read, int flags, - string &output, bool &truncated) { - int fd = load_file_fd(file_name, max_to_read, flags, output, truncated); - if (fd >= 0) - close(fd); - return (fd >= 0); + return true; } diff --git a/xapian-applications/omega/loadfile.h b/xapian-applications/omega/loadfile.h index 96999d1f8..cc525c1a7 100644 --- a/xapian-applications/omega/loadfile.h +++ b/xapian-applications/omega/loadfile.h @@ -1,6 +1,7 @@ -/* loadfile.h: load a file into a std::string. - * - * Copyright (C) 2006,2010,2012 Olly Betts +/** @file loadfile.h + * @brief load a file into a std::string. + */ +/* Copyright (C) 2006,2010,2012,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,24 +25,30 @@ enum { NOCACHE = 0x1, NOATIME = 0x2 }; -int load_file_fd(const std::string &file_name, size_t max_to_read, int flags, - std::string &output, bool &truncated); +bool load_file_from_fd(int fd, std::string& output); -inline int -load_file_fd(const std::string &file_name, std::string &output, int flags = 0) +static inline bool +load_file_from_fd(int fd, size_t size_hint, std::string& output) { - bool dummy; - return load_file_fd(file_name, 0, flags, output, dummy); + output.resize(0); + output.reserve(size_hint); + return load_file_from_fd(fd, output); } -bool load_file(const std::string &file_name, size_t max_to_read, int flags, - std::string &output, bool &truncated); +bool load_file(const std::string& file_name, size_t max_to_read, int flags, + std::string& output, bool* truncated); + +inline bool +load_file(const std::string& file_name, size_t max_to_read, int flags, + std::string& output, bool& truncated) +{ + return load_file(file_name, max_to_read, flags, output, &truncated); +} inline bool -load_file(const std::string &file_name, std::string &output, int flags = 0) +load_file(const std::string& file_name, std::string& output, int flags = 0) { - bool dummy; - return load_file(file_name, 0, flags, output, dummy); + return load_file(file_name, 0, flags, output, nullptr); } #endif // OMEGA_INCLUDED_LOADFILE_H diff --git a/xapian-applications/omega/md5wrap.cc b/xapian-applications/omega/md5wrap.cc index 69e65618d..a3cd9fba5 100644 --- a/xapian-applications/omega/md5wrap.cc +++ b/xapian-applications/omega/md5wrap.cc @@ -1,6 +1,7 @@ -/* md5wrap.cc: wrapper functions to allow easy use of MD5 from C++. - * - * Copyright (C) 2006,2010,2012,2013,2015 Olly Betts +/** @file md5wrap.cc + * @brief wrapper functions to allow easy use of MD5 from C++. + */ +/* Copyright (C) 2006,2010,2012,2013,2015,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,48 +20,20 @@ #include -#ifdef HAVE_POSIX_FADVISE -# ifdef __linux__ -# define _POSIX_C_SOURCE 200112L // for posix_fadvise from fcntl.h -# define _DEFAULT_SOURCE 1 // Needed to get lstat() for glibc >= 2.20 -# define _BSD_SOURCE 1 // Needed to get lstat() for glibc < 2.20 -# endif -#endif +#include "md5wrap.h" #include #include -#include "safefcntl.h" #include "safeunistd.h" #include "md5.h" -#include "md5wrap.h" using namespace std; bool -md5_file(const string &file_name, string &md5, bool try_noatime) +md5_fd(int fd, string& md5) { - mode_t mode = O_RDONLY; -#if defined O_NOATIME && O_NOATIME != 0 - if (try_noatime) mode |= O_NOATIME; -#else - (void)try_noatime; -#endif - - int fd = open(file_name.c_str(), mode); -#if defined O_NOATIME && O_NOATIME != 0 - if (fd < 0 && (mode & O_NOATIME)) { - mode &= ~O_NOATIME; - fd = open(file_name.c_str(), mode); - } -#endif - if (fd < 0) return false; - -#ifdef HAVE_POSIX_FADVISE - posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE); // or POSIX_FADV_SEQUENTIAL -#endif - MD5Context md5_ctx; MD5Init(&md5_ctx); @@ -71,32 +44,25 @@ md5_file(const string &file_name, string &md5, bool try_noatime) if (c == 0) break; if (c < 0) { if (errno == EINTR) continue; - close(fd); return false; } MD5Update(&md5_ctx, blk, c); } -#ifdef HAVE_POSIX_FADVISE - posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); -#endif - - close(fd); - MD5Final(blk, &md5_ctx); - md5.assign(reinterpret_cast(blk), 16); + md5.assign(reinterpret_cast(blk), 16); return true; } void -md5_block(const char * p, size_t len, string &md5) +md5_block(const char* p, size_t len, string& md5) { unsigned char blk[16]; MD5Context md5_ctx; MD5Init(&md5_ctx); - MD5Update(&md5_ctx, reinterpret_cast(p), len); + MD5Update(&md5_ctx, reinterpret_cast(p), len); MD5Final(blk, &md5_ctx); - md5.assign(reinterpret_cast(blk), 16); + md5.assign(reinterpret_cast(blk), 16); } diff --git a/xapian-applications/omega/md5wrap.h b/xapian-applications/omega/md5wrap.h index 2d6a5fa7e..220c5686c 100644 --- a/xapian-applications/omega/md5wrap.h +++ b/xapian-applications/omega/md5wrap.h @@ -1,6 +1,7 @@ -/* md5wrap.h: wrapper functions to allow easy use of MD5 from C++. - * - * Copyright (C) 2006,2007,2010,2013 Olly Betts +/** @file md5wrap.h + * @brief wrapper functions to allow easy use of MD5 from C++. + */ +/* Copyright (C) 2006,2007,2010,2013,2018 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,9 +23,11 @@ #include -bool md5_file(const std::string &file_name, std::string &md5, bool try_noatime); -void md5_block(const char * p, size_t len, std::string &md5); -inline void md5_string(const std::string &str, std::string &md5) { +bool md5_fd(int fd, std::string& md5); + +void md5_block(const char* p, size_t len, std::string& md5); + +inline void md5_string(const std::string& str, std::string& md5) { md5_block(str.data(), str.size(), md5); } -- 2.11.4.GIT