[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / diritor.cc
blob6a75d1c01e080324aab451d1289fccf5763528a6
/** @file
 * @brief Iterator through entries in a directory.
 */
/* Copyright (C) 2007,2008,2010,2011,2012,2013,2014,2018 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
21 #include <config.h>
23 #include "diritor.h"
25 #include "safeunistd.h"
26 #include <sys/types.h>
28 #include <cerrno>
29 #include <cstring>
31 using namespace std;
33 CommitAndExit::CommitAndExit(const char * msg_, const std::string & path,
34 int errno_)
36 msg = msg_;
37 msg += " \"";
38 msg += path;
39 msg += "\" (";
40 msg += strerror(errno_);
41 msg += ")";
44 CommitAndExit::CommitAndExit(const char * msg_, int errno_)
46 msg = msg_;
47 msg += " (";
48 msg += strerror(errno_);
49 msg += ")";
52 CommitAndExit::CommitAndExit(const char * msg_, const char * error)
54 msg = msg_;
55 msg += " (";
56 msg += error;
57 msg += ")";
60 #if defined O_NOATIME && O_NOATIME != 0
61 uid_t DirectoryIterator::euid = geteuid();
62 #endif
64 magic_t DirectoryIterator::magic_cookie = NULL;
66 void
67 DirectoryIterator::call_stat()
69 build_path();
70 int retval;
71 if (fd >= 0) {
72 retval = fstat(fd, &statbuf);
73 #ifdef HAVE_LSTAT
74 } else if (!follow_symlinks) {
75 retval = lstat(path.c_str(), &statbuf);
76 #endif
77 } else {
78 retval = stat(path.c_str(), &statbuf);
80 if (retval == -1) {
81 if (errno == ENOENT || errno == ENOTDIR)
82 throw FileNotFound();
83 if (errno == EACCES)
84 throw string(strerror(errno));
85 // Commit changes to files processed so far.
86 throw CommitAndExit("Can't stat", path, errno);
90 void
91 DirectoryIterator::build_path()
93 if (path.length() == path_len) {
94 path += '/';
95 path += leafname();
99 void
100 DirectoryIterator::open_fd()
102 build_path();
103 mode_t mode = O_BINARY | O_RDONLY;
104 # if defined O_NOATIME && O_NOATIME != 0
105 if (try_noatime()) mode |= O_NOATIME;
106 # endif
107 fd = open(path.c_str(), mode);
108 # if defined O_NOATIME && O_NOATIME != 0
109 if (fd < 0 && (mode & O_NOATIME)) {
110 mode &= ~O_NOATIME;
111 fd = open(path.c_str(), mode);
113 # endif
115 if (fd < 0) {
116 switch (errno) {
117 case ENOENT:
118 case ENOTDIR:
119 throw FileNotFound();
120 case EACCES: {
121 string m("Failed to open file: ");
122 m += strerror(errno);
123 throw m;
126 // Commit changes to files processed so far.
127 throw CommitAndExit("Can't open file", path, errno);
130 #ifdef HAVE_POSIX_FADVISE
131 // On Linux, POSIX_FADV_NOREUSE has been a no-op since 2.6.18 (released
132 // 2006) and before that it was incorrectly implemented as an alias for
133 // POSIX_FADV_WILLNEED. There have been a few attempts to make
134 // POSIX_FADV_NOREUSE actually work on Linux but nothing has been merged so
135 // for now let's not waste effort making a syscall we know to currently be
136 // a no-op. We can revise this conditional if it gets usefully
137 // implemented.
138 # ifndef __linux__
139 posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
140 # endif
141 #endif
144 void
145 DirectoryIterator::close_fd()
147 #ifdef HAVE_POSIX_FADVISE
148 # ifdef __linux__
149 // Linux doesn't implement POSIX_FADV_NOREUSE so instead we use
150 // POSIX_FADV_DONTNEED just before closing the fd. This is a bit more
151 // aggressive than we ideally want - really we just want to stop our
152 // reads from pushing other pages out of the OS cache, but if the
153 // pages we read are already cached it would probably be better to leave
154 // them cached after the read.
155 posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
156 # endif
157 #endif
158 close(fd);
159 fd = -1;
162 void
163 DirectoryIterator::start(const std::string & path_)
165 if (dir) closedir(dir);
166 path = path_;
167 path_len = path.length();
168 dir = opendir(path.c_str());
169 if (dir == NULL) {
170 if (errno == ENOENT || errno == ENOTDIR)
171 throw FileNotFound();
172 if (errno == EACCES)
173 throw string(strerror(errno));
174 // Commit changes to files processed so far.
175 throw CommitAndExit("Can't open directory", path, errno);
179 void
180 DirectoryIterator::next_failed() const
182 // The Linux getdents() syscall (which readdir uses internally) is
183 // documented as being able to return ENOENT and ENOTDIR. Also,
184 // EACCES has been observed here on CIFS mounts.
185 if (errno == ENOENT || errno == ENOTDIR)
186 throw FileNotFound();
187 if (errno == EACCES)
188 throw string(strerror(errno));
189 throw CommitAndExit("Can't read next entry from directory", path, errno);
192 string
193 DirectoryIterator::get_magic_mimetype()
195 if (rare(magic_cookie == NULL)) {
196 #ifdef MAGIC_MIME_TYPE
197 magic_cookie = magic_open(MAGIC_SYMLINK|MAGIC_MIME_TYPE|MAGIC_ERROR);
198 #else
199 // MAGIC_MIME_TYPE was added in 4.22, released 2007-12-27. If we don't
200 // have it then use MAGIC_MIME instead and trim any encoding off below.
201 magic_cookie = magic_open(MAGIC_SYMLINK|MAGIC_MIME|MAGIC_ERROR);
202 #endif
203 if (magic_cookie == NULL) {
204 // Commit changes to files processed so far.
205 throw CommitAndExit("Failed to initialise the file magic library",
206 errno);
208 if (magic_load(magic_cookie, NULL) == -1) {
209 // Commit changes to files processed so far.
210 const char * err = magic_error(magic_cookie);
211 throw CommitAndExit("Failed to load the file magic database", err);
215 const char * res = NULL;
216 // Prior to 5.15, magic_descriptor() closed the fd passed, so avoid it.
217 #if defined MAGIC_VERSION && MAGIC_VERSION - 0 >= 515
218 if (fd >= 0) {
219 if (lseek(fd, 0, SEEK_SET) == 0)
220 res = magic_descriptor(magic_cookie, fd);
221 } else
222 #endif
224 build_path();
225 res = magic_file(magic_cookie, path.c_str());
227 if (!res) {
228 const char * err = magic_error(magic_cookie);
229 if (rare(err)) {
230 int eno = magic_errno(magic_cookie);
231 if (eno == ENOENT || eno == ENOTDIR)
232 throw FileNotFound();
233 string m("Failed to use magic on file: ");
234 m += err;
235 throw m;
237 return string();
240 // Sometimes libmagic returns this string instead of a mime-type for some
241 // Microsoft documents, so pick a suitable MIME content-type based on the
242 // extension. Newer versions seem to return "application/CDFV2-corrupt"
243 // instead for this case (on Debian, file 5.11 gives the former and file
244 // 5.18 the latter).
245 #define COMPOSITE_DOC "Composite Document File V2 Document"
246 if (strncmp(res, COMPOSITE_DOC, sizeof(COMPOSITE_DOC) - 1) == 0 ||
247 strcmp(res, "application/CDFV2-corrupt") == 0) {
248 // Default to something self-explanatory.
249 res = "application/x-compound-document-file";
250 const char * leaf = leafname();
251 const char * ext = strrchr(leaf, '.');
252 if (ext && strlen(++ext) == 3) {
253 char e[3];
254 for (int i = 0; i != 3; ++i) {
255 if (ext[i] <= 'Z' && ext[i] >= 'A')
256 e[i] = ext[i] + ('a' - 'A');
257 else
258 e[i] = ext[i];
260 switch (e[0]) {
261 case 'd':
262 if (e[1] == 'o')
263 res = "application/msword";
264 break;
265 case 'm':
266 if (e[1] == 's' && e[2] == 'g')
267 res = "application/vnd.ms-outlook";
268 break;
269 case 'p':
270 if (e[1] == 'p' || e[1] == 'o')
271 res = "application/vnd.ms-powerpoint";
272 else if (e[1] == 'u' && e[2] == 'b')
273 res = "application/x-mspublisher";
274 break;
275 case 'x':
276 if (e[1] == 'l')
277 res = "application/vnd.ms-excel";
278 break;
279 case 'w':
280 if (e[1] == 'p' && e[2] != 'd')
281 res = "application/vnd.ms-works";
282 break;
285 } else {
286 #ifndef MAGIC_MIME_TYPE
287 // Discard any encoding from mime type value. Prior to version 5.0 the
288 // return value just had a space separator, e.g.:
290 // text/plain charset=us-ascii
292 // 5.0 changed that (but version 4.22 and later have MAGIC_MIME_TYPE
293 // so we don't need to handle this variant here):
295 // text/plain; charset=us-ascii
296 const char* spc = strchr(res, ' ');
297 if (spc) {
298 return string(res, spc - res);
300 #endif
303 return res;