src/encoding/encoding.c

   1 /* Stream reading and decoding (mostly decompression) */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <errno.h>
   8 #include <stdio.h>
   9 #include <string.h>
  10 #include <sys/stat.h> /* OS/2 needs this after sys/types.h */
  11 #include <sys/types.h>
  12 #ifdef HAVE_FCNTL_H
  13 #include <fcntl.h> /* OS/2 needs this after sys/types.h */
  14 #endif
  15 #ifdef HAVE_UNISTD_H
  16 #include <unistd.h>
  17 #endif
  18
  19 #include "elinks.h"
  20
  21 #include "config/options.h"
  22 #include "encoding/encoding.h"
  23 #include "network/state.h"
  24 #include "osdep/osdep.h"
  25 #include "util/memory.h"
  26 #include "util/string.h"
  27
  28
  29 /*************************************************************************
  30   Dummy encoding (ENCODING_NONE)
  31 *************************************************************************/
  32
  33 struct dummy_enc_data {
  34         int fd;
  35 };
  36
  37 static int
  38 dummy_open(struct stream_encoded *stream, int fd)
  39 {
  40         stream->data = mem_alloc(sizeof(struct dummy_enc_data));
  41         if (!stream->data) return -1;
  42
  43         ((struct dummy_enc_data *) stream->data)->fd = fd;
  44
  45         return 0;
  46 }
  47
  48 static int
  49 dummy_read(struct stream_encoded *stream, unsigned char *data, int len)
  50 {
  51         return safe_read(((struct dummy_enc_data *) stream->data)->fd, data, len);
  52 }
  53
  54 static unsigned char *
  55 dummy_decode_buffer(struct stream_encoded *stream, unsigned char *data, int len, int *new_len)
  56 {
  57         unsigned char *buffer = memacpy(data, len);
  58
  59         if (!buffer) return NULL;
  60
  61         *new_len = len;
  62         return buffer;
  63 }
  64
  65 static void
  66 dummy_close(struct stream_encoded *stream)
  67 {
  68         close(((struct dummy_enc_data *) stream->data)->fd);
  69         mem_free(stream->data);
  70 }
  71
  72 static const unsigned char *const dummy_extensions[] = { NULL };
  73
  74 static const struct decoding_backend dummy_decoding_backend = {
  75         "none",
  76         dummy_extensions,
  77         dummy_open,
  78         dummy_read,
  79         dummy_decode_buffer,
  80         dummy_close,
  81 };
  82
  83
  84 /* Dynamic backend area */
  85
  86 #include "encoding/brotli.h"
  87 #include "encoding/bzip2.h"
  88 #include "encoding/deflate.h"
  89 #include "encoding/lzma.h"
  90
  91 static const struct decoding_backend *const decoding_backends[] = {
  92         &dummy_decoding_backend,
  93         &gzip_decoding_backend,
  94         &bzip2_decoding_backend,
  95         &lzma_decoding_backend,
  96         &deflate_decoding_backend,
  97         &brotli_decoding_backend,
  98 };
  99
 100
 101 /*************************************************************************
 102   Public functions
 103 *************************************************************************/
 104
 105
 106 /* Associates encoded stream with a fd. */
 107 struct stream_encoded *
 108 open_encoded(int fd, enum stream_encoding encoding)
 109 {
 110         struct stream_encoded *stream;
 111
 112         stream = mem_alloc(sizeof(*stream));
 113         if (!stream) return NULL;
 114
 115         stream->encoding = encoding;
 116         if (decoding_backends[stream->encoding]->open(stream, fd) >= 0)
 117                 return stream;
 118
 119         mem_free(stream);
 120         return NULL;
 121 }
 122
 123 /* Read available data from stream and decode them. Note that when data change
 124  * their size during decoding, 'len' indicates desired size of _returned_ data,
 125  * not desired size of data read from stream. */
 126 int
 127 read_encoded(struct stream_encoded *stream, unsigned char *data, int len)
 128 {
 129         return decoding_backends[stream->encoding]->read(stream, data, len);
 130 }
 131
 132 /* Decode an entire file from a buffer. This function is not suitable
 133  * for parts of files. @data contains the original data, @len bytes
 134  * long. The resulting decoded data chunk is *@new_len bytes long. */
 135 unsigned char *
 136 decode_encoded_buffer(struct stream_encoded *stream, enum stream_encoding encoding, unsigned char *data, int len,
 137                       int *new_len)
 138 {
 139         return decoding_backends[encoding]->decode_buffer(stream, data, len, new_len);
 140 }
 141
 142 /* Closes encoded stream. Note that fd associated with the stream will be
 143  * closed here. */
 144 void
 145 close_encoded(struct stream_encoded *stream)
 146 {
 147         decoding_backends[stream->encoding]->close(stream);
 148         mem_free(stream);
 149 }
 150
 151
 152 /* Return a list of extensions associated with that encoding. */
 153 const unsigned char *const *listext_encoded(enum stream_encoding encoding)
 154 {
 155         return decoding_backends[encoding]->extensions;
 156 }
 157
 158 enum stream_encoding
 159 guess_encoding(unsigned char *filename)
 160 {
 161         int fname_len = strlen(filename);
 162         unsigned char *fname_end = filename + fname_len;
 163         int enc;
 164
 165         for (enc = 1; enc < ENCODINGS_KNOWN; enc++) {
 166                 const unsigned char *const *ext = decoding_backends[enc]->extensions;
 167
 168                 while (ext && *ext) {
 169                         int len = strlen(*ext);
 170
 171                         if (fname_len >= len && !strcmp(fname_end - len, *ext))
 172                                 return enc;
 173
 174                         ext++;
 175                 }
 176         }
 177
 178         return ENCODING_NONE;
 179 }
 180
 181 const unsigned char *
 182 get_encoding_name(enum stream_encoding encoding)
 183 {
 184         return decoding_backends[encoding]->name;
 185 }
 186
 187
 188 /* File reading */
 189
 190 /* Tries to open @prefixname with each of the supported encoding extensions
 191  * appended. */
 192 static inline enum stream_encoding
 193 try_encoding_extensions(struct string *filename, int *fd)
 194 {
 195         int length = filename->length;
 196         int encoding;
 197
 198         /* No file of that name was found, try some others names. */
 199         for (encoding = 1; encoding < ENCODINGS_KNOWN; encoding++) {
 200                 const unsigned char *const *ext = listext_encoded(encoding);
 201
 202                 for (; ext && *ext; ext++) {
 203                         add_to_string(filename, *ext);
 204
 205                         /* We try with some extensions. */
 206                         *fd = open(filename->source, O_RDONLY | O_NOCTTY);
 207
 208                         if (*fd >= 0)
 209                                 /* Ok, found one, use it. */
 210                                 return encoding;
 211
 212                         filename->source[length] = 0;
 213                         filename->length = length;
 214                 }
 215         }
 216
 217         return ENCODING_NONE;
 218 }
 219
 220 /** Reads the file from @a stream in chunks of size @a readsize.
 221  *
 222  * @a stream should be in blocking mode.  If it is in non-blocking
 223  * mode, this function can return an empty string in @a page just
 224  * because no more data is available yet, and the caller cannot know
 225  * whether the true end of the stream has been reached.
 226  *
 227  * @return a connection state. S_OK if all is well. */
 228 struct connection_state
 229 read_file(struct stream_encoded *stream, int readsize, struct string *page)
 230 {
 231         if (!init_string(page)) return connection_state(S_OUT_OF_MEM);
 232
 233         /* We read with granularity of stt.st_size (given as @readsize) - this
 234          * does best job for uncompressed files, and doesn't hurt for
 235          * compressed ones anyway - very large files usually tend to inflate
 236          * fast anyway. At least I hope ;).  --pasky */
 237         /* Also there because of bug in Linux. Read returns -EACCES when
 238          * reading 0 bytes to invalid address so ensure never to try and
 239          * allocate zero number of bytes. */
 240         if (!readsize) readsize = 4096;
 241
 242         while (realloc_string(page, page->length + readsize)) {
 243                 unsigned char *string_pos = page->source + page->length;
 244                 int readlen = read_encoded(stream, string_pos, readsize);
 245
 246                 if (readlen < 0) {
 247                         done_string(page);
 248
 249                         /* If it is some I/O error (and errno is set) that will
 250                          * do. Since errno == 0 == S_WAIT and we cannot have
 251                          * that. */
 252                         if (errno)
 253                                 return connection_state_for_errno(errno);
 254
 255                         /* FIXME: This is indeed an internal error. If readed from a
 256                          * corrupted encoded file nothing or only some of the
 257                          * data will be read. */
 258                         return connection_state(S_ENCODE_ERROR);
 259
 260                 } else if (readlen == 0) {
 261                         /* NUL-terminate just in case */
 262                         page->source[page->length] = '\0';
 263                         return connection_state(S_OK);
 264                 }
 265
 266                 page->length += readlen;
 267 #if 0
 268                 /* This didn't work so well as it should (I had to implement
 269                  * end of stream handling to bzip2 anyway), so I rather
 270                  * disabled this. */
 271                 if (readlen < readsize) {
 272                         /* This is much safer. It should always mean that we
 273                          * already read everything possible, and it permits us
 274                          * more elegant of handling end of file with bzip2. */
 275                         break;
 276                 }
 277 #endif
 278         }
 279
 280         done_string(page);
 281         return connection_state(S_OUT_OF_MEM);
 282 }
 283
 284 static inline int
 285 is_stdin_pipe(struct stat *stt, struct string *filename)
 286 {
 287         /* On Mac OS X, /dev/stdin has type S_IFSOCK. (bug 616) */
 288         return !strlcmp(filename->source, filename->length, "/dev/stdin", 10)
 289                 && (
 290 #ifdef S_ISSOCK
 291                         S_ISSOCK(stt->st_mode) ||
 292 #endif
 293                         S_ISFIFO(stt->st_mode));
 294 }
 295
 296 struct connection_state
 297 read_encoded_file(struct string *filename, struct string *page)
 298 {
 299         struct stream_encoded *stream;
 300         struct stat stt;
 301         enum stream_encoding encoding = ENCODING_NONE;
 302         int fd = open(filename->source, O_RDONLY | O_NOCTTY);
 303         struct connection_state state = connection_state_for_errno(errno);
 304
 305         if (fd == -1 && get_opt_bool("protocol.file.try_encoding_extensions", NULL)) {
 306                 encoding = try_encoding_extensions(filename, &fd);
 307
 308         } else if (fd != -1) {
 309                 encoding = guess_encoding(filename->source);
 310         }
 311
 312         if (fd == -1) {
 313 #ifdef HAVE_SYS_CYGWIN_H
 314                 /* There is no /dev/stdin on Cygwin. */
 315                 if (!strlcmp(filename->source, filename->length, "/dev/stdin", 10)) {
 316                         fd = STDIN_FILENO;
 317                 } else
 318 #endif
 319                 return state;
 320         }
 321
 322         /* Some file was opened so let's get down to bi'ness */
 323         set_bin(fd);
 324
 325         /* Do all the necessary checks before trying to read the file.
 326          * @state code is used to block further progress. */
 327         if (fstat(fd, &stt)) {
 328                 state = connection_state_for_errno(errno);
 329
 330         } else if (!S_ISREG(stt.st_mode) && encoding != ENCODING_NONE) {
 331                 /* We only want to open regular encoded files. */
 332                 /* Leave @state being the saved errno */
 333
 334         } else if (!S_ISREG(stt.st_mode) && !is_stdin_pipe(&stt, filename)
 335                    && !get_opt_bool("protocol.file.allow_special_files", NULL)) {
 336                 state = connection_state(S_FILE_TYPE);
 337
 338         } else if (!(stream = open_encoded(fd, encoding))) {
 339                 state = connection_state(S_OUT_OF_MEM);
 340
 341         } else {
 342                 int readsize = (int) stt.st_size;
 343
 344                 /* Check if st_size will cause overflow. */
 345                 /* FIXME: See bug 497 for info about support for big files. */
 346                 if (readsize != stt.st_size || readsize < 0) {
 347 #ifdef EFBIG
 348                         state = connection_state_for_errno(EFBIG);
 349 #else
 350                         state = connection_state(S_FILE_ERROR);
 351 #endif
 352
 353                 } else {
 354                         state = read_file(stream, stt.st_size, page);
 355                 }
 356                 close_encoded(stream);
 357         }
 358
 359         close(fd);
 360         return state;
 361 }