From 365e2e24c733188823840bb2b13fafa286d6e028 Mon Sep 17 00:00:00 2001
From: Ali Gholami Rudi <ali@rudi.ir>
Date: Thu, 19 Feb 2015 08:53:36 +0330
Subject: [PATCH] stb_image: update stb_image.h to v2.02

---
 stb_image.c | 2875 ++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 2264 insertions(+), 611 deletions(-)

diff --git a/stb_image.c b/stb_image.c
index 9f8f8e4..c3945c2 100644
--- a/stb_image.c
+++ b/stb_image.c
@@ -1,19 +1,27 @@
-/* stb_image - v1.42 - public domain JPEG/PNG reader - http://nothings.org/stb_image.c
-   when you control the images you're loading
+/* stb_image - v2.02 - public domain image loader - http://nothings.org/stb_image.h
                                      no warranty implied; use at your own risk
 
    Do this:
       #define STB_IMAGE_IMPLEMENTATION
    before you include this file in *one* C or C++ file to create the implementation.
 
-   #define STBI_ASSERT(x) to avoid using assert.h.
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+
 
    QUICK NOTES:
       Primarily of interest to game developers and other people who can
           avoid problematic images and only need the trivial interface
 
-      JPEG baseline (no JPEG progressive)
-      PNG 8-bit-per-channel only
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
+      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
 
       TGA (not sure what subset, if a subset)
       BMP non-1bpp, non-RLE
@@ -22,29 +30,140 @@
       GIF (*comp always reports as 4-channel)
       HDR (radiance rgbE format)
       PIC (Softimage PIC)
+      PNM (PPM and PGM binary only)
 
       - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
       - decode from arbitrary I/O callbacks
-      - overridable dequantizing-IDCT, YCbCr-to-RGB conversion (define STBI_SIMD)
-
-   Latest revisions:
-      1.42 (2014-07-09) no _CRT_SECURE_NO_WARNINGS; error-path fixes; STBI_ASSERT
-      1.41 (2014-06-25) fix search&replace that messed up comments/error messages
-      1.40 (2014-06-22) gcc warning
-      1.39 (2014-06-15) TGA optimization fix, multiple BMP fixes
-      1.38 (2014-06-06) suppress MSVC run-time warnings, fix accidental rename of 'skip'
-      1.37 (2014-06-04) remove duplicate typedef
-      1.36 (2014-06-03) converted to header file, allow reading incorrect iphoned-images without iphone flag
-      1.35 (2014-05-27) warnings, bugfixes, TGA optimization, etc
+      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
+
+   Full documentation under "DOCUMENTATION" below.
+
+
+   Revision 2.00 release notes:
+
+      - Progressive JPEG is now supported.
+
+      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
+
+      - x86 platforms now make use of SSE2 SIMD instructions for
+        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
+        This work was done by Fabian "ryg" Giesen. SSE2 is used by
+        default, but NEON must be enabled explicitly; see docs.
+
+        With other JPEG optimizations included in this version, we see
+        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
+        on a JPEG on an ARM machine, relative to previous versions of this
+        library. The same results will not obtain for all JPGs and for all
+        x86/ARM machines. (Note that progressive JPEGs are significantly
+        slower to decode than regular JPEGs.) This doesn't mean that this
+        is the fastest JPEG decoder in the land; rather, it brings it
+        closer to parity with standard libraries. If you want the fastest
+        decode, look elsewhere. (See "Philosophy" section of docs below.)
+
+        See final bullet items below for more info on SIMD.
+
+      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
+        the memory allocator. Unlike other STBI libraries, these macros don't
+        support a context parameter, so if you need to pass a context in to
+        the allocator, you'll have to store it in a global or a thread-local
+        variable.
+
+      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
+        STBI_NO_LINEAR.
+            STBI_NO_HDR:     suppress implementation of .hdr reader format
+            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
+
+      - You can suppress implementation of any of the decoders to reduce
+        your code footprint by #defining one or more of the following
+        symbols before creating the implementation.
+
+            STBI_NO_JPEG
+            STBI_NO_PNG
+            STBI_NO_BMP
+            STBI_NO_PSD
+            STBI_NO_TGA
+            STBI_NO_GIF
+            STBI_NO_HDR
+            STBI_NO_PIC
+            STBI_NO_PNM   (.ppm and .pgm)
+
+      - You can request *only* certain decoders and suppress all other ones
+        (this will be more forward-compatible, as addition of new decoders
+        doesn't require you to disable them explicitly):
+
+            STBI_ONLY_JPEG
+            STBI_ONLY_PNG
+            STBI_ONLY_BMP
+            STBI_ONLY_PSD
+            STBI_ONLY_TGA
+            STBI_ONLY_GIF
+            STBI_ONLY_HDR
+            STBI_ONLY_PIC
+            STBI_ONLY_PNM   (.ppm and .pgm)
+
+         Note that you can define multiples of these, and you will get all
+         of them ("only x" and "only y" is interpreted to mean "only x&y").
+
+       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+
+      - Compilation of all SIMD code can be suppressed with
+            #define STBI_NO_SIMD
+        It should not be necessary to disable SIMD unless you have issues
+        compiling (e.g. using an x86 compiler which doesn't support SSE
+        intrinsics or that doesn't support the method used to detect
+        SSE2 support at run-time), and even those can be reported as
+        bugs so I can refine the built-in compile-time checking to be
+        smarter.
+
+      - The old STBI_SIMD system which allowed installing a user-defined
+        IDCT etc. has been removed. If you need this, don't upgrade. My
+        assumption is that almost nobody was doing this, and those who
+        were will find the built-in SIMD more satisfactory anyway.
+
+      - RGB values computed for JPEG images are slightly different from
+        previous versions of stb_image. (This is due to using less
+        integer precision in SIMD.) The C code has been adjusted so
+        that the same RGB values will be computed regardless of whether
+        SIMD support is available, so your app should always produce
+        consistent results. But these results are slightly different from
+        previous versions. (Specifically, about 3% of available YCbCr values
+        will compute different RGB results from pre-1.49 versions by +-1;
+        most of the deviating values are one smaller in the G channel.)
+
+      - If you must produce consistent results with previous versions of
+        stb_image, #define STBI_JPEG_OLD and you will get the same results
+        you used to; however, you will not get the SIMD speedups for
+        the YCbCr-to-RGB conversion step (although you should still see
+        significant JPEG speedup from the other changes).
+
+        Please note that STBI_JPEG_OLD is a temporary feature; it will be
+        removed in future versions of the library. It is only intended for
+        near-term back-compatibility use.
+
+
+   Latest revision history:
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
+                         progressive JPEG
+                         PGM/PPM support
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         STBI_NO_*, STBI_ONLY_*
+                         GIF bugfix
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support (both grayscale and paletted)
+                         optimize PNG
+                         fix bug in interlaced PNG with user-specified channel count
+      1.46  (2014-08-26) fix broken tRNS chunk in non-paletted PNG
+      1.45  (2014-08-16) workaround MSVC-ARM internal compiler error by wrapping malloc
 
    See end of file for full revision history.
 
-   TODO:
-      stbi_info support for BMP,PSD,HDR,PIC
-
 
  ============================    Contributors    =========================
-              
+
  Image formats                                Bug fixes & warning fixes
     Sean Barrett (jpeg, png, bmp)                Marc LeBlanc
     Nicolas Schulz (hdr, psd)                    Christpher Lloyd
@@ -52,40 +171,55 @@
     Jean-Marc Lienher (gif)                      Won Chun
     Tom Seddon (pic)                             the Horde3D community
     Thatcher Ulrich (psd)                        Janez Zemva
-                                                 Jonathan Blow
+    Ken Miller (pgm, ppm)                        Jonathan Blow
                                                  Laurent Gomila
- Extensions, features                            Aruelien Pocheville
-    Jetro Lauha (stbi_info)                      Ryamond Barbiero
-    James "moose2000" Brown (iPhone PNG)         David Woo
-    Ben "Disch" Wenger (io callbacks)            Roy Eltham
-    Martin "SpartanJ" Golini                     Luke Graham
-                                                 Thomas Ruf
+                                                 Aruelien Pocheville
+ Extensions, features                            Ryamond Barbiero
+    Jetro Lauha (stbi_info)                      David Woo
+    Martin "SpartanJ" Golini (stbi_info)         Martin Golini
+    James "moose2000" Brown (iPhone PNG)         Roy Eltham
+    Ben "Disch" Wenger (io callbacks)            Luke Graham
+    Omar Cornut (1/2/4-bit PNG)                  Thomas Ruf
                                                  John Bartholomew
- Optimizations & bugfixes                        Ken Hamada
-    Fabian "ryg" Giesen                          Cort Stratton
-    Arseny Kapoulkine                            Blazej Dariusz Roszkowski
-                                                 Thibault Reuille
- If your name should be here but                 Paul Du Bois
- isn't, let Sean know.                           Guillaume George
-                                                 Jerry Jansson
-                                                 Hayaki Saito
+                                                 Ken Hamada
+ Optimizations & bugfixes                        Cort Stratton
+    Fabian "ryg" Giesen                          Blazej Dariusz Roszkowski
+    Arseny Kapoulkine                            Thibault Reuille
+                                                 Paul Du Bois
+                                                 Guillaume George
+  If your name should be here but                Jerry Jansson
+  isn't, let Sean know.                          Hayaki Saito
                                                  Johan Duparc
+                                                 Ronny Chevalier
+                                                 Michal Cichon
+                                                 Tero Hanninen
+                                                 Sergio Gonzalez
+                                                 Cass Everitt
+                                                 Engin Manap
+
+License:
+   This software is in the public domain. Where that dedication is not
+   recognized, you are granted a perpetual, irrevocable license to copy
+   and modify this file however you want.
+
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
 #define STBI_INCLUDE_STB_IMAGE_H
 
+// DOCUMENTATION
+//
 // Limitations:
-//    - no jpeg progressive support
-//    - non-HDR formats support 8-bit samples only (jpeg, png)
-//    - no delayed line count (jpeg) -- IJG doesn't support either
+//    - no 16-bit-per-channel PNG
+//    - no 12-bit-per-channel JPEG
+//    - no JPEGs with arithmetic coding
 //    - no 1-bit BMP
 //    - GIF always returns *comp=4
 //
-// Basic usage (see HDR discussion below):
+// Basic usage (see HDR discussion below for HDR usage):
 //    int x,y,n;
 //    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
-//    // ... process data if not NULL ... 
+//    // ... process data if not NULL ...
 //    // ... x = width, y = height, n = # 8-bit components per pixel ...
 //    // ... replace '0' with '1'..'4' to force that many components per pixel
 //    // ... but 'n' will always be the number that it would have been if you said 0
@@ -98,14 +232,16 @@
 //    int req_comp -- if non-zero, # of image components requested in result
 //
 // The return value from an image loader is an 'unsigned char *' which points
-// to the pixel data. The pixel data consists of *y scanlines of *x pixels,
+// to the pixel data, or NULL on an allocation failure or if the image is
+// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
 // with each pixel consisting of N interleaved 8-bit components; the first
 // pixel pointed to is top-left-most in the image. There is no padding between
 // image scanlines or between pixels, regardless of format. The number of
 // components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
 // If req_comp is non-zero, *comp has the number of components that _would_
 // have been output otherwise. E.g. if you set req_comp to 4, you will always
-// get RGBA output, but you can check *comp to easily see if it's opaque.
+// get RGBA output, but you can check *comp to see if it's trivially opaque
+// because e.g. there were only 3 channels in the source image.
 //
 // An output image with N components has the following components interleaved
 // in this order in each pixel:
@@ -127,18 +263,66 @@
 //
 // ===========================================================================
 //
-// iPhone PNG support:
+// Philosophy
 //
-// By default we convert iphone-formatted PNGs back to RGB; nominally they
-// would silently load as BGR, except the existing code should have just
-// failed on such iPhone PNGs. But you can disable this conversion by
-// by calling stbi_convert_iphone_png_to_rgb(0), in which case
-// you will always just get the native iphone "format" through.
+// stb libraries are designed with the following priorities:
 //
-// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
-// pixel to remove any premultiplied alpha *only* if the image file explicitly
-// says there's premultiplied data (currently only happens in iPhone images,
-// and only if iPhone convert-to-rgb processing is on).
+//    1. easy to use
+//    2. easy to maintain
+//    3. good performance
+//
+// Sometimes I let "good performance" creep up in priority over "easy to maintain",
+// and for best performance I may provide less-easy-to-use APIs that give higher
+// performance, in addition to the easy to use ones. Nevertheless, it's important
+// to keep in mind that from the standpoint of you, a client of this library,
+// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+//
+// Some secondary priorities arise directly from the first two, some of which
+// make more explicit reasons why performance can't be emphasized.
+//
+//    - Portable ("ease of use")
+//    - Small footprint ("easy to maintain")
+//    - No dependencies ("ease of use")
+//
+// ===========================================================================
+//
+// I/O callbacks
+//
+// I/O callbacks allow you to read from arbitrary sources, like packaged
+// files or some other source. Data read from callbacks are processed
+// through a small internal buffer (currently 128 bytes) to try to reduce
+// overhead.
+//
+// The three functions you must define are "read" (reads some bytes of data),
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+//
+// ===========================================================================
+//
+// SIMD support
+//
+// The JPEG decoder will try to automatically use SIMD kernels on x86 when
+// supported by the compiler. For ARM Neon support, you must explicitly
+// request it.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
+// On x86, SSE2 will automatically be used when available based on a run-time
+// test; if not, the generic C versions are used as a fall-back. On ARM targets,
+// the typical path is to have separate builds for NEON and non-NEON devices
+// (at least this is true for iOS and Android). Therefore, the NEON support is
+// toggled by a build flag: define STBI_NEON to get NEON loops.
+//
+// The output of the JPEG decoder is slightly different from versions where
+// SIMD support was introduced (that is, for versions before 1.49). The
+// difference is only +-1 in the 8-bit RGB channels, and only on a small
+// fraction of pixels. You can force the pre-1.49 behavior by defining
+// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
+// and hence cost some performance.
+//
+// If for some reason you do not want to use any of SIMD code, or if
+// you have issues compiling it, you can disable it entirely by
+// defining STBI_NO_SIMD.
 //
 // ===========================================================================
 //
@@ -161,7 +345,7 @@
 // (linear) floats to preserve the full dynamic range:
 //
 //    float *data = stbi_loadf(filename, &x, &y, &n, 0);
-// 
+//
 // If you load LDR images through this interface, those images will
 // be promoted to floating point values, run through the inverse of
 // constants corresponding to the above:
@@ -178,15 +362,19 @@
 //
 // ===========================================================================
 //
-// I/O callbacks
+// iPhone PNG support:
 //
-// I/O callbacks allow you to read from arbitrary sources, like packaged
-// files or some other source. Data read from callbacks are processed
-// through a small internal buffer (currently 128 bytes) to try to reduce
-// overhead. 
+// By default we convert iphone-formatted PNGs back to RGB, even though
+// they are internally encoded differently. You can disable this conversion
+// by by calling stbi_convert_iphone_png_to_rgb(0), in which case
+// you will always just get the native iphone "format" through (which
+// is BGR stored in RGB).
+//
+// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
+// pixel to remove any premultiplied alpha *only* if the image file explicitly
+// says there's premultiplied data (currently only happens in iPhone images,
+// and only if iPhone convert-to-rgb processing is on).
 //
-// The three functions you must define are "read" (reads some bytes of data),
-// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
 
 
 #ifndef STBI_NO_STDIO
@@ -226,41 +414,43 @@ extern "C" {
 // load image by filename, open file, or memory buffer
 //
 
-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load            (char const *filename,     int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
-// for stbi_load_from_file, file pointer is left pointing immediately after image
-#endif
-
 typedef struct
 {
-   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read 
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
    void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
    int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
-STBIDEF stbi_uc *stbi_load_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
 
-#ifndef STBI_NO_HDR
-   STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
 
    #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf            (char const *filename,   int *x, int *y, int *comp, int req_comp);
    STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
    #endif
-   
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+#endif
 
+#ifndef STBI_NO_HDR
    STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
    STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif
 
+#ifndef STBI_NO_LINEAR
    STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
    STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
 #endif // STBI_NO_HDR
 
-// stbi_is_hdr is always defined
+// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
 STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
 STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
 #ifndef STBI_NO_STDIO
@@ -271,7 +461,7 @@ STBIDEF int      stbi_is_hdr_from_file(FILE *f);
 
 // get a VERY brief reason for failure
 // NOT THREADSAFE
-STBIDEF const char *stbi_failure_reason  (void); 
+STBIDEF const char *stbi_failure_reason  (void);
 
 // free the loaded image -- this is just free()
 STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
@@ -309,26 +499,6 @@ STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int
 STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
 
 
-// define faster low-level operations (typically SIMD support)
-#ifdef STBI_SIMD
-typedef void (*stbi_idct_8x8)(stbi_uc *out, int out_stride, short data[64], unsigned short *dequantize);
-// compute an integer IDCT on "input"
-//     input[x] = data[x] * dequantize[x]
-//     write results to 'out': 64 samples, each run of 8 spaced by 'out_stride'
-//                             CLAMP results to 0..255
-typedef void (*stbi_YCbCr_to_RGB_run)(stbi_uc *output, stbi_uc const  *y, stbi_uc const *cb, stbi_uc const *cr, int count, int step);
-// compute a conversion from YCbCr to RGB
-//     'count' pixels
-//     write pixels to 'output'; each pixel is 'step' bytes (either 3 or 4; if 4, write '255' as 4th), order R,G,B
-//     y: Y input channel
-//     cb: Cb input channel; scale/biased to be 0..255
-//     cr: Cr input channel; scale/biased to be 0..255
-
-STBIDEF void stbi_install_idct(stbi_idct_8x8 func);
-STBIDEF void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func);
-#endif // STBI_SIMD
-
-
 #ifdef __cplusplus
 }
 #endif
@@ -340,22 +510,62 @@ STBIDEF void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func);
 
 #ifdef STB_IMAGE_IMPLEMENTATION
 
-#ifndef STBI_NO_HDR
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
 #include <math.h>  // ldexp
-#include <string.h> // strcmp, strtok
 #endif
 
 #ifndef STBI_NO_STDIO
 #include <stdio.h>
 #endif
-#include <stdlib.h>
-#include <string.h>
+
 #ifndef STBI_ASSERT
 #include <assert.h>
 #define STBI_ASSERT(x) assert(x)
 #endif
-#include <stdarg.h>
-#include <stddef.h> // ptrdiff_t on osx
+
 
 #ifndef _MSC_VER
    #ifdef __cplusplus
@@ -400,6 +610,94 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
    #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
 #endif
 
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && defined(STBI_REALLOC)
+// ok
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC)
+// ok
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)    malloc(sz)
+#define STBI_REALLOC(p,sz) realloc(p,sz)
+#define STBI_FREE(p)       free(p)
+#endif
+
+#if defined(__GNUC__) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
+// this is just broken and gcc are jerks for not fixing it properly
+// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+static int stbi__sse2_available()
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
+}
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+static int stbi__sse2_available()
+{
+#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 // GCC 4.8 or later
+   // GCC 4.8+ has a nice way to do this
+   return __builtin_cpu_supports("sse2");
+#else
+   // portable way to do this, preferably without using GCC inline ASM?
+   // just bail for now.
+   return 0;
+#endif
+}
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+// assume GCC or Clang on ARM targets
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
 ///////////////////////////////////////////////
 //
 //  stbi__context struct and start_xxx functions
@@ -410,7 +708,7 @@ typedef struct
 {
    stbi__uint32 img_x, img_y;
    int img_n, img_out_n;
-   
+
    stbi_io_callbacks io;
    void *io_user_data;
 
@@ -486,29 +784,59 @@ static void stbi__rewind(stbi__context *s)
    s->img_buffer = s->img_buffer_original;
 }
 
+#ifndef STBI_NO_JPEG
 static int      stbi__jpeg_test(stbi__context *s);
 static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
 static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNG
 static int      stbi__png_test(stbi__context *s);
 static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
 static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_BMP
 static int      stbi__bmp_test(stbi__context *s);
 static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_TGA
 static int      stbi__tga_test(stbi__context *s);
 static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
 static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PSD
 static int      stbi__psd_test(stbi__context *s);
 static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
 #ifndef STBI_NO_HDR
 static int      stbi__hdr_test(stbi__context *s);
 static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
+
+#ifndef STBI_NO_PIC
 static int      stbi__pic_test(stbi__context *s);
 static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_GIF
 static int      stbi__gif_test(stbi__context *s);
 static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
 static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
 
+#ifndef STBI_NO_PNM
+static int      stbi__pnm_test(stbi__context *s);
+static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
 
 // this is not threadsafe
 static const char *stbi__g_failure_reason;
@@ -524,6 +852,11 @@ static int stbi__err(const char *str)
    return 0;
 }
 
+static void *stbi__malloc(size_t size)
+{
+    return STBI_MALLOC(size);
+}
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -541,22 +874,40 @@ static int stbi__err(const char *str)
 
 STBIDEF void stbi_image_free(void *retval_from_stbi_load)
 {
-   free(retval_from_stbi_load);
+   STBI_FREE(retval_from_stbi_load);
 }
 
-#ifndef STBI_NO_HDR
+#ifndef STBI_NO_LINEAR
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+#endif
+
+#ifndef STBI_NO_HDR
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
 #endif
 
 static unsigned char *stbi_load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
 {
+   #ifndef STBI_NO_JPEG
    if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PNG
    if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_BMP
    if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_GIF
    if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PSD
    if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PIC
    if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   #endif
 
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
@@ -565,28 +916,31 @@ static unsigned char *stbi_load_main(stbi__context *s, int *x, int *y, int *comp
    }
    #endif
 
+   #ifndef STBI_NO_TGA
    // test tga last because it's a crappy test!
    if (stbi__tga_test(s))
       return stbi__tga_load(s,x,y,comp,req_comp);
+   #endif
+
    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 }
 
 #ifndef STBI_NO_STDIO
 
-FILE *stbi__fopen(char const *filename, char const *mode)
+static FILE *stbi__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, "rb"))
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
       f=0;
 #else
-   f = fopen(filename, "rb");
+   f = fopen(filename, mode);
 #endif
    return f;
 }
 
 
-STBIDEF unsigned char *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
 {
    FILE *f = stbi__fopen(filename, "rb");
    unsigned char *result;
@@ -596,7 +950,7 @@ STBIDEF unsigned char *stbi_load(char const *filename, int *x, int *y, int *comp
    return result;
 }
 
-STBIDEF unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 {
    unsigned char *result;
    stbi__context s;
@@ -610,23 +964,22 @@ STBIDEF unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, i
 }
 #endif //!STBI_NO_STDIO
 
-STBIDEF unsigned char *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_mem(&s,buffer,len);
    return stbi_load_main(&s,x,y,comp,req_comp);
 }
 
-unsigned char *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    return stbi_load_main(&s,x,y,comp,req_comp);
 }
 
-#ifndef STBI_NO_HDR
-
-float *stbi_loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+#ifndef STBI_NO_LINEAR
+static float *stbi_loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
 {
    unsigned char *data;
    #ifndef STBI_NO_HDR
@@ -639,14 +992,14 @@ float *stbi_loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp
    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
 }
 
-float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_mem(&s,buffer,len);
    return stbi_loadf_main(&s,x,y,comp,req_comp);
 }
 
-float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
@@ -654,7 +1007,7 @@ float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int
 }
 
 #ifndef STBI_NO_STDIO
-float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
 {
    float *result;
    FILE *f = stbi__fopen(filename, "rb");
@@ -664,7 +1017,7 @@ float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
    return result;
 }
 
-float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_file(&s,f);
@@ -672,13 +1025,13 @@ float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
 }
 #endif // !STBI_NO_STDIO
 
-#endif // !STBI_NO_HDR
+#endif // !STBI_NO_LINEAR
 
-// these is-hdr-or-not is defined independent of whether STBI_NO_HDR is
-// defined, for API simplicity; if STBI_NO_HDR is defined, it always
+// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is
+// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
 // reports false!
 
-int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
 {
    #ifndef STBI_NO_HDR
    stbi__context s;
@@ -726,17 +1079,17 @@ STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void
    #endif
 }
 
-#ifndef STBI_NO_HDR
 static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
 static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
 
-void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
-void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
-
-void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
-void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
+#ifndef STBI_NO_LINEAR
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 #endif
 
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
+
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -745,9 +1098,9 @@ void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 
 enum
 {
-   SCAN_load=0,
-   SCAN_type,
-   SCAN_header
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,
+   STBI__SCAN_header
 };
 
 static void stbi__refill_buffer(stbi__context *s)
@@ -755,7 +1108,7 @@ static void stbi__refill_buffer(stbi__context *s)
    int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
    if (n == 0) {
       // at end of file, treat same as if from memory, but need to handle case
-      // where s->img_buffer isn't pointing to safe memory, stbi__err.g. 0-byte file
+      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
       s->read_from_callbacks = 0;
       s->img_buffer = s->buffer_start;
       s->img_buffer_end = s->buffer_start+1;
@@ -786,7 +1139,7 @@ stbi_inline static int stbi__at_eof(stbi__context *s)
       if (s->read_from_callbacks == 0) return 1;
    }
 
-   return s->img_buffer >= s->img_buffer_end;   
+   return s->img_buffer >= s->img_buffer_end;
 }
 
 static void stbi__skip(stbi__context *s, int n)
@@ -810,7 +1163,7 @@ static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
          int res, count;
 
          memcpy(buffer, s->img_buffer, blen);
-         
+
          count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
          res = (count == (n-blen));
          s->img_buffer = s->img_buffer_end;
@@ -850,10 +1203,13 @@ static stbi__uint32 stbi__get32le(stbi__context *s)
    return z + (stbi__get16le(s) << 16);
 }
 
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 //  generic converter from built-in img_n to req_comp
-//    individual types do this automatically as much as possible (stbi__err.g. jpeg
+//    individual types do this automatically as much as possible (e.g. jpeg
 //    does all cases internally since it needs to colorspace convert anyway,
 //    and it never has alpha, so very few cases ). png can automatically
 //    interleave an alpha=255 channel, but falls back to this for other cases
@@ -874,9 +1230,9 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
    if (req_comp == img_n) return data;
    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-   good = (unsigned char *) malloc(req_comp * x * y);
+   good = (unsigned char *) stbi__malloc(req_comp * x * y);
    if (good == NULL) {
-      free(data);
+      STBI_FREE(data);
       return stbi__errpuc("outofmem", "Out of memory");
    }
 
@@ -906,16 +1262,16 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
       #undef CASE
    }
 
-   free(data);
+   STBI_FREE(data);
    return good;
 }
 
-#ifndef STBI_NO_HDR
+#ifndef STBI_NO_LINEAR
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 {
    int i,k,n;
-   float *output = (float *) malloc(x * y * comp * sizeof(float));
-   if (output == NULL) { free(data); return stbi__errpf("outofmem", "Out of memory"); }
+   float *output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
    for (i=0; i < x*y; ++i) {
@@ -924,16 +1280,18 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
       }
       if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f;
    }
-   free(data);
+   STBI_FREE(data);
    return output;
 }
+#endif
 
+#ifndef STBI_NO_HDR
 #define stbi__float2int(x)   ((int) (x))
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 {
    int i,k,n;
-   stbi_uc *output = (stbi_uc *) malloc(x * y * comp);
-   if (output == NULL) { free(data); return stbi__errpuc("outofmem", "Out of memory"); }
+   stbi_uc *output = (stbi_uc *) stbi__malloc(x * y * comp);
+   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
    for (i=0; i < x*y; ++i) {
@@ -950,17 +1308,16 @@ static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
          output[i*comp + k] = (stbi_uc) stbi__float2int(z);
       }
    }
-   free(data);
+   STBI_FREE(data);
    return output;
 }
 #endif
 
 //////////////////////////////////////////////////////////////////////////////
 //
-//  "baseline" JPEG/JFIF decoder (not actually fully baseline implementation)
+//  "baseline" JPEG/JFIF decoder
 //
 //    simple implementation
-//      - channel subsampling of at most 2 in each dimension
 //      - doesn't support delayed output of y-dimension
 //      - simple interface (only one output format: 8-bit interleaved RGB)
 //      - doesn't try to recover corrupt jpegs
@@ -974,13 +1331,10 @@ static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 //      - quality integer IDCT derived from IJG's 'slow'
 //    performance
 //      - fast huffman; reasonable integer IDCT
+//      - some SIMD kernels for common paths on targets with SSE2/NEON
 //      - uses a lot of intermediate memory, could cache poorly
-//      - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4
-//          stb_jpeg:   1.34 seconds (MSVC6, default release build)
-//          stb_jpeg:   1.06 seconds (MSVC6, processor = Pentium Pro)
-//          IJL11.dll:  1.08 seconds (compiled by intel)
-//          IJG 1998:   0.98 seconds (MSVC6, makefile provided by IJG)
-//          IJG 1998:   0.95 seconds (MSVC6, makefile + proc=PPro)
+
+#ifndef STBI_NO_JPEG
 
 // huffman decoding acceleration
 #define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
@@ -998,13 +1352,11 @@ typedef struct
 
 typedef struct
 {
-   #ifdef STBI_SIMD
-   unsigned short dequant2[4][64];
-   #endif
    stbi__context *s;
    stbi__huffman huff_dc[4];
    stbi__huffman huff_ac[4];
    stbi_uc dequant[4][64];
+   stbi__int16 fast_ac[4][1 << FAST_BITS];
 
 // sizes for components, interleaved MCUs
    int img_h_max, img_v_max;
@@ -1022,17 +1374,31 @@ typedef struct
 
       int x,y,w2,h2;
       stbi_uc *data;
-      void *raw_data;
+      void *raw_data, *raw_coeff;
       stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
    } img_comp[4];
 
-   stbi__uint32         code_buffer; // jpeg entropy-coded buffer
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
    int            code_bits;   // number of valid bits
    unsigned char  marker;      // marker seen while filling entropy buffer
    int            nomore;      // flag if we saw a marker so must stop
 
+   int            progressive;
+   int            spec_start;
+   int            spec_end;
+   int            succ_high;
+   int            succ_low;
+   int            eob_run;
+
    int scan_n, order[4];
    int restart_interval, todo;
+
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
 } stbi__jpeg;
 
 static int stbi__build_huffman(stbi__huffman *h, int *count)
@@ -1076,6 +1442,33 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
    return 1;
 }
 
+// build a table that decodes both magnitude and value of small ACs in
+// one go.
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{
+   int i;
+   for (i=0; i < (1 << FAST_BITS); ++i) {
+      stbi_uc fast = h->fast[i];
+      fast_ac[i] = 0;
+      if (fast < 255) {
+         int rs = h->values[fast];
+         int run = (rs >> 4) & 15;
+         int magbits = rs & 15;
+         int len = h->size[fast];
+
+         if (magbits && len + magbits <= FAST_BITS) {
+            // magnitude code followed by receive_extend code
+            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+            int m = 1 << (magbits - 1);
+            if (k < m) k += (-1 << magbits) + 1;
+            // if the result is small enough, we can fit it in fast_ac table
+            if (k >= -128 && k <= 127)
+               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+         }
+      }
+   }
+}
+
 static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 {
    do {
@@ -1146,31 +1539,45 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
    return h->values[c];
 }
 
+// bias[n] = (-1<<n) + 1
+static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
 stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
 {
-   unsigned int m = 1 << (n-1);
    unsigned int k;
+   int sgn;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
 
-   #if 1
+   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
    k = stbi_lrot(j->code_buffer, n);
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
    j->code_bits -= n;
-   #else
-   k = (j->code_buffer >> (32 - n)) & stbi__bmask[n];
+   return k + (stbi__jbias[n] & ~sgn);
+}
+
+// get some unsigned bits
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
    j->code_bits -= n;
-   j->code_buffer <<= n;
-   #endif
-   // the following test is probably a random branch that won't
-   // predict well. I tried to table accelerate it but failed.
-   // maybe it's compiling as a conditional move?
-   if (k < m)
-      return (-1 << n) + k + 1;
-   else
-      return k;
+   return k;
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
 }
 
 // given a value that's at position X in the zigzag stream,
@@ -1191,10 +1598,13 @@ static stbi_uc stbi__jpeg_dezigzag[64+15] =
 };
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, int b)
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
 {
    int diff,dc,k;
-   int t = stbi__jpeg_huff_decode(j, hdc);
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
    if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
 
    // 0 all the ac values now so we can do it 32-bits at a time
@@ -1203,28 +1613,189 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
    diff = t ? stbi__extend_receive(j, t) : 0;
    dc = j->img_comp[b].dc_pred + diff;
    j->img_comp[b].dc_pred = dc;
-   data[0] = (short) dc;
+   data[0] = (short) (dc * dequant[0]);
 
    // decode AC components, see JPEG spec
    k = 1;
    do {
-      int r,s;
-      int rs = stbi__jpeg_huff_decode(j, hac);
-      if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-      s = rs & 15;
-      r = rs >> 4;
-      if (s == 0) {
-         if (rs != 0xf0) break; // end block
-         k += 16;
-      } else {
-         k += r;
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         j->code_buffer <<= s;
+         j->code_bits -= s;
          // decode into unzigzag'd location
-         data[stbi__jpeg_dezigzag[k++]] = (short) stbi__extend_receive(j,s);
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;
+         r = rs >> 4;
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16;
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
       }
    } while (k < 64);
    return 1;
 }
 
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      diff = t ? stbi__extend_receive(j, t) : 0;
+
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      data[0] = (short) (dc << j->succ_low);
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
+}
+
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) {
+      int shift = j->succ_low;
+
+      if (j->eob_run) {
+         --j->eob_run;
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) << shift);
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16;
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) << shift);
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) {
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else
+                  r = 16; // r=15 is the code for 16 0s
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k]];
+               if (*p != 0) {
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+                  ++k;
+               } else {
+                  if (r == 0) {
+                     if (s)
+                        data[stbi__jpeg_dezigzag[k++]] = (short) s;
+                     break;
+                  }
+                  --r;
+                  ++k;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
 // take a -128..127 value and stbi__clamp it and convert to 0..255
 stbi_inline static stbi_uc stbi__clamp(int x)
 {
@@ -1236,21 +1807,21 @@ stbi_inline static stbi_uc stbi__clamp(int x)
    return (stbi_uc) x;
 }
 
-#define stbi__f2f(x)  (int) (((x) * 4096 + 0.5))
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
 #define stbi__fsh(x)  ((x) << 12)
 
 // derived from jidctint -- DCT_ISLOW
-#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7)       \
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
    int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
    p2 = s2;                                    \
    p3 = s6;                                    \
-   p1 = (p2+p3) * stbi__f2f(0.5411961f);             \
-   t2 = p1 + p3*stbi__f2f(-1.847759065f);            \
-   t3 = p1 + p2*stbi__f2f( 0.765366865f);            \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
    p2 = s0;                                    \
    p3 = s4;                                    \
-   t0 = stbi__fsh(p2+p3);                            \
-   t1 = stbi__fsh(p2-p3);                            \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
    x0 = t0+t3;                                 \
    x3 = t0-t3;                                 \
    x1 = t1+t2;                                 \
@@ -1263,36 +1834,28 @@ stbi_inline static stbi_uc stbi__clamp(int x)
    p4 = t1+t3;                                 \
    p1 = t0+t3;                                 \
    p2 = t1+t2;                                 \
-   p5 = (p3+p4)*stbi__f2f( 1.175875602f);            \
-   t0 = t0*stbi__f2f( 0.298631336f);                 \
-   t1 = t1*stbi__f2f( 2.053119869f);                 \
-   t2 = t2*stbi__f2f( 3.072711026f);                 \
-   t3 = t3*stbi__f2f( 1.501321110f);                 \
-   p1 = p5 + p1*stbi__f2f(-0.899976223f);            \
-   p2 = p5 + p2*stbi__f2f(-2.562915447f);            \
-   p3 = p3*stbi__f2f(-1.961570560f);                 \
-   p4 = p4*stbi__f2f(-0.390180644f);                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
    t3 += p1+p4;                                \
    t2 += p2+p3;                                \
    t1 += p2+p4;                                \
    t0 += p1+p3;
 
-#ifdef STBI_SIMD
-typedef unsigned short stbi_dequantize_t;
-#else
-typedef stbi_uc stbi_dequantize_t;
-#endif
-
-// .344 seconds on 3*anemones.jpg
-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64], stbi_dequantize_t *dequantize)
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
 {
    int i,val[64],*v=val;
-   stbi_dequantize_t *dq = dequantize;
    stbi_uc *o;
    short *d = data;
 
    // columns
-   for (i=0; i < 8; ++i,++d,++dq, ++v) {
+   for (i=0; i < 8; ++i,++d, ++v) {
       // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
       if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
            && d[40]==0 && d[48]==0 && d[56]==0) {
@@ -1300,11 +1863,10 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64], stbi_
          //    (1|2|3|4|5|6|7)==0          0     seconds
          //    all separate               -0.047 seconds
          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0] * dq[0] << 2;
+         int dcterm = d[0] << 2;
          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
       } else {
-         STBI__IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24],
-                 d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56])
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
          // constants scaled things up by 1<<12; let's bring them back
          // down, but keep 2 extra bits of precision
          x0 += 512; x1 += 512; x2 += 512; x3 += 512;
@@ -1345,14 +1907,394 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64], stbi_
    }
 }
 
-#ifdef STBI_SIMD
-static stbi_idct_8x8 stbi__idct_installed = stbi__idct_block;
+#ifdef STBI_SSE2
+// sse2 integer IDCT. not the fastest possible implementation but it
+// produces bit-identical results to the generic C version so it's
+// fully "transparent".
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
 
-STBIDEF void stbi_install_idct(stbi_idct_8x8 func)
-{
-   stbi__idct_installed = func;
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
+
+   // load
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass
+   dct_pass(bias_1, 17);
+
+   {
+      // pack
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+}
+
+#endif // STBI_SSE2
+
+#ifdef STBI_NEON
+
+// NEON integer IDCT. should produce bit-identical
+// results to the generic C version.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+// wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
+
+   // add DC bias
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+   // column pass
+   dct_pass(vrshrn_n_s32, 10);
+
+   // 16bit 8x8 transpose
+   {
+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+// whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
+
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
+
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+   }
+
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
+
+   {
+      // pack and round
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
+
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
+
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
+
+      // store
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+   }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
 }
-#endif
+
+#endif // STBI_NEON
 
 #define STBI__MARKER_none  0xff
 // if there's a pending marker from the entropy stream, return that
@@ -1383,6 +2325,7 @@ static void stbi__jpeg_reset(stbi__jpeg *j)
    j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
    j->marker = STBI__MARKER_none;
    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+   j->eob_run = 0;
    // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
    // since we don't even allow 1<<30 pixels
 }
@@ -1390,73 +2333,151 @@ static void stbi__jpeg_reset(stbi__jpeg *j)
 static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
 {
    stbi__jpeg_reset(z);
-   if (z->scan_n == 1) {
-      int i,j;
-      #ifdef STBI_SIMD
-      __declspec(align(16))
-      #endif
-      short data[64];
-      int n = z->order[0];
-      // non-interleaved data, we just need to process one block at a time,
-      // in trivial scanline order
-      // number of blocks to do just depends on how many actual "pixels" this
-      // component has, independent of interleaved MCU blocking and such
-      int w = (z->img_comp[n].x+7) >> 3;
-      int h = (z->img_comp[n].y+7) >> 3;
-      for (j=0; j < h; ++j) {
-         for (i=0; i < w; ++i) {
-            if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+z->img_comp[n].ha, n)) return 0;
-            #ifdef STBI_SIMD
-            stbi__idct_installed(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data, z->dequant2[z->img_comp[n].tq]);
-            #else
-            stbi__idct_block(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data, z->dequant[z->img_comp[n].tq]);
-            #endif
-            // every data block is an MCU, so countdown the restart interval
-            if (--z->todo <= 0) {
-               if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-               // if it's NOT a restart, then just bail, so we get corrupt data
-               // rather than no data
-               if (!STBI__RESTART(z->marker)) return 1;
-               stbi__jpeg_reset(z);
+   if (!z->progressive) {
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  // if it's NOT a restart, then just bail, so we get corrupt data
+                  // rather than no data
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
             }
          }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x)*8;
+                        int y2 = (j*z->img_comp[n].v + y)*8;
+                        int ha = z->img_comp[n].ha;
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
       }
-   } else { // interleaved!
-      int i,j,k,x,y;
-      short data[64];
-      for (j=0; j < z->img_mcu_y; ++j) {
-         for (i=0; i < z->img_mcu_x; ++i) {
-            // scan an interleaved mcu... process scan_n components in order
-            for (k=0; k < z->scan_n; ++k) {
-               int n = z->order[k];
-               // scan out an mcu's worth of this component; that's just determined
-               // by the basic H and V specified for the component
-               for (y=0; y < z->img_comp[n].v; ++y) {
-                  for (x=0; x < z->img_comp[n].h; ++x) {
-                     int x2 = (i*z->img_comp[n].h + x)*8;
-                     int y2 = (j*z->img_comp[n].v + y)*8;
-                     if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+z->img_comp[n].ha, n)) return 0;
-                     #ifdef STBI_SIMD
-                     stbi__idct_installed(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data, z->dequant2[z->img_comp[n].tq]);
-                     #else
-                     stbi__idct_block(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data, z->dequant[z->img_comp[n].tq]);
-                     #endif
+   } else {
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               if (z->spec_start == 0) {
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x);
+                        int y2 = (j*z->img_comp[n].v + y);
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                           return 0;
+                     }
                   }
                }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
             }
-            // after all interleaved components, that's an interleaved MCU,
-            // so now count down the restart interval
-            if (--z->todo <= 0) {
-               if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-               // if it's NOT a restart, then just bail, so we get corrupt data
-               // rather than no data
-               if (!STBI__RESTART(z->marker)) return 1;
-               stbi__jpeg_reset(z);
+         }
+         return 1;
+      }
+   }
+}
+
+static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+{
+   int i;
+   for (i=0; i < 64; ++i)
+      data[i] *= dequant[i];
+}
+
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{
+   if (z->progressive) {
+      // dequantize and idct the data
+      int i,j,n;
+      for (n=0; n < z->s->img_n; ++n) {
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
             }
          }
       }
    }
-   return 1;
 }
 
 static int stbi__process_marker(stbi__jpeg *z, int m)
@@ -1466,9 +2487,6 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
       case STBI__MARKER_none: // no marker found
          return stbi__err("expected marker","Corrupt JPEG");
 
-      case 0xC2: // stbi__SOF - progressive
-         return stbi__err("progressive jpeg","JPEG format not supported (progressive)");
-
       case 0xDD: // DRI - specify restart interval
          if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
          z->restart_interval = stbi__get16be(z->s);
@@ -1484,10 +2502,6 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
             if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
             for (i=0; i < 64; ++i)
                z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
-            #ifdef STBI_SIMD
-            for (i=0; i < 64; ++i)
-               z->dequant2[t][i] = z->dequant[t][i];
-            #endif
             L -= 65;
          }
          return L==0;
@@ -1515,6 +2529,8 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
             }
             for (i=0; i < n; ++i)
                v[i] = stbi__get8(z->s);
+            if (tc != 0)
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
             L -= n;
          }
          return L==0;
@@ -1527,28 +2543,42 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
    return 0;
 }
 
-// after we see stbi__SOS
+// after we see SOS
 static int stbi__process_scan_header(stbi__jpeg *z)
 {
    int i;
    int Ls = stbi__get16be(z->s);
    z->scan_n = stbi__get8(z->s);
-   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad stbi__SOS component count","Corrupt JPEG");
-   if (Ls != 6+2*z->scan_n) return stbi__err("bad stbi__SOS len","Corrupt JPEG");
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
+   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
    for (i=0; i < z->scan_n; ++i) {
       int id = stbi__get8(z->s), which;
       int q = stbi__get8(z->s);
       for (which = 0; which < z->s->img_n; ++which)
          if (z->img_comp[which].id == id)
             break;
-      if (which == z->s->img_n) return 0;
+      if (which == z->s->img_n) return 0; // no match
       z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
       z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
       z->order[i] = which;
    }
-   if (stbi__get8(z->s) != 0) return stbi__err("bad stbi__SOS","Corrupt JPEG");
-   stbi__get8(z->s); // should be 63, but might be 0
-   if (stbi__get8(z->s) != 0) return stbi__err("bad stbi__SOS","Corrupt JPEG");
+
+   {
+      int aa;
+      z->spec_start = stbi__get8(z->s);
+      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+      aa = stbi__get8(z->s);
+      z->succ_high = (aa >> 4);
+      z->succ_low  = (aa & 15);
+      if (z->progressive) {
+         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+            return stbi__err("bad SOS", "Corrupt JPEG");
+      } else {
+         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         z->spec_end = 63;
+      }
+   }
 
    return 1;
 }
@@ -1557,8 +2587,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 {
    stbi__context *s = z->s;
    int Lf,p,i,q, h_max=1,v_max=1,c;
-   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad stbi__SOF len","Corrupt JPEG"); // JPEG
-   p  = stbi__get8(s);          if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
    c = stbi__get8(s);
@@ -1569,7 +2599,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       z->img_comp[i].linebuf = NULL;
    }
 
-   if (Lf != 8+3*s->img_n) return stbi__err("bad stbi__SOF len","Corrupt JPEG");
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 
    for (i=0; i < s->img_n; ++i) {
       z->img_comp[i].id = stbi__get8(s);
@@ -1582,7 +2612,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
    }
 
-   if (scan != SCAN_load) return 1;
+   if (scan != STBI__SCAN_load) return 1;
 
    if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
 
@@ -1600,64 +2630,78 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 
    for (i=0; i < s->img_n; ++i) {
-      // number of effective pixels (stbi__err.g. for non-interleaved MCU)
+      // number of effective pixels (e.g. for non-interleaved MCU)
       z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
       z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
       // to simplify generation, we'll allocate enough memory to decode
       // the bogus oversized data from using interleaved MCUs and their
-      // big blocks (stbi__err.g. a 16x16 iMCU on an image of width 33); we won't
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       // discard the extra data until colorspace conversion
       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
+      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
+
       if (z->img_comp[i].raw_data == NULL) {
          for(--i; i >= 0; --i) {
-            free(z->img_comp[i].raw_data);
+            STBI_FREE(z->img_comp[i].raw_data);
             z->img_comp[i].data = NULL;
          }
          return stbi__err("outofmem", "Out of memory");
       }
-      // align blocks for installable-idct using mmx/sse
+      // align blocks for idct using mmx/sse
       z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
       z->img_comp[i].linebuf = NULL;
+      if (z->progressive) {
+         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
+         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
+         z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+      } else {
+         z->img_comp[i].coeff = 0;
+         z->img_comp[i].raw_coeff = 0;
+      }
    }
 
    return 1;
 }
 
-// use comparisons since in some cases we handle more than one case (stbi__err.g. stbi__SOF)
+// use comparisons since in some cases we handle more than one case (e.g. SOF)
 #define stbi__DNL(x)         ((x) == 0xdc)
 #define stbi__SOI(x)         ((x) == 0xd8)
 #define stbi__EOI(x)         ((x) == 0xd9)
-#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1)
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
 #define stbi__SOS(x)         ((x) == 0xda)
 
-static int decode_jpeg_header(stbi__jpeg *z, int scan)
+#define stbi__SOF_progressive(x)   ((x) == 0xc2)
+
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 {
    int m;
    z->marker = STBI__MARKER_none; // initialize cached marker to empty
    m = stbi__get_marker(z);
-   if (!stbi__SOI(m)) return stbi__err("no stbi__SOI","Corrupt JPEG");
-   if (scan == SCAN_type) return 1;
+   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type) return 1;
    m = stbi__get_marker(z);
    while (!stbi__SOF(m)) {
       if (!stbi__process_marker(z,m)) return 0;
       m = stbi__get_marker(z);
       while (m == STBI__MARKER_none) {
          // some files have extra padding after their blocks, so ok, we'll scan
-         if (stbi__at_eof(z->s)) return stbi__err("no stbi__SOF", "Corrupt JPEG");
+         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
          m = stbi__get_marker(z);
       }
    }
+   z->progressive = stbi__SOF_progressive(m);
    if (!stbi__process_frame_header(z, scan)) return 0;
    return 1;
 }
 
-static int decode_jpeg_image(stbi__jpeg *j)
+// decode image to YCbCr format
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
 {
    int m;
    j->restart_interval = 0;
-   if (!decode_jpeg_header(j, SCAN_load)) return 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
    m = stbi__get_marker(j);
    while (!stbi__EOI(m)) {
       if (stbi__SOS(m)) {
@@ -1671,7 +2715,7 @@ static int decode_jpeg_image(stbi__jpeg *j)
                   j->marker = stbi__get8(j->s);
                   break;
                } else if (x != 0) {
-                  return 0;
+                  return stbi__err("junk before marker", "Corrupt JPEG");
                }
             }
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
@@ -1681,6 +2725,8 @@ static int decode_jpeg_image(stbi__jpeg *j)
       }
       m = stbi__get_marker(j);
    }
+   if (j->progressive)
+      stbi__jpeg_finish(j);
    return 1;
 }
 
@@ -1764,6 +2810,123 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc
    return out;
 }
 
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // need to generate 2x2 samples for every one in input
+   int i=0,t0,t1;
+
+   if (w == 1) {
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
+
+   t1 = 3*in_near[0] + in_far[0];
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) {
+#if defined(STBI_SSE2)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
+
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
+
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv);
+#elif defined(STBI_NEON)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
+#endif
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
+
+   t0 = t1;
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8);
+
+   for (++i; i < w; ++i) {
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2);
+
+   STBI_NOTUSED(hs);
+
+   return out;
+}
+#endif
+
 static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 {
    // resample with nearest-neighbor
@@ -1775,10 +2938,10 @@ static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_
    return out;
 }
 
+#ifdef STBI_JPEG_OLD
+// this is the same YCbCr-to-RGB calculation that stb_image has used
+// historically before the algorithm changes in 1.49
 #define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
-
-// 0.38 seconds on 3*anemones.jpg   (0.25 with processor = Pro)
-// VC6 without processor=Pro is generating multiple LEAs per multiply!
 static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
 {
    int i;
@@ -1803,16 +2966,196 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       out += step;
    }
 }
+#else
+// this is a reduced-precision calculation of YCbCr-to-RGB introduced
+// to make sure the code produces the same results in both SIMD and scalar
+#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   int i;
+   for (i=0; i < count; ++i) {
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed +  cr* float2fixed(1.40200f);
+      g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                               +   cb* float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+      out += step;
+   }
+}
+#endif
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
+{
+   int i = 0;
+
+#ifdef STBI_SSE2
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+      for (; i+7 < count; i += 8) {
+         // load
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
+
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
+
+         // back to byte, set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
+
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+         // store
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32;
+      }
+   }
+#endif
 
-#ifdef STBI_SIMD
-static stbi_YCbCr_to_RGB_run stbi__YCbCr_installed = stbi__YCbCr_to_RGB_row;
+#ifdef STBI_NEON
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) {
+         // load
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
+
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
+#endif
 
-STBIDEF void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func)
-{
-   stbi__YCbCr_installed = func;
+   for (; i < count; ++i) {
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* float2fixed(1.40200f);
+      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255;
+      out += step;
+   }
 }
 #endif
 
+// set up the kernels
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   j->idct_block_kernel = stbi__idct_block;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
+
+#ifdef STBI_SSE2
+   if (stbi__sse2_available()) {
+      j->idct_block_kernel = stbi__idct_simd;
+      #ifndef STBI_JPEG_OLD
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      #endif
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   }
+#endif
+
+#ifdef STBI_NEON
+   j->idct_block_kernel = stbi__idct_simd;
+   #ifndef STBI_JPEG_OLD
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   #endif
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+#endif
+}
 
 // clean up the temporary component buffers
 static void stbi__cleanup_jpeg(stbi__jpeg *j)
@@ -1820,12 +3163,17 @@ static void stbi__cleanup_jpeg(stbi__jpeg *j)
    int i;
    for (i=0; i < j->s->img_n; ++i) {
       if (j->img_comp[i].raw_data) {
-         free(j->img_comp[i].raw_data);
+         STBI_FREE(j->img_comp[i].raw_data);
          j->img_comp[i].raw_data = NULL;
          j->img_comp[i].data = NULL;
       }
+      if (j->img_comp[i].raw_coeff) {
+         STBI_FREE(j->img_comp[i].raw_coeff);
+         j->img_comp[i].raw_coeff = 0;
+         j->img_comp[i].coeff = 0;
+      }
       if (j->img_comp[i].linebuf) {
-         free(j->img_comp[i].linebuf);
+         STBI_FREE(j->img_comp[i].linebuf);
          j->img_comp[i].linebuf = NULL;
       }
    }
@@ -1836,7 +3184,7 @@ typedef struct
    resample_row_func resample;
    stbi_uc *line0,*line1;
    int hs,vs;   // expansion factor in each axis
-   int w_lores; // horizontal pixels pre-expansion 
+   int w_lores; // horizontal pixels pre-expansion
    int ystep;   // how far through vertical expansion we are
    int ypos;    // which pre-expansion row we're on
 } stbi__resample;
@@ -1849,8 +3197,8 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    // validate req_comp
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
 
-   // load a jpeg image from whichever source
-   if (!decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 
    // determine actual number of components to generate
    n = req_comp ? req_comp : z->s->img_n;
@@ -1874,7 +3222,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
 
          // allocate line buffer big enough for upsampling off the edges
          // with upsample factor of 4
-         z->img_comp[k].linebuf = (stbi_uc *) malloc(z->s->img_x + 3);
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
          if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 
          r->hs      = z->img_h_max / z->img_comp[k].h;
@@ -1887,12 +3235,12 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
          if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
          else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
          else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
-         else if (r->hs == 2 && r->vs == 2) r->resample = stbi__resample_row_hv_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
          else                               r->resample = stbi__resample_row_generic;
       }
 
       // can't error after this so, this is safe
-      output = (stbi_uc *) malloc(n * z->s->img_x * z->s->img_y + 1);
+      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1);
       if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 
       // now go ahead and resample
@@ -1915,11 +3263,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
          if (n >= 3) {
             stbi_uc *y = coutput[0];
             if (z->s->img_n == 3) {
-               #ifdef STBI_SIMD
-               stbi__YCbCr_installed(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               #else
-               stbi__YCbCr_to_RGB_row(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               #endif
+               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
             } else
                for (i=0; i < z->s->img_x; ++i) {
                   out[0] = out[1] = out[2] = y[i];
@@ -1946,6 +3290,7 @@ static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *com
 {
    stbi__jpeg j;
    j.s = s;
+   stbi__setup_jpeg(&j);
    return load_jpeg_image(&j, x,y,comp,req_comp);
 }
 
@@ -1954,14 +3299,15 @@ static int stbi__jpeg_test(stbi__context *s)
    int r;
    stbi__jpeg j;
    j.s = s;
-   r = decode_jpeg_header(&j, SCAN_type);
+   stbi__setup_jpeg(&j);
+   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
    stbi__rewind(s);
    return r;
 }
 
 static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
 {
-   if (!decode_jpeg_header(j, SCAN_header)) {
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
       stbi__rewind( j->s );
       return 0;
    }
@@ -1977,6 +3323,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
    j.s = s;
    return stbi__jpeg_info_raw(&j, x, y, comp);
 }
+#endif
 
 // public domain zlib decode    v0.2  Sean Barrett 2006-11-18
 //    simple implementation
@@ -1985,6 +3332,8 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 //    performance
 //      - fast huffman
 
+#ifndef STBI_NO_ZLIB
+
 // fast-way is faster to check than jpeg huffman, but slow way is slower
 #define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
@@ -1998,7 +3347,7 @@ typedef struct
    int maxcode[17];
    stbi__uint16 firstsymbol[16];
    stbi_uc  size[288];
-   stbi__uint16 value[288]; 
+   stbi__uint16 value[288];
 } stbi__zhuffman;
 
 stbi_inline static int stbi__bitreverse16(int n)
@@ -2014,7 +3363,7 @@ stbi_inline static int stbi__bit_reverse(int v, int bits)
 {
    STBI_ASSERT(bits <= 16);
    // to bit reverse n bits, reverse 16 and shift
-   // stbi__err.g. 11 bits, bit reverse and shift away 5
+   // e.g. 11 bits, bit reverse and shift away 5
    return stbi__bitreverse16(v) >> (16-bits);
 }
 
@@ -2025,8 +3374,8 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
 
    // DEFLATE spec for generating codes
    memset(sizes, 0, sizeof(sizes));
-   memset(z->fast, 255, sizeof(z->fast));
-   for (i=0; i < num; ++i) 
+   memset(z->fast, 0, sizeof(z->fast));
+   for (i=0; i < num; ++i)
       ++sizes[sizelist[i]];
    sizes[0] = 0;
    for (i=1; i < 16; ++i)
@@ -2048,12 +3397,13 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
       int s = sizelist[i];
       if (s) {
          int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
+         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
          z->size [c] = (stbi_uc     ) s;
          z->value[c] = (stbi__uint16) i;
          if (s <= STBI__ZFAST_BITS) {
             int k = stbi__bit_reverse(next_code[s],s);
             while (k < (1 << STBI__ZFAST_BITS)) {
-               z->fast[k] = (stbi__uint16) c;
+               z->fast[k] = fastv;
                k += (1 << s);
             }
          }
@@ -2105,21 +3455,12 @@ stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
    k = z->code_buffer & ((1 << n) - 1);
    z->code_buffer >>= n;
    z->num_bits -= n;
-   return k;   
+   return k;
 }
 
-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
 {
    int b,s,k;
-   if (a->num_bits < 16) stbi__fill_bits(a);
-   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-   if (b < 0xffff) {
-      s = z->size[b];
-      a->code_buffer >>= s;
-      a->num_bits -= s;
-      return z->value[b];
-   }
-
    // not resolved by fast table, so compute it the slow way
    // use jpeg approach, which requires MSbits at top
    k = stbi__bit_reverse(a->code_buffer, 16);
@@ -2135,16 +3476,31 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
    return z->value[b];
 }
 
-static int stbi__zexpand(stbi__zbuf *z, int n)  // need to make room for n bytes
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s;
+   if (a->num_bits < 16) stbi__fill_bits(a);
+   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+   if (b) {
+      s = b >> 9;
+      a->code_buffer >>= s;
+      a->num_bits -= s;
+      return b & 511;
+   }
+   return stbi__zhuffman_decode_slowpath(a, z);
+}
+
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 {
    char *q;
    int cur, limit;
+   z->zout = zout;
    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
    cur   = (int) (z->zout     - z->zout_start);
    limit = (int) (z->zout_end - z->zout_start);
    while (cur + n > limit)
       limit *= 2;
-   q = (char *) realloc(z->zout_start, limit);
+   q = (char *) STBI_REALLOC(z->zout_start, limit);
    if (q == NULL) return stbi__err("outofmem", "Out of memory");
    z->zout_start = q;
    z->zout       = q + cur;
@@ -2157,7 +3513,7 @@ static int stbi__zlength_base[31] = {
    15,17,19,23,27,31,35,43,51,59,
    67,83,99,115,131,163,195,227,258,0,0 };
 
-static int stbi__zlength_extra[31]= 
+static int stbi__zlength_extra[31]=
 { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
 
 static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
@@ -2168,16 +3524,23 @@ static int stbi__zdist_extra[32] =
 
 static int stbi__parse_huffman_block(stbi__zbuf *a)
 {
+   char *zout = a->zout;
    for(;;) {
       int z = stbi__zhuffman_decode(a, &a->z_length);
       if (z < 256) {
          if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
-         if (a->zout >= a->zout_end) if (!stbi__zexpand(a, 1)) return 0;
-         *a->zout++ = (char) z;
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout;
+         }
+         *zout++ = (char) z;
       } else {
          stbi_uc *p;
          int len,dist;
-         if (z == 256) return 1;
+         if (z == 256) {
+            a->zout = zout;
+            return 1;
+         }
          z -= 257;
          len = stbi__zlength_base[z];
          if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
@@ -2185,11 +3548,18 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
          if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
          dist = stbi__zdist_base[z];
          if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-         if (a->zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
-         if (a->zout + len > a->zout_end) if (!stbi__zexpand(a, len)) return 0;
-         p = (stbi_uc *) (a->zout - dist);
-         while (len--)
-            *a->zout++ = *p++;
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
+         if (zout + len > a->zout_end) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout;
+         }
+         p = (stbi_uc *) (zout - dist);
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            do *zout++ = v; while (--len);
+         } else {
+            do *zout++ = *p++; while (--len);
+         }
       }
    }
 }
@@ -2262,7 +3632,7 @@ static int stbi__parse_uncomperssed_block(stbi__zbuf *a)
    if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
    if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
    if (a->zout + len > a->zout_end)
-      if (!stbi__zexpand(a, len)) return 0;
+      if (!stbi__zexpand(a, a->zout, len)) return 0;
    memcpy(a->zout, a->zbuffer, len);
    a->zbuffer += len;
    a->zout += len;
@@ -2337,7 +3707,7 @@ static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse
 STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
 {
    stbi__zbuf a;
-   char *p = (char *) malloc(initial_size);
+   char *p = (char *) stbi__malloc(initial_size);
    if (p == NULL) return NULL;
    a.zbuffer = (stbi_uc *) buffer;
    a.zbuffer_end = (stbi_uc *) buffer + len;
@@ -2345,7 +3715,7 @@ STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int
       if (outlen) *outlen = (int) (a.zout - a.zout_start);
       return a.zout_start;
    } else {
-      free(a.zout_start);
+      STBI_FREE(a.zout_start);
       return NULL;
    }
 }
@@ -2358,7 +3728,7 @@ STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
 STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
 {
    stbi__zbuf a;
-   char *p = (char *) malloc(initial_size);
+   char *p = (char *) stbi__malloc(initial_size);
    if (p == NULL) return NULL;
    a.zbuffer = (stbi_uc *) buffer;
    a.zbuffer_end = (stbi_uc *) buffer + len;
@@ -2366,7 +3736,7 @@ STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, i
       if (outlen) *outlen = (int) (a.zout - a.zout_start);
       return a.zout_start;
    } else {
-      free(a.zout_start);
+      STBI_FREE(a.zout_start);
       return NULL;
    }
 }
@@ -2385,7 +3755,7 @@ STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer
 STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
 {
    stbi__zbuf a;
-   char *p = (char *) malloc(16384);
+   char *p = (char *) stbi__malloc(16384);
    if (p == NULL) return NULL;
    a.zbuffer = (stbi_uc *) buffer;
    a.zbuffer_end = (stbi_uc *) buffer+len;
@@ -2393,7 +3763,7 @@ STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int
       if (outlen) *outlen = (int) (a.zout - a.zout_start);
       return a.zout_start;
    } else {
-      free(a.zout_start);
+      STBI_FREE(a.zout_start);
       return NULL;
    }
 }
@@ -2408,6 +3778,7 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char
    else
       return -1;
 }
+#endif
 
 // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
 //    simple implementation
@@ -2419,15 +3790,13 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char
 //    performance
 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
 
-
+#ifndef STBI_NO_PNG
 typedef struct
 {
    stbi__uint32 length;
    stbi__uint32 type;
 } stbi__pngchunk;
 
-#define PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
-
 static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
 {
    stbi__pngchunk c;
@@ -2453,13 +3822,23 @@ typedef struct
 
 
 enum {
-   STBI__F_none=0, STBI__F_sub=1, STBI__F_up=2, STBI__F_avg=3, STBI__F_paeth=4,
-   STBI__F_avg_first, STBI__F_paeth_first
+   STBI__F_none=0,
+   STBI__F_sub=1,
+   STBI__F_up=2,
+   STBI__F_avg=3,
+   STBI__F_paeth=4,
+   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first,
+   STBI__F_paeth_first
 };
 
 static stbi_uc first_row_filter[5] =
 {
-   STBI__F_none, STBI__F_sub, STBI__F_none, STBI__F_avg_first, STBI__F_paeth_first
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none,
+   STBI__F_avg_first,
+   STBI__F_paeth_first
 };
 
 static int stbi__paeth(int a, int b, int c)
@@ -2473,32 +3852,50 @@ static int stbi__paeth(int a, int b, int c)
    return c;
 }
 
-#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
+static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 
 // create the png data from post-deflated data
-static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y)
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
 {
    stbi__context *s = a->s;
    stbi__uint32 i,j,stride = x*out_n;
+   stbi__uint32 img_len, img_width_bytes;
    int k;
    int img_n = s->img_n; // copy it into a local for later
+
    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) malloc(x * y * out_n);
+   a->out = (stbi_uc *) stbi__malloc(x * y * out_n); // extra bytes to write off the end into
    if (!a->out) return stbi__err("outofmem", "Out of memory");
+
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+   img_len = (img_width_bytes + 1) * y;
    if (s->img_x == x && s->img_y == y) {
-      if (raw_len != (img_n * x + 1) * y) return stbi__err("not enough pixels","Corrupt PNG");
+      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
    } else { // interlaced:
-      if (raw_len < (img_n * x + 1) * y) return stbi__err("not enough pixels","Corrupt PNG");
+      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
    }
+
    for (j=0; j < y; ++j) {
       stbi_uc *cur = a->out + stride*j;
       stbi_uc *prior = cur - stride;
       int filter = *raw++;
-      if (filter > 4) return stbi__err("invalid filter","Corrupt PNG");
+      int filter_bytes = img_n;
+      int width = x;
+      if (filter > 4)
+         return stbi__err("invalid filter","Corrupt PNG");
+
+      if (depth < 8) {
+         STBI_ASSERT(img_width_bytes <= x);
+         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+         filter_bytes = 1;
+         width = img_width_bytes;
+      }
+
       // if first row, use special filter that doesn't sample previous row
       if (j == 0) filter = first_row_filter[filter];
-      // handle first pixel explicitly
-      for (k=0; k < img_n; ++k) {
+
+      // handle first byte explicitly
+      for (k=0; k < filter_bytes; ++k) {
          switch (filter) {
             case STBI__F_none       : cur[k] = raw[k]; break;
             case STBI__F_sub        : cur[k] = raw[k]; break;
@@ -2509,26 +3906,37 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
             case STBI__F_paeth_first: cur[k] = raw[k]; break;
          }
       }
-      if (img_n != out_n) cur[img_n] = 255;
-      raw += img_n;
-      cur += out_n;
-      prior += out_n;
+
+      if (depth == 8) {
+         if (img_n != out_n)
+            cur[img_n] = 255; // first pixel
+         raw += img_n;
+         cur += out_n;
+         prior += out_n;
+      } else {
+         raw += 1;
+         cur += 1;
+         prior += 1;
+      }
+
       // this is a little gross, so that we don't switch per-pixel or per-component
-      if (img_n == out_n) {
+      if (depth < 8 || img_n == out_n) {
+         int nk = (width - 1)*img_n;
          #define CASE(f) \
              case f:     \
-                for (i=x-1; i >= 1; --i, raw+=img_n,cur+=img_n,prior+=img_n) \
-                   for (k=0; k < img_n; ++k)
+                for (k=0; k < nk; ++k)
          switch (filter) {
-            CASE(STBI__F_none)         cur[k] = raw[k]; break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-img_n]); break;
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:         memcpy(cur, raw, nk); break;
+            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
             CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-img_n])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-img_n],prior[k],prior[k-img_n])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-img_n] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-img_n],0,0)); break;
+            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
+            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
+            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
+            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
          }
          #undef CASE
+         raw += nk;
       } else {
          STBI_ASSERT(img_n+1 == out_n);
          #define CASE(f) \
@@ -2547,18 +3955,93 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          #undef CASE
       }
    }
+
+   // we make a separate pass to expand bits to pixels; for performance,
+   // this could run two scanlines behind the above code, so it won't
+   // intefere with filtering but will still be in the cache.
+   if (depth < 8) {
+      for (j=0; j < y; ++j) {
+         stbi_uc *cur = a->out + stride*j;
+         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
+         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+         // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+         // note that the final byte might overshoot and write more data than desired.
+         // we can allocate enough data that this never writes out of memory, but it
+         // could also overwrite the next scanline. can it overwrite non-empty data
+         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+         // so we need to explicitly clamp the final ones
+
+         if (depth == 4) {
+            for (k=x*img_n; k >= 2; k-=2, ++in) {
+               *cur++ = scale * ((*in >> 4)       );
+               *cur++ = scale * ((*in     ) & 0x0f);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 4)       );
+         } else if (depth == 2) {
+            for (k=x*img_n; k >= 4; k-=4, ++in) {
+               *cur++ = scale * ((*in >> 6)       );
+               *cur++ = scale * ((*in >> 4) & 0x03);
+               *cur++ = scale * ((*in >> 2) & 0x03);
+               *cur++ = scale * ((*in     ) & 0x03);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 6)       );
+            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+         } else if (depth == 1) {
+            for (k=x*img_n; k >= 8; k-=8, ++in) {
+               *cur++ = scale * ((*in >> 7)       );
+               *cur++ = scale * ((*in >> 6) & 0x01);
+               *cur++ = scale * ((*in >> 5) & 0x01);
+               *cur++ = scale * ((*in >> 4) & 0x01);
+               *cur++ = scale * ((*in >> 3) & 0x01);
+               *cur++ = scale * ((*in >> 2) & 0x01);
+               *cur++ = scale * ((*in >> 1) & 0x01);
+               *cur++ = scale * ((*in     ) & 0x01);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 7)       );
+            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+         }
+         if (img_n != out_n) {
+            // insert alpha = 255
+            stbi_uc *cur = a->out + stride*j;
+            int i;
+            if (img_n == 1) {
+               for (i=x-1; i >= 0; --i) {
+                  cur[i*2+1] = 255;
+                  cur[i*2+0] = cur[i];
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (i=x-1; i >= 0; --i) {
+                  cur[i*4+3] = 255;
+                  cur[i*4+2] = cur[i*3+2];
+                  cur[i*4+1] = cur[i*3+1];
+                  cur[i*4+0] = cur[i*3+0];
+               }
+            }
+         }
+      }
+   }
+
    return 1;
 }
 
-static int stbi__create_png_image(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, int interlaced)
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
 {
    stbi_uc *final;
    int p;
    if (!interlaced)
-      return stbi__create_png_image_raw(a, raw, raw_len, out_n, a->s->img_x, a->s->img_y);
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
    // de-interlacing
-   final = (stbi_uc *) malloc(a->s->img_x * a->s->img_y * out_n);
+   final = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_n);
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -2569,17 +4052,22 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_l
       x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
       y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
       if (x && y) {
-         if (!stbi__create_png_image_raw(a, raw, raw_len, out_n, x, y)) {
-            free(final);
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(final);
             return 0;
          }
-         for (j=0; j < y; ++j)
-            for (i=0; i < x; ++i)
-               memcpy(final + (j*yspc[p]+yorig[p])*a->s->img_x*out_n + (i*xspc[p]+xorig[p])*out_n,
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(final + out_y*a->s->img_x*out_n + out_x*out_n,
                       a->out + (j*x+i)*out_n, out_n);
-         free(a->out);
-         raw += (x*out_n+1)*y;
-         raw_len -= (x*out_n+1)*y;
+            }
+         }
+         STBI_FREE(a->out);
+         image_data += img_len;
+         image_data_len -= img_len;
       }
    }
    a->out = final;
@@ -2617,7 +4105,7 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
    stbi_uc *p, *temp_out, *orig = a->out;
 
-   p = (stbi_uc *) malloc(pixel_count * pal_img_n);
+   p = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
    if (p == NULL) return stbi__err("outofmem", "Out of memory");
 
    // between here and free(out) below, exitting would leak
@@ -2641,7 +4129,7 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
          p += 4;
       }
    }
-   free(a->out);
+   STBI_FREE(a->out);
    a->out = temp_out;
 
    STBI_NOTUSED(len);
@@ -2689,7 +4177,7 @@ static void stbi__de_iphone(stbi__png *z)
             } else {
                p[0] = p[2];
                p[2] = t;
-            } 
+            }
             p += 4;
          }
       } else {
@@ -2704,12 +4192,14 @@ static void stbi__de_iphone(stbi__png *z)
    }
 }
 
+#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+
 static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 {
    stbi_uc palette[1024], pal_img_n=0;
    stbi_uc has_trans=0, tc[3];
    stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, is_iphone=0;
+   int first=1,k,interlace=0, color=0, depth=0, is_iphone=0;
    stbi__context *s = z->s;
 
    z->expanded = NULL;
@@ -2718,23 +4208,23 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 
    if (!stbi__check_png_header(s)) return 0;
 
-   if (scan == SCAN_type) return 1;
+   if (scan == STBI__SCAN_type) return 1;
 
    for (;;) {
       stbi__pngchunk c = stbi__get_chunk_header(s);
       switch (c.type) {
-         case PNG_TYPE('C','g','B','I'):
+         case STBI__PNG_TYPE('C','g','B','I'):
             is_iphone = 1;
             stbi__skip(s, c.length);
             break;
-         case PNG_TYPE('I','H','D','R'): {
-            int depth,color,comp,filter;
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
             if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
             first = 0;
             if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
             s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
             s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            depth = stbi__get8(s);  if (depth != 8)        return stbi__err("8bit only","PNG not supported: 8-bit only");
+            depth = stbi__get8(s);  if (depth != 1 && depth != 2 && depth != 4 && depth != 8)  return stbi__err("1/2/4/8-bit only","PNG not supported: 1/2/4/8-bit only");
             color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
             if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
             comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
@@ -2744,7 +4234,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (!pal_img_n) {
                s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
                if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-               if (scan == SCAN_header) return 1;
+               if (scan == STBI__SCAN_header) return 1;
             } else {
                // if paletted, then pal_n is our final components, and
                // img_n is # components to decompress/filter.
@@ -2755,7 +4245,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             break;
          }
 
-         case PNG_TYPE('P','L','T','E'):  {
+         case STBI__PNG_TYPE('P','L','T','E'):  {
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
             pal_len = c.length / 3;
@@ -2769,11 +4259,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             break;
          }
 
-         case PNG_TYPE('t','R','N','S'): {
+         case STBI__PNG_TYPE('t','R','N','S'): {
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
             if (pal_img_n) {
-               if (scan == SCAN_header) { s->img_n = 4; return 1; }
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
                if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
                if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
                pal_img_n = 4;
@@ -2784,21 +4274,21 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
                has_trans = 1;
                for (k=0; k < s->img_n; ++k)
-                  tc[k] = (stbi_uc) (stbi__get16be(s) & 255); // non 8-bit images will be larger
+                  tc[k] = (stbi_uc) (stbi__get16be(s) & 255) * stbi__depth_scale_table[depth]; // non 8-bit images will be larger
             }
             break;
          }
 
-         case PNG_TYPE('I','D','A','T'): {
+         case STBI__PNG_TYPE('I','D','A','T'): {
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == SCAN_header) { s->img_n = pal_img_n; return 1; }
+            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
             if (ioff + c.length > idata_limit) {
                stbi_uc *p;
                if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
                while (ioff + c.length > idata_limit)
                   idata_limit *= 2;
-               p = (stbi_uc *) realloc(z->idata, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               p = (stbi_uc *) STBI_REALLOC(z->idata, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
                z->idata = p;
             }
             if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
@@ -2806,19 +4296,22 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             break;
          }
 
-         case PNG_TYPE('I','E','N','D'): {
-            stbi__uint32 raw_len;
+         case STBI__PNG_TYPE('I','E','N','D'): {
+            stbi__uint32 raw_len, bpl;
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (scan != SCAN_load) return 1;
+            if (scan != STBI__SCAN_load) return 1;
             if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
-            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, 16384, (int *) &raw_len, !is_iphone);
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
             if (z->expanded == NULL) return 0; // zlib should set error
-            free(z->idata); z->idata = NULL;
+            STBI_FREE(z->idata); z->idata = NULL;
             if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
                s->img_out_n = s->img_n+1;
             else
                s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, interlace)) return 0;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, depth, color, interlace)) return 0;
             if (has_trans)
                if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
             if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
@@ -2831,7 +4324,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
                   return 0;
             }
-            free(z->expanded); z->expanded = NULL;
+            STBI_FREE(z->expanded); z->expanded = NULL;
             return 1;
          }
 
@@ -2861,7 +4354,7 @@ static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req
 {
    unsigned char *result=NULL;
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-   if (stbi__parse_png_file(p, SCAN_load, req_comp)) {
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
       result = p->out;
       p->out = NULL;
       if (req_comp && req_comp != p->s->img_out_n) {
@@ -2871,11 +4364,11 @@ static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req
       }
       *x = p->s->img_x;
       *y = p->s->img_y;
-      if (n) *n = p->s->img_n;
+      if (n) *n = p->s->img_out_n;
    }
-   free(p->out);      p->out      = NULL;
-   free(p->expanded); p->expanded = NULL;
-   free(p->idata);    p->idata    = NULL;
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
 
    return result;
 }
@@ -2897,7 +4390,7 @@ static int stbi__png_test(stbi__context *s)
 
 static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
 {
-   if (!stbi__parse_png_file(p, SCAN_header, 0)) {
+   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
       stbi__rewind( p->s );
       return 0;
    }
@@ -2913,8 +4406,11 @@ static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
    p.s = s;
    return stbi__png_info_raw(&p, x, y, comp);
 }
+#endif
 
 // Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
 static int stbi__bmp_test_raw(stbi__context *s)
 {
    int r;
@@ -3073,11 +4569,11 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
       target = req_comp;
    else
       target = s->img_n; // if they want monochrome, we'll post-convert
-   out = (stbi_uc *) malloc(target * s->img_x * s->img_y);
+   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    if (bpp < 16) {
       int z=0;
-      if (psize == 0 || psize > 256) { free(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
       for (i=0; i < psize; ++i) {
          pal[i][2] = stbi__get8(s);
          pal[i][1] = stbi__get8(s);
@@ -3088,7 +4584,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
       stbi__skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
       if (bpp == 4) width = (s->img_x + 1) >> 1;
       else if (bpp == 8) width = s->img_x;
-      else { free(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
       pad = (-width)&3;
       for (j=0; j < (int) s->img_y; ++j) {
          for (i=0; i < (int) s->img_x; i += 2) {
@@ -3126,7 +4622,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             easy = 2;
       }
       if (!easy) {
-         if (!mr || !mg || !mb) { free(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
          // right shift amt to put high bit in position #7
          rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
          gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
@@ -3152,7 +4648,7 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
                a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
-               if (target == 4) out[z++] = STBI__BYTECAST(a); 
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
             }
          }
          stbi__skip(s, pad);
@@ -3179,10 +4675,11 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
    if (comp) *comp = s->img_n;
    return out;
 }
+#endif
 
 // Targa Truevision - TGA
 // by Jonathan Dummer
-
+#ifndef STBI_NO_TGA
 static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
 {
     int tga_w, tga_h, tga_comp;
@@ -3302,7 +4799,7 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    *y = tga_height;
    if (comp) *comp = tga_comp;
 
-   tga_data = (unsigned char*)malloc( tga_width * tga_height * tga_comp );
+   tga_data = (unsigned char*)stbi__malloc( tga_width * tga_height * tga_comp );
    if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
 
    // skip to the data's starting position (offset usually = 0)
@@ -3321,14 +4818,14 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
          //   any data to skip? (offset usually = 0)
          stbi__skip(s, tga_palette_start );
          //   load the palette
-         tga_palette = (unsigned char*)malloc( tga_palette_len * tga_palette_bits / 8 );
+         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_palette_bits / 8 );
          if (!tga_palette) {
-            free(tga_data);
+            STBI_FREE(tga_data);
             return stbi__errpuc("outofmem", "Out of memory");
          }
          if (!stbi__getn(s, tga_palette, tga_palette_len * tga_palette_bits / 8 )) {
-            free(tga_data);
-            free(tga_palette);
+            STBI_FREE(tga_data);
+            STBI_FREE(tga_palette);
             return stbi__errpuc("bad palette", "Corrupt TGA");
          }
       }
@@ -3410,7 +4907,7 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       //   clear my palette, if I had one
       if ( tga_palette != NULL )
       {
-         free( tga_palette );
+         STBI_FREE( tga_palette );
       }
    }
 
@@ -3438,10 +4935,12 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    //   OK, done
    return tga_data;
 }
+#endif
 
 // *************************************************************************************************
 // Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
 
+#ifndef STBI_NO_PSD
 static int stbi__psd_test(stbi__context *s)
 {
    int r = (stbi__get32be(s) == 0x38425053);
@@ -3476,7 +4975,7 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    // Read the rows and columns of the image.
    h = stbi__get32be(s);
    w = stbi__get32be(s);
-   
+
    // Make sure the depth is 8 bits.
    if (stbi__get16be(s) != 8)
       return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 bit");
@@ -3512,13 +5011,13 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
       return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
    // Create the destination image.
-   out = (stbi_uc *) malloc(4 * w*h);
+   out = (stbi_uc *) stbi__malloc(4 * w*h);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    pixelCount = w*h;
 
    // Initialize the data to zero.
    //memset( out, 0, pixelCount * 4 );
-   
+
    // Finally, the image data.
    if (compression) {
       // RLE as used by .PSD and .TIFF
@@ -3536,7 +5035,7 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
       // Read the RLE data by channel.
       for (channel = 0; channel < 4; channel++) {
          stbi_uc *p;
-         
+
          p = out+channel;
          if (channel >= channelCount) {
             // Fill this channel with default data.
@@ -3574,15 +5073,15 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
             }
          }
       }
-      
+
    } else {
       // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
       // where each channel consists of an 8-bit value for each pixel in the image.
-      
+
       // Read the data by channel.
       for (channel = 0; channel < 4; channel++) {
          stbi_uc *p;
-         
+
          p = out + channel;
          if (channel > channelCount) {
             // Fill this channel with default data.
@@ -3603,9 +5102,10 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    if (comp) *comp = channelCount;
    *y = h;
    *x = w;
-   
+
    return out;
 }
+#endif
 
 // *************************************************************************************************
 // Softimage PIC loader
@@ -3614,6 +5114,7 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
 // See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
 // See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
 
+#ifndef STBI_NO_PIC
 static int stbi__pic_is4(stbi__context *s,const char *str)
 {
    int i;
@@ -3797,11 +5298,11 @@ static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int re
    stbi__get16be(s); //skip `pad'
 
    // intermediate buffer is RGBA
-   result = (stbi_uc *) malloc(x*y*4);
+   result = (stbi_uc *) stbi__malloc(x*y*4);
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
-      free(result);
+      STBI_FREE(result);
       result=0;
    }
    *px = x;
@@ -3818,10 +5319,13 @@ static int stbi__pic_test(stbi__context *s)
    stbi__rewind(s);
    return r;
 }
+#endif
 
 // *************************************************************************************************
 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
-typedef struct 
+
+#ifndef STBI_NO_GIF
+typedef struct
 {
    stbi__int16 prefix;
    stbi_uc first;
@@ -3869,8 +5373,8 @@ static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], in
       pal[i][2] = stbi__get8(s);
       pal[i][1] = stbi__get8(s);
       pal[i][0] = stbi__get8(s);
-      pal[i][3] = transp ? 0 : 255;
-   }   
+      pal[i][3] = transp == i ? 0 : 255;
+   }
 }
 
 static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
@@ -3881,8 +5385,8 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 
    version = stbi__get8(s);
    if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
-   if (stbi__get8(s) != 'a')                      return stbi__err("not GIF", "Corrupt GIF");
- 
+   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
+
    stbi__g_failure_reason = "";
    g->w = stbi__get16le(s);
    g->h = stbi__get16le(s);
@@ -3903,7 +5407,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__gif g;   
+   stbi__gif g;
    if (!stbi__gif_header(s, &g, comp, 1)) {
       stbi__rewind( s );
       return 0;
@@ -3923,7 +5427,7 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
       stbi__out_gif_code(g, g->codes[code].prefix);
 
    if (g->cur_y >= g->max_y) return;
-  
+
    p = &g->out[g->cur_x + g->cur_y];
    c = &g->color_table[g->codes[code].suffix * 4];
 
@@ -3977,7 +5481,7 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
       if (valid_bits < codesize) {
          if (len == 0) {
             len = stbi__get8(s); // start new block
-            if (len == 0) 
+            if (len == 0)
                return g->out;
          }
          --len;
@@ -4022,7 +5526,7 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
          } else {
             return stbi__errpuc("illegal code in raster", "Corrupt GIF");
          }
-      } 
+      }
    }
 }
 
@@ -4048,19 +5552,19 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
 
    if (g->out == 0) {
       if (!stbi__gif_header(s, g, comp,0))     return 0; // stbi__g_failure_reason set by stbi__gif_header
-      g->out = (stbi_uc *) malloc(4 * g->w * g->h);
+      g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
       if (g->out == 0)                      return stbi__errpuc("outofmem", "Out of memory");
       stbi__fill_gif_background(g);
    } else {
       // animated-gif-only path
       if (((g->eflags & 0x1C) >> 2) == 3) {
          old_out = g->out;
-         g->out = (stbi_uc *) malloc(4 * g->w * g->h);
+         g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
          if (g->out == 0)                   return stbi__errpuc("outofmem", "Out of memory");
          memcpy(g->out, old_out, g->w*g->h*4);
       }
    }
-    
+
    for (;;) {
       switch (stbi__get8(s)) {
          case 0x2C: /* Image Descriptor */
@@ -4095,16 +5599,16 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
 
             if (g->lflags & 0x80) {
                stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
-               g->color_table = (stbi_uc *) g->lpal;       
+               g->color_table = (stbi_uc *) g->lpal;
             } else if (g->flags & 0x80) {
                for (i=0; i < 256; ++i)  // @OPTIMIZE: stbi__jpeg_reset only the previous transparent
-                  g->pal[i][3] = 255; 
+                  g->pal[i][3] = 255;
                if (g->transparent >= 0 && (g->eflags & 0x01))
                   g->pal[g->transparent][3] = 0;
                g->color_table = (stbi_uc *) g->pal;
             } else
                return stbi__errpuc("missing color table", "Corrupt GIF");
-   
+
             o = stbi__process_gif_raster(s, g);
             if (o == NULL) return NULL;
 
@@ -4161,7 +5665,7 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
 {
    return stbi__gif_info_raw(s,x,y,comp);
 }
-
+#endif
 
 // *************************************************************************************************
 // Radiance RGBE HDR loader
@@ -4250,7 +5754,7 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    // Check identifier
    if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
       return stbi__errpf("not HDR", "Corrupt HDR image");
-   
+
    // Parse header
    for(;;) {
       token = stbi__hdr_gettoken(s,buffer);
@@ -4278,7 +5782,7 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    if (req_comp == 0) req_comp = 3;
 
    // Read data
-   hdr_data = (float *) malloc(height * width * req_comp * sizeof(float));
+   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
 
    // Load image data
    // image data is stored as some number of sca
@@ -4311,14 +5815,14 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
             stbi__hdr_convert(hdr_data, rgbe, req_comp);
             i = 1;
             j = 0;
-            free(scanline);
+            STBI_FREE(scanline);
             goto main_decode_loop; // yes, this makes no sense
          }
          len <<= 8;
          len |= stbi__get8(s);
-         if (len != width) { free(hdr_data); free(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) scanline = (stbi_uc *) malloc(width * 4);
-            
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) scanline = (stbi_uc *) stbi__malloc(width * 4);
+
          for (k = 0; k < 4; ++k) {
             i = 0;
             while (i < width) {
@@ -4339,7 +5843,7 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          for (i=0; i < width; ++i)
             stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
       }
-      free(scanline);
+      STBI_FREE(scanline);
    }
 
    return hdr_data;
@@ -4385,6 +5889,7 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
 }
 #endif // STBI_NO_HDR
 
+#ifndef STBI_NO_BMP
 static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 {
    int hsz;
@@ -4412,7 +5917,9 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
    *comp = stbi__get16le(s) / 8;
    return 1;
 }
+#endif
 
+#ifndef STBI_NO_PSD
 static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
 {
    int channelCount;
@@ -4443,7 +5950,9 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    *comp = 4;
    return 1;
 }
+#endif
 
+#ifndef STBI_NO_PIC
 static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 {
    int act_comp=0,num_packets=0,chained;
@@ -4488,28 +5997,156 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 
    return 1;
 }
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    Does not support comments in the header section
+//    Does not support ASCII image data (formats P2 and P3)
+//    Does not support 16-bit-per-channel
+
+#ifndef STBI_NO_PNM
+
+static int      stbi__pnm_test(stbi__context *s)
+{
+   char p, t;
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1;
+}
+
+static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi_uc *out;
+   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+      return 0;
+   *x = s->img_x;
+   *y = s->img_y;
+   *comp = s->img_n;
+
+   out = (stbi_uc *) stbi__malloc(s->img_n * s->img_x * s->img_y);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+
+   if (req_comp && req_comp != s->img_n) {
+      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+   return out;
+}
+
+static int      stbi__pnm_isspace(char c)
+{
+   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
+}
+
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+      *c = (char) stbi__get8(s);
+}
+
+static int      stbi__pnm_isdigit(char c)
+{
+   return c >= '0' && c <= '9';
+}
+
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   int value = 0;
+
+   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
+      value = value*10 + (*c - '0');
+      *c = (char) stbi__get8(s);
+   }
+
+   return value;
+}
+
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv;
+   char c, p, t;
+
+   stbi__rewind( s );
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   c = (char) stbi__get8(s);
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+
+   if (maxv > 255)
+      return stbi__err("max value > 255", "PPM image not 8-bit");
+   else
+      return 1;
+}
+#endif
 
 static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
 {
-   if (stbi__jpeg_info(s, x, y, comp))
-       return 1;
-   if (stbi__png_info(s, x, y, comp))
-       return 1;
-   if (stbi__gif_info(s, x, y, comp))
-       return 1;
-   if (stbi__bmp_info(s, x, y, comp))
-       return 1;
-   if (stbi__psd_info(s, x, y, comp))
-       return 1;
-   if (stbi__pic_info(s, x, y, comp))
-       return 1;
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp)) return 1;
+   #endif
+
+   #ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))  return 1;
+   #endif
+
    #ifndef STBI_NO_HDR
-   if (stbi__hdr_info(s, x, y, comp))
-       return 1;
+   if (stbi__hdr_info(s, x, y, comp))  return 1;
    #endif
+
    // test tga last because it's a crappy test!
+   #ifndef STBI_NO_TGA
    if (stbi__tga_info(s, x, y, comp))
        return 1;
+   #endif
    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
 }
 
@@ -4552,122 +6189,138 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
 
 #endif // STB_IMAGE_IMPLEMENTATION
 
-#if !defined(STBI_NO_STDIO) && defined(_MSC_VER) && _MSC_VER >= 1400
-#pragma warning(pop)
-#endif
-
-
 /*
    revision history:
-      1.42 (2014-07-09)
-             don't define _CRT_SECURE_NO_WARNINGS (affects user code)
-             fixes to stbi__cleanup_jpeg path
-             added STBI_ASSERT to avoid requiring assert.h
-      1.41 (2014-06-25)
-             fix search&replace from 1.36 that messed up comments/error messages
-      1.40 (2014-06-22)
-             fix gcc struct-initialization warning
-      1.39 (2014-06-15)
-             fix to TGA optimization when req_comp != number of components in TGA;
-             fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
-             add support for BMP version 5 (more ignored fields)
-      1.38 (2014-06-06)
-             suppress MSVC warnings on integer casts truncating values
-             fix accidental rename of 'skip' field of I/O
-      1.37 (2014-06-04)
-             remove duplicate typedef
-      1.36 (2014-06-03)
-             convert to header file single-file library
-             if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
-      1.35 (2014-05-27)
-             various warnings
-             fix broken STBI_SIMD path
-             fix bug where stbi_load_from_file no longer left file pointer in correct place
-             fix broken non-easy path for 32-bit BMP (possibly never used)
-             TGA optimization by Arseny Kapoulkine
-      1.34 (unknown)
-             use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
-      1.33 (2011-07-14)
-             make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
-      1.32 (2011-07-13)
-             support for "info" function for all supported filetypes (SpartanJ)
-      1.31 (2011-06-20)
-             a few more leak fixes, bug in PNG handling (SpartanJ)
-      1.30 (2011-06-11)
-             added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
-             removed deprecated format-specific test/load functions
-             removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
-             error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
-             fix inefficiency in decoding 32-bit BMP (David Woo)
-      1.29 (2010-08-16)
-             various warning fixes from Aurelien Pocheville 
-      1.28 (2010-08-01)
-             fix bug in GIF palette transparency (SpartanJ)
-      1.27 (2010-08-01)
-             cast-to-stbi_uc to fix warnings
-      1.26 (2010-07-24)
-             fix bug in file buffering for PNG reported by SpartanJ
-      1.25 (2010-07-17)
-             refix trans_data warning (Won Chun)
-      1.24 (2010-07-12)
-             perf improvements reading from files on platforms with lock-heavy fgetc()
-             minor perf improvements for jpeg
-             deprecated type-specific functions so we'll get feedback if they're needed
-             attempt to fix trans_data warning (Won Chun)
-      1.23   fixed bug in iPhone support
-      1.22 (2010-07-10)
-             removed image *writing* support
-             stbi_info support from Jetro Lauha
-             GIF support from Jean-Marc Lienher
-             iPhone PNG-extensions from James Brown
-             warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
-      1.21   fix use of 'stbi_uc' in header (reported by jon blow)
-      1.20   added support for Softimage PIC, by Tom Seddon
-      1.19   bug in interlaced PNG corruption check (found by ryg)
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
       1.18 2008-08-02
-             fix a threading bug (local mutable static)
-      1.17   support interlaced PNG
-      1.16   major bugfix - stbi__convert_format converted one too many pixels
-      1.15   initialize some fields for thread safety
-      1.14   fix threadsafe conversion bug
-             header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
-      1.13   threadsafe
-      1.12   const qualifiers in the API
-      1.11   Support installable IDCT, colorspace conversion routines
-      1.10   Fixes for 64-bit (don't use "unsigned long")
-             optimized upsampling by Fabian "ryg" Giesen
-      1.09   Fix format-conversion for PSD code (bad global variables!)
-      1.08   Thatcher Ulrich's PSD code integrated by Nicolas Schulz
-      1.07   attempt to fix C++ warning/errors again
-      1.06   attempt to fix C++ warning/errors again
-      1.05   fix TGA loading to return correct *comp and use good luminance calc
-      1.04   default float alpha is 1, not 255; use 'void *' for stbi_image_free
-      1.03   bugfixes to STBI_NO_STDIO, STBI_NO_HDR
-      1.02   support for (subset of) HDR files, float interface for preferred access to them
-      1.01   fix bug: possible bug in handling right-side up bmps... not sure
-             fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
-      1.00   interface to zlib that skips zlib header
-      0.99   correct handling of alpha in palette
-      0.98   TGA loader by lonesock; dynamically add loaders (untested)
-      0.97   jpeg errors on too large a file; also catch another malloc failure
-      0.96   fix detection of invalid v value - particleman@mollyrocket forum
-      0.95   during header scan, seek to markers in case of padding
-      0.94   STBI_NO_STDIO to disable stdio usage; rename all #defines the same
-      0.93   handle jpegtran output; verbose errors
-      0.92   read 4,8,16,24,32-bit BMP files of several formats
-      0.91   output 24-bit Windows 3.0 BMP files
-      0.90   fix a few more warnings; bump version number to approach 1.0
-      0.61   bugfixes due to Marc LeBlanc, Christopher Lloyd
-      0.60   fix compiling as c++
-      0.59   fix warnings: merge Dave Moore's -Wall fixes
-      0.58   fix bug: zlib uncompressed mode len/nlen was wrong endian
-      0.57   fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
-      0.56   fix bug: zlib uncompressed mode len vs. nlen
-      0.55   fix bug: restart_interval not initialized to 0
-      0.54   allow NULL for 'int *comp'
-      0.53   fix bug in png 3->4; speedup png decoding
-      0.52   png handles req_comp=3,4 directly; minor cleanup; jpeg comments
-      0.51   obey req_comp requests, 1-component jpegs return as 1-component,
-             on 'test' only check type, not whether we support this variant
-      0.50   first released version
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
+      1.07    attempt to fix C++ warning/errors again
+      1.06    attempt to fix C++ warning/errors again
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      0.97    jpeg errors on too large a file; also catch another malloc failure
+      0.96    fix detection of invalid v value - particleman@mollyrocket forum
+      0.95    during header scan, seek to markers in case of padding
+      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
+      0.93    handle jpegtran output; verbose errors
+      0.92    read 4,8,16,24,32-bit BMP files of several formats
+      0.91    output 24-bit Windows 3.0 BMP files
+      0.90    fix a few more warnings; bump version number to approach 1.0
+      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
+      0.60    fix compiling as c++
+      0.59    fix warnings: merge Dave Moore's -Wall fixes
+      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.55    fix bug: restart_interval not initialized to 0
+      0.54    allow NULL for 'int *comp'
+      0.53    fix bug in png 3->4; speedup png decoding
+      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
+      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
+              on 'test' only check type, not whether we support this variant
+      0.50    first released version
 */
-- 
2.11.4.GIT