From 374e7358174b9c4d1c9db8f50023a2d46d132662 Mon Sep 17 00:00:00 2001
From: Ketmar Dark <ketmar@ketmar.no-ip.org>
Date: Sun, 24 Apr 2016 22:17:10 +0300
Subject: [PATCH] brand new jpeg reader, this time it is working

---
 d2dimage.d |   19 +-
 glutils.d  |   14 +-
 jpeg.d     | 1673 ++++++++++++++++++++++++++++++++----------------------------
 3 files changed, 924 insertions(+), 782 deletions(-)
 rewrite jpeg.d (97%)

diff --git a/d2dimage.d b/d2dimage.d
index a481c44..8638064 100644
--- a/d2dimage.d
+++ b/d2dimage.d
@@ -377,14 +377,14 @@ private:
   }
 
   void loadJpeg (VFile fl) {
-    auto jpg = new JpegDecoder(fl, JpegDecoder.Upsampling.BILINEAR);
-    if (jpg.image.width < 1 || jpg.image.width > 32760) throw new Exception("invalid image width");
-    if (jpg.image.height < 1 || jpg.image.height > 32760) throw new Exception("invalid image height");
+    auto jpg = readJpeg(fl); // decode first; dimensions outside 1..32760 are rejected just below
+    if (jpg.width < 1 || jpg.width > 32760) throw new Exception("invalid image width");
+    if (jpg.height < 1 || jpg.height > 32760) throw new Exception("invalid image height");
     mtex = null;
-    mwidth = jpg.image.width;
-    mheight = jpg.image.height;
+    mwidth = jpg.width;
+    mheight = jpg.height;
     sx = sy = 0;
-    mimg = jpg.image;
+    mimg = jpg; // the readJpeg result is stored as-is (the old decoder exposed the image via `.image`)
   }
 
   void load (VFile fl) {
@@ -399,7 +399,8 @@ private:
       return;
     }
     // jpeg?
-    if (sign[0..2] == "\xff\xd8") {
+    if (sign[0..2] == "\xff\xd8" && detectJpeg(fl)) {
+      /*
       fl.seek(-2, Seek.End);
       fl.rawReadExact(sign[0..2]);
       if (sign[0..2] == "\xff\xd9") {
@@ -407,6 +408,10 @@ private:
         loadJpeg(fl);
         return;
       }
+      */
+      fl.seek(0);
+      loadJpeg(fl);
+      return;
     }
     // alas, this must be vga
     fl.seek(0);
diff --git a/glutils.d b/glutils.d
index c87f55e..a4ca846 100644
--- a/glutils.d
+++ b/glutils.d
@@ -272,10 +272,10 @@ public final class Texture : OpenGLObject {
   }
 
   void loadJpeg (VFile fl, in Option[] opts...) {
-    auto jpg = new JpegDecoder(fl, JpegDecoder.Upsampling.BILINEAR);
-    if (jpg.image.width < 1 || jpg.image.width > 32760) throw new Exception("invalid image width");
-    if (jpg.image.height < 1 || jpg.image.height > 32760) throw new Exception("invalid image height");
-    createIntr(jpg.image.width, jpg.image.height, jpg.image, opts);
+    auto jpg = readJpeg(fl); // decode first; dimensions outside 1..32760 are rejected before createIntr is called
+    if (jpg.width < 1 || jpg.width > 32760) throw new Exception("invalid image width");
+    if (jpg.height < 1 || jpg.height > 32760) throw new Exception("invalid image height");
+    createIntr(jpg.width, jpg.height, jpg, opts);
   }
 
   void loadImage (string fname, in Option[] opts...) {
@@ -292,7 +292,8 @@ public final class Texture : OpenGLObject {
       return;
     }
     // jpeg?
-    if (sign[0..2] == "\xff\xd8") {
+    if (sign[0..2] == "\xff\xd8" && detectJpeg(fl)) {
+      /*
       fl.seek(-2, Seek.End);
       fl.rawReadExact(sign[0..2]);
       fl.seek(0);
@@ -300,6 +301,9 @@ public final class Texture : OpenGLObject {
         loadJpeg(fl, opts);
         return;
       }
+      */
+      loadJpeg(fl, opts);
+      return;
     }
     throw new Exception("invalid texture image format");
   }
diff --git a/jpeg.d b/jpeg.d
dissimilarity index 97%
index 3781c13..228b463 100644
--- a/jpeg.d
+++ b/jpeg.d
@@ -1,770 +1,903 @@
-// Written in the D programming language.
-
-/**
-* Copyright: Copyright 2012 -
-* License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
-* Authors: Callum Anderson
-* Date: June 6, 2012
-*/
-module jpeg;
-
-import iv.vfs;
-import arsd.color;
-
-
-/**
-* Jpeg decoder. Great reference for baseline JPEG
-* deconding: http://www.opennet.ru/docs/formats/jpeg.txt.
-*/
-class JpegDecoder  {
-  static struct IMGError {
-    string message;
-    int code;
-  }
-
-  bool imageComplete = false;
-  IMGError m_errorState;
-  TrueColorImage m_image;
-
-  @property final auto image() () { pragma(inline, true); return m_image; }
-  @property final IMGError errorState() () const { return m_errorState; }
-
-  // Clamp an integer to 0-255 (ubyte)
-  static ubyte clamp() (const int x) {
-    pragma(inline, true);
-    return (x < 0 ? 0 : (x > 0xFF ? 0xFF : cast(ubyte) x));
-  }
-
-  // Algorithms for upsampling the chroma components, defaults to NEAREST.
-  enum Upsampling {
-    NEAREST,  // Nearest neighbour (fastest)
-    BILINEAR, // Bilinear interpolation
-  }
-
-  // Empty constructor, useful for parsing a stream manually
-  this (VFile fl, in Upsampling algo=Upsampling.NEAREST) {
-    // set the resampling algorithm delegate
-         if (algo == Upsampling.NEAREST) resampleDgt = &nearestNeighbourResample;
-    else if (algo == Upsampling.BILINEAR) resampleDgt = &bilinearResample;
-    else resampleDgt = &nearestNeighbourResample; // just in case
-    ubyte[1] b;
-    for (;;) {
-      if (fl.rawRead(b[]).length != 1) break;
-      parseByte(b.ptr[0]);
-      if (imageComplete) break;
-    }
-    if (!imageComplete) throw new Exception("invalid jpeg image");
-  }
-
-  this (ubyte[] data, in Upsampling algo=Upsampling.NEAREST) {
-    // set the resampling algorithm delegate
-         if (algo == Upsampling.NEAREST) resampleDgt = &nearestNeighbourResample;
-    else if (algo == Upsampling.BILINEAR) resampleDgt = &bilinearResample;
-    else resampleDgt = &nearestNeighbourResample; // just in case
-    foreach (ubyte b; data) {
-      parseByte(b);
-      if (imageComplete) break;
-    }
-    if (!imageComplete) throw new Exception("invalid jpeg image");
-  }
-
-  // parse a single byte
-  void parseByte (ubyte bite) {
-    segment.buffer ~= bite;
-    if (bite == 0xFF) { markerPending = true; return; }
-    if (markerPending) {
-      markerPending = false;
-      if (bite == 0x00) {
-        // this is an 0xFF value
-        segment.buffer = segment.buffer[0..$-1];
-        bite = 0xFF;
-      } else if (bite >= 0xD0 && bite <= 0xD7) {
-        // restart marker
-        segment.buffer = segment.buffer[0..$-2];
-        return;
-      } else if (cast(Marker)bite == Marker.EndOfImage) {
-        previousMarker = currentMarker;
-        currentMarker = cast(Marker) bite;
-        endOfImage();
-        return;
-      } else {
-        previousMarker = currentMarker;
-        currentMarker = cast(Marker) bite;
-        segment = JPGSegment();
-        return;
-      }
-    }
-    if (!segment.headerProcessed) {
-      if (segment.buffer.length == 2) {
-        segment.headerLength = (segment.buffer[0] << 8 | segment.buffer[1]);
-        return;
-      } else if (segment.buffer.length == segment.headerLength) {
-        debug if (m_logging) writeln(currentMarker);
-        processHeader();
-        segment.headerProcessed = true;
-        segment.buffer = null;
-        return;
-      }
-    } else {
-      if (currentMarker == Marker.StartOfScan) sosAction(bite);
-    }
-    ++totalBytesParsed;
-  } // parse
-
-private:
-  // Markers courtesy of http://techstumbler.blogspot.com/2008/09/jpeg-marker-codes.html
-  enum Marker {
-    None = 0x00,
-
-    // Start of Frame markers, non-differential, Huffman coding
-    HuffBaselineDCT = 0xC0,
-    HuffExtSequentialDCT = 0xC1,
-    HuffProgressiveDCT = 0xC2,
-    HuffLosslessSeq = 0xC3,
-
-    // Start of Frame markers, differential, Huffman coding
-    HuffDiffSequentialDCT = 0xC5,
-    HuffDiffProgressiveDCT = 0xC6,
-    HuffDiffLosslessSeq = 0xC7,
-
-    // Start of Frame markers, non-differential, arithmetic coding
-    ArthBaselineDCT = 0xC8,
-    ArthExtSequentialDCT = 0xC9,
-    ArthProgressiveDCT = 0xCA,
-    ArthLosslessSeq = 0xCB,
-
-    // Start of Frame markers, differential, arithmetic coding
-    ArthDiffSequentialDCT = 0xCD,
-    ArthDiffProgressiveDCT = 0xCE,
-    ArthDiffLosslessSeq = 0xCF,
-
-    // Huffman table spec
-    HuffmanTableDef = 0xC4,
-
-    // Arithmetic table spec
-    ArithmeticTableDef = 0xCC,
-
-    // Restart Interval termination
-    RestartIntervalStart = 0xD0,
-    RestartIntervalEnd = 0xD7,
-
-    // Other markers
-    StartOfImage = 0xD8,
-    EndOfImage = 0xD9,
-    StartOfScan = 0xDA,
-    QuantTableDef = 0xDB,
-    NumberOfLinesDef = 0xDC,
-    RestartIntervalDef = 0xDD,
-    HierarchProgressionDef = 0xDE,
-    ExpandRefComponents = 0xDF,
-
-    // Restarts
-    Rst0 = 0xD0, Rst1 = 0xD1, Rst2 = 0xD2, Rst3 = 0xD3,
-    Rst4 = 0xD4, Rst5 = 0xD5, Rst6 = 0xD6, Rst7 = 0xD7,
-
-    // App segments
-    App0 = 0xE0, App1 = 0xE1, App2 = 0xE2, App3 = 0xE3,
-    App4 = 0xE4, App5 = 0xE5, App6 = 0xE6, App7 = 0xE7,
-    App8 = 0xE8, App9 = 0xE9, App10 = 0xEA, App11 = 0xEB,
-    App12 = 0xEC, App13 = 0xED, App14 = 0xEE, App15 = 0xEF,
-
-    // Jpeg Extensions
-    JpegExt0 = 0xF0, JpegExt1 = 0xF1, JpegExt2 = 0xF2, JpegExt3 = 0xF3,
-    JpegExt4 = 0xF4, JpegExt5 = 0xF5, JpegExt6 = 0xF6, JpegExt7 = 0xF7,
-    JpegExt8 = 0xF8, JpegExt9 = 0xF9, JpegExtA = 0xFA, JpegExtB = 0xFB,
-    JpegExtC = 0xFC, JpegExtD = 0xFD,
-
-    // Comments
-    Comment = 0xFE,
-
-    // Reserved
-    ArithTemp = 0x01,
-    ReservedStart = 0x02,
-    ReservedEnd = 0xBF
-  }
-
-  // Value at dctComponent[ix] should go to grid[block_order[ix]]
-  static immutable ubyte[64] block_order =
-      [ 0,  1,  8, 16,  9,  2,  3, 10,   17, 24, 32, 25, 18, 11,  4,  5,
-        12, 19, 26, 33, 40, 48, 41, 34,   27, 20, 13,  6,  7, 14, 21, 28,
-        35, 42, 49, 56, 57, 50, 43, 36,   29, 22, 15, 23, 30, 37, 44, 51,
-        58, 59, 52, 45, 38, 31, 39, 46,   53, 60, 61, 54, 47, 55, 62, 63 ];
-
-  ulong totalBytesParsed = 0;
-  ulong segmentBytesParsed = 0;
-  Marker currentMarker = Marker.None;
-  Marker previousMarker = Marker.None;
-  bool markerPending = false;
-  void delegate(uint cmpIndex) resampleDgt;
-
-  string format = "unknown"; // File format (will only do JFIF)
-  ubyte nComponents, precision;
-
-  struct Component {
-    int id, // component id
-    qtt, // quantization table id
-    h_sample, // horizontal samples
-    v_sample; // vertical samples
-    ubyte[] data; // a single MCU of data for this component
-    int x, y; // x, y are size of MCU
-  }
-  Component[] components;
-
-  // Store the image comment field if any
-  char[] comment;
-
-  // Quantization Tables (hash map of ubyte[64]'s, indexed by table index)
-  ubyte[][int] quantTable;
-
-  // Huffman tables are stored in a hash map
-  ubyte[16] nCodes; // Number of codes of each bit length (cleared after each table is defined)
-  struct hashKey {
-    ubyte index;    // Table index
-    ubyte nBits;    // Number of bits in code
-    short bitCode;  // Actual bit code
-  }
-  ubyte[hashKey] huffmanTable;
-
-  // Track the state of a scan segment
-  struct ScanState {
-    short cmpIdx = 0; // Current component index in scan
-
-    int MCUWidth, MCUHeight; // Dimensions of an MCU
-    int nxMCU, nyMCU, xMCU, yMCU; // Number of MCU's, and current MCU
-
-    uint buffer = 0, bufferLength = 0, needBits = 0;
-    bool comparing = true;
-    ubyte[3] dct, act, nCmpBlocks; // dct, act store the DC and AC table indexes for each component
-
-    int[3] dcTerm;  // DC coefficients for each component
-    int[64] dctComponents; // DCT coefficients for current component
-    uint dctCmpIndex = 0, blockNumber = 0; // DCT coefficient index and current block in MCU
-    int restartInterval; // How many MCU's are parsed before a restart (reset the DC terms)
-    int MCUSParsed; // Number of image MCU's parsed, for use with restart interval
-  }
-  ScanState scState; // ditto
-
-  struct JPGSegment {
-    bool headerProcessed;
-    int headerLength;
-    ubyte[] buffer;
-  }
-  JPGSegment segment;
-
-  debug bool m_logging;
-  short x, y; // These are the final image width and height
-
-  // Process a segment header
-  void processHeader () {
-    /**
-    * Remember: first two bytes in the buffer are the header length,
-    * so header info starts at segment.buffer[2]!
-    */
-    switch (currentMarker) {
-      case Marker.Comment: // Comment segment
-        comment = cast(char[])segment.buffer[2..$];
-        debug if (m_logging) writeln("JPEG: Comment: ", comment);
-        break;
-      case Marker.App0: // App0, indicates JFIF format
-        if (previousMarker == Marker.StartOfImage) format = "JFIF";
-        break;
-      case Marker.RestartIntervalDef: // Restart interval definition
-        scState.restartInterval = cast(int)(segment.buffer[2] << 8 | segment.buffer[3]);
-        debug if (m_logging) writeln("JPEG: Restart interval = ", scState.restartInterval);
-        break;
-      case Marker.QuantTableDef: // A quantization table definition
-        for (int i = 2; i < segment.buffer.length; i += 65) {
-          int precision = (segment.buffer[i] >> 4);
-          int index = (segment.buffer[i] & 0x0F);
-          quantTable[index] = segment.buffer[i+1..i+1+64].dup;
-          debug if (m_logging) writefln("JPEG: Quantization table %s defined", index);
-        }
-        break;
-      case Marker.HuffBaselineDCT: // Baseline frame
-        ubyte precision = segment.buffer[2];
-        y = cast(short) (segment.buffer[3] << 8 | segment.buffer[4]);
-        x = cast(short) (segment.buffer[5] << 8 | segment.buffer[6]);
-        nComponents = segment.buffer[7];
-        components.length = nComponents;
-        int i = 8;
-        foreach (cmp; 0..nComponents) {
-          components[cmp].id = segment.buffer[i];
-          components[cmp].h_sample = (segment.buffer[i+1] >> 4);
-          components[cmp].v_sample = (segment.buffer[i+1] & 0x0F);
-          components[cmp].qtt = segment.buffer[i+2];
-          i += 3;
-          debug if (m_logging) writefln("JPEG: Component %s defined", cmp);
-        }
-        break;
-      case Marker.HuffProgressiveDCT: // Progressive JPEG, cannot decode
-        m_errorState.code = 1;
-        m_errorState.message = "JPG: Progressive JPEG detected, unable to load";
-        break;
-      case Marker.HuffmanTableDef: // Huffman Table Definition, the mapping between bitcodes and Huffman codes
-        int i = 2;
-        while (i < segment.buffer.length) {
-          import std.algorithm : reduce;
-          ubyte index = segment.buffer[i]; // Huffman table index
-          ++i;
-
-          auto nCodes = segment.buffer[i..i+16]; // Number of codes at each tree depth
-          int totalCodes = reduce!("a + b")(0, nCodes); // Sum up total codes, so we know when we are done
-          int storedCodes = 0;
-          i += 16;
-
-          ubyte huffmanRow = 0;
-          short huffmanCol = 0;
-          while (storedCodes != totalCodes) {
-            /**
-            * If nCodes is zero, we need to move down the table. The 'table'
-            * is basically a binary tree, seen as an array.
-            */
-            while (huffmanRow < 15 && nCodes[huffmanRow] == 0) {
-              ++huffmanRow;
-              huffmanCol *= 2;
-            }
-
-            if (huffmanRow < 16) {
-              // Store the code into the hash table, using index, row and bitcode to make the key
-              hashKey key = { index:index, nBits: cast(ubyte)(huffmanRow+1), bitCode: huffmanCol};
-              huffmanTable[key] = segment.buffer[i];
-              ++storedCodes;
-              ++huffmanCol;
-              --nCodes[huffmanRow];
-              ++i;
-            }
-          } // while storedCodes != totalCodes
-        }
-        break;
-      case Marker.StartOfScan: // StartOfScan (image data) header
-        int scanComponents = segment.buffer[2]; // Number of components in the scan
-        if (scanComponents != nComponents) throw new Exception("JPEG: Scan components != image components!");
-
-        int i = 3;
-        foreach (cmp; 0..scanComponents) {
-          ubyte id = cast(ubyte)(segment.buffer[i] - 1);
-          scState.dct[id] = segment.buffer[i+1] >> 4;   // Component's DC huffman table
-          scState.act[id] = segment.buffer[i+1] & 0x0F; // Component's AC huffman table
-        }
-        // There is more to the header, but it is not needed
-
-        // Calculate MCU dimensions
-        int v_samp_max = 0, h_samp_max = 0;
-        foreach (cmp; components) {
-          if (cmp.h_sample > h_samp_max) h_samp_max = cmp.h_sample;
-          if (cmp.v_sample > v_samp_max) v_samp_max = cmp.v_sample;
-        }
-        scState.MCUWidth = h_samp_max*8;
-        scState.MCUHeight = v_samp_max*8;
-
-        // Number of MCU's in the whole transformed image (the actual image could be smaller)
-        scState.nxMCU = x / scState.MCUWidth;
-        scState.nyMCU = y / scState.MCUHeight;
-        if (x % scState.MCUWidth > 0) ++scState.nxMCU;
-        if (y % scState.MCUHeight > 0) ++scState.nyMCU;
-
-        // Allocate the image
-        m_image = new TrueColorImage(scState.nxMCU*scState.MCUWidth, scState.nyMCU*scState.MCUHeight);
-
-        // Calculate the number of pixels for each component from the number of MCU's and sampling rate
-        foreach (idx, ref cmp; components) {
-          // Just make it big enough for a single MCU
-          cmp.x = cmp.h_sample*8;
-          cmp.y = cmp.v_sample*8;
-          cmp.data = new ubyte[](cmp.x*cmp.y);
-          debug if (m_logging) writefln("Component %s, x:%s, y:%s", idx, cmp.x, cmp.y);
-        }
-        break;
-      default:
-        debug if (m_logging) writeln("JPEG: ProcessHeader called on un-handled segment: ", currentMarker);
-        break;
-    }
-  }
-
-  // Start of scan (image)
-  void sosAction (ubyte bite) {
-    // Put the new bite into the buffer
-    scState.buffer = scState.buffer << 8 | bite ;
-    scState.bufferLength += 8;
-    segment.buffer = null;
-    while (scState.bufferLength >= scState.needBits) {
-      if (scState.comparing) {
-        // Try to get a Huffman code from the buffer
-        ubyte* huffCode = fetchHuffmanCode(scState.buffer, scState.bufferLength, scState.needBits, scState.cmpIdx);
-        if (huffCode !is null) {
-          // Found a valid huffman code
-          // Our buffer has effectively shrunk by the number of bits we just took
-          scState.bufferLength -= scState.needBits;
-          scState.needBits = 1;
-          processHuffmanCode(*huffCode);
-          continue;
-        } else {
-          // Failed to get a Huffman code, try with more bits
-          ++scState.needBits;
-        }
-      } else {
-        // Not comparing, getting value bits
-        if (scState.bufferLength < scState.needBits) continue; // Need more bits in the buffer
-        // We have enough bits now to grab the value, so do that
-        int dctComp = fetchDCTComponent(scState.buffer, scState.bufferLength, scState.needBits);
-        // Clear these bits from the buffer, set flag back to 'comparing'
-        scState.bufferLength -= scState.needBits;
-        scState.comparing = true;
-        // Put the new value into the component array
-        scState.dctComponents[scState.dctCmpIndex] = dctComp;
-        ++scState.dctCmpIndex; // Increment our index in the components array
-        if (scState.dctCmpIndex == 64) endOfBlock(); // If we have reached the last index, this is end of block
-        scState.needBits = 1; // Reset the number of bits we need for comparing
-      } // if !comparing
-    } // while bufferLength >= needBits
-  } // sosAction
-
-  // Check the buffer for a valid Huffman code
-  ubyte* fetchHuffmanCode (int buffer, int bufferLength, int needBits, int componentIndex) {
-    // Create a mask to compare needBits bits in the buffer
-    uint mask = ((1 << needBits) - 1) << (bufferLength-needBits);
-    ushort bitCode = cast(ushort) ((mask & buffer) >> (bufferLength - needBits));
-    ubyte tableId = 0;
-    ubyte huffIndex = cast(ubyte) (componentIndex != 0);
-    if (scState.dctCmpIndex != 0) {
-      // This is an AC component
-      huffIndex += 16;
-      tableId = scState.act[componentIndex];
-    } else {
-      // This is a DC component
-      tableId = scState.dct[componentIndex];
-    }
-    hashKey key = hashKey(huffIndex, cast(ubyte)needBits, bitCode);
-    return (key in huffmanTable);
-
-  } // fetchHuffmanCode
-
-  // Process a Huffman code
-  void processHuffmanCode (short huffCode) {
-    if (huffCode == 0x00) {
-      // END OF BLOCK
-      if (scState.dctCmpIndex == 0) {
-        // If we are on the DC term, don't call end of block...
-        ++scState.dctCmpIndex; // just increment the index
-      } else {
-        endOfBlock();
-      }
-    } else {
-      // Not an end of block
-      // The zero run length (not used for the DC component)
-      ubyte preZeros = cast(ubyte) (huffCode >> 4);
-      // Increment the index by the number of preceeding zeros
-      scState.dctCmpIndex += preZeros;
-      // The number of bits we need to get an actual value
-      if (scState.dctCmpIndex == 64) {
-        // Check if we are at the end of the block
-        endOfBlock();
-      } else {
-        scState.comparing = false; // Not comparing bits anymore, waiting for a bitcode
-        scState.needBits = cast(uint)(huffCode & 0x0F); // Number of bits in the bitcode
-      }
-    }
-  } // processHuffmanCode
-
-  // Fetch the actual DCT component value
-  int fetchDCTComponent (int buffer, int bufferLength, int needBits) {
-    // Create a mask to get the value from the (int) buffer
-    uint mask = ((1 << needBits) - 1) << (bufferLength-needBits);
-    short bits = cast(short) ((mask & buffer) >> (bufferLength - needBits));
-    // The first bit tells us which side of the value table we are on (- or +)
-    int bit0 = bits >> (needBits-1);
-    int offset = 2^^needBits;
-    return (bits & ((1 << (needBits-1)) - 1)) + (bit0*offset/2 - (1-bit0)*(offset - 1));
-  } // fetchDCTComponent
-
-  // Have reached the end of a block, within a scan
-  void endOfBlock () {
-    import std.conv : to;
-    // Convert the DC value from relative to absolute
-    scState.dctComponents[0] += scState.dcTerm[scState.cmpIdx];
-    // Store this block's DC term, to apply to the next block
-    scState.dcTerm[scState.cmpIdx] = scState.dctComponents[0];
-    // Grab the quantization table for this component
-    int[] qTable = to!(int[])(quantTable[components[scState.cmpIdx].qtt]);
-    // Dequantize the coefficients
-    scState.dctComponents[] *= qTable[];
-    // Un zig-zag
-    int[64] block;
-    foreach (idx, elem; block_order) block[elem] = scState.dctComponents[idx];
-    // Calculate the offset into the component's pixel array
-    int offset = 0;
-    with (scState) {
-      // Each component now only holds a single MCU
-      offset = 8*( (blockNumber % 2) + (blockNumber / 2)*components[cmpIdx].x);
-    }
-    // The recieving buffer of the IDCT is then the component's pixel array
-    ubyte* pix = components[scState.cmpIdx].data.ptr + offset;
-    // Do the inverse discrete cosine transform
-    foreach(i; 0..8) colIDCT(block.ptr + i); // columns
-    foreach(i; 0..8) rowIDCT(block.ptr + i*8, pix + i*components[scState.cmpIdx].x); // rows
-    scState.dctCmpIndex = 0;
-    scState.dctComponents[] = 0;
-    scState.comparing = true;
-    // We have just decoded an 8x8 'block'
-    ++scState.blockNumber;
-    if (scState.blockNumber == components[scState.cmpIdx].h_sample*components[scState.cmpIdx].v_sample) {
-      // All the components in this block have been parsed
-      scState.blockNumber = 0;
-      ++scState.cmpIdx;
-      if (scState.cmpIdx == nComponents) {
-        // All components in the MCU have been parsed, so increment
-        endOfMCU();
-        scState.cmpIdx = 0;
-        ++scState.MCUSParsed;
-        ++scState.xMCU;
-        if (scState.xMCU == scState.nxMCU) {
-          scState.xMCU = 0;
-          ++scState.yMCU;
-        }
-      }
-    } // if done all blocks for this component in the current MCU
-    // Check for restart marker
-    if (scState.restartInterval != 0 && scState.MCUSParsed == scState.restartInterval) {
-      // We have come up to a restart marker, so reset the DC terms
-      scState.dcTerm[] = 0;
-      scState.MCUSParsed = 0;
-      // Need to skip all the bits up to the next byte boundary
-      while (scState.bufferLength % 8 != 0) --scState.bufferLength;
-    }
-  } // endOfBlock
-
-  // An MCU has been decoded, so resample, convert, and store
-  void endOfMCU () {
-    if (nComponents == 3) {
-      // Resample if needed
-      if (components[1].x != scState.MCUWidth) resampleDgt(1);
-      if (components[2].x != scState.MCUWidth) resampleDgt(2);
-      // YCbCr -> RGB conversion
-      YCrCBtoRGB();
-    }
-  }
-
-  // Resample an MCU with nearest neighbour interp
-  void nearestNeighbourResample (uint cmpIndex) {
-    with (components[cmpIndex]) {
-      ubyte[] buffer = new ubyte[](scState.MCUWidth*scState.MCUHeight);
-      float x_ratio = cast(float)(x-1)/cast(float)(scState.MCUWidth);
-      float y_ratio = cast(float)(y-1)/cast(float)(scState.MCUHeight);
-      foreach (immutable r; 0..scState.MCUHeight) {
-        foreach (immutable c; 0..scState.MCUWidth) {
-          int px = cast(int)(x_ratio * cast(float)c);
-          int py = cast(int)(y_ratio * cast(float)r);
-          buffer[c + scState.MCUWidth*r] = data[px + py*x];
-        } // cols
-      } // rows
-      //data.clear;
-      data = buffer;
-    } // with components[cmpIdx]
-  }
-
-  // Resample an MCU with bilinear interp
-  void bilinearResample (uint cmpIndex) {
-    with (components[cmpIndex]) {
-      ubyte[] buffer = new ubyte[](scState.MCUWidth*scState.MCUHeight);
-      float x_ratio = cast(float)(x-1)/cast(float)(scState.MCUWidth);
-      float y_ratio = cast(float)(y-1)/cast(float)(scState.MCUHeight);
-      foreach (immutable r; 0..scState.MCUHeight) {
-        foreach (immutable c; 0..scState.MCUWidth) {
-          float px = (x_ratio * cast(float)c);
-          float py = (y_ratio * cast(float)r);
-          int x0 = cast(int)px;
-          int y0 = cast(int)py;
-          // Weighting factors
-          float fx = px - x0;
-          float fy = py - y0;
-          float fx1 = 1.0f - fx;
-          float fy1 = 1.0f - fy;
-          /** Get the locations in the src array of the 2x2 block surrounding (row,col)
-          * 01 ------- 11
-          * | (row,col) |
-          * 00 ------- 10
-          */
-          ubyte p1 = data[x0 + y0*x];
-          ubyte p2 = data[(x0+1) + y0*x];
-          ubyte p3 = data[x0 + (y0+1)*x];
-          ubyte p4 = data[(x0+1) + (y0+1)*x];
-          int wgt1 = cast(int)(fx1 * fy1 * 256.0f);
-          int wgt2 = cast(int)(fx  * fy1 * 256.0f);
-          int wgt3 = cast(int)(fx1 * fy  * 256.0f);
-          int wgt4 = cast(int)(fx  * fy  * 256.0f);
-          int v = (p1 * wgt1 + p2 * wgt2 + p3 * wgt3 + p4 * wgt4) >> 8;
-          buffer[c + scState.MCUWidth*r] = cast(ubyte)v;
-        } // cols
-      } // rows
-      //data.clear;
-      data = buffer;
-    } // with components[cmpIdx]
-  } // bilinearResample
-
-  // Convert YCbCr to RGB an store in output image
-  void YCrCBtoRGB () {
-    // Convert to RGB
-    ubyte[] RGBref = m_image.imageData.bytes;
-    ubyte[] Yref = components[0].data;
-    ubyte[] Cbref = components[1].data;
-    ubyte[] Crref = components[2].data;
-    int r, g, b, i = 0, stride = scState.MCUWidth;
-    int ip0 = 0, ipStride = 0;
-    with (scState) {
-      ip0 = (xMCU*MCUWidth + yMCU*MCUWidth*MCUHeight*nxMCU)*4;
-      ipStride = MCUWidth*nxMCU*4;
-    }
-    foreach (immutable y; 0..scState.MCUHeight) {
-      foreach (immutable x; 0..scState.MCUWidth) {
-        int y_fixed = (Yref[i+x] << 16) + 32768; // rounding
-        int cr = Crref[i+x] - 128;
-        int cb = Cbref[i+x] - 128;
-        r = y_fixed + cr*cast(int)(1.40200f * 65536 + 0.5);
-        g = y_fixed - cr*cast(int)(0.71414f * 65536 + 0.5) -
-            cb*cast(int)(0.34414f * 65536 + 0.5);
-        b = y_fixed + cb*cast(int)(1.77200f * 65536 + 0.5);
-        r >>= 16;
-        g >>= 16;
-        b >>= 16;
-
-        RGBref[ip0+x*4..ip0+x*4+3] = [clamp(r), clamp(g), clamp(b)];
-        RGBref[ip0+x*4+3] = 255;
-      }
-      i += stride;
-      ip0 += ipStride;
-    }
-  } // YCbCrtoRGB
-
-  // End of Image
-  void endOfImage () {
-    /**
-    * Crop the image back to its real size (JPEG encoders can increase
-    * increase the dimensions to make them divisible by 8 for the DCT
-    */
-    //image.resize(x, y, Image.ResizeAlgo.CROP);
-    if (m_image.width != x || m_image.height != y) {
-      TrueColorImage newImg = new TrueColorImage(x, y);
-      foreach (immutable dy; 0..y) {
-        newImg.imageData.colors[dy*newImg.width..dy*newImg.width+x] = m_image.imageData.colors[dy*m_image.width..dy*m_image.width+x];
-      }
-      m_image = newImg;
-    }
-    // Clear some fields
-    scState = ScanState();
-    quantTable.clear;
-    huffmanTable.clear;
-    components = null; //.clear;
-    imageComplete = true;
-  } // eoiAction
-
-  /**
-  * The following inverse discrete cosine transform (IDCT) voodoo comes from:
-  * stbi-1.33 - public domain JPEG/PNG reader - http://nothings.org/stb_image.c
-  */
-  void colIDCT (int* block) {
-    int x0, x1, x2, x3, t0, t1, t2, t3, p1, p2, p3, p4, p5;
-    if (block[8] == 0 && block[16] == 0 && block[24] == 0 && block[32] == 0 && block[40] == 0 && block[48] == 0 && block[56] == 0) {
-      int dcterm = block[0] << 2;
-      block[0] = block[8] = block[16] = block[24] =
-      block[32] = block[40] = block[48] = block[56] = dcterm;
-      return;
-    }
-    p2 = block[16];
-    p3 = block[48];
-    p1 = (p2+p3)*cast(int)(0.5411961f * 4096 + 0.5);
-    t2 = p1 + p3*cast(int)(-1.847759065f * 4096 + 0.5);
-    t3 = p1 + p2*cast(int)( 0.765366865f * 4096 + 0.5);
-    p2 = block[0];
-    p3 = block[32];
-    t0 = (p2+p3) << 12;
-    t1 = (p2-p3) << 12;
-    x0 = t0+t3;
-    x3 = t0-t3;
-    x1 = t1+t2;
-    x2 = t1-t2;
-    t0 = block[56];
-    t1 = block[40];
-    t2 = block[24];
-    t3 = block[8];
-    p3 = t0+t2;
-    p4 = t1+t3;
-    p1 = t0+t3;
-    p2 = t1+t2;
-    p5 = (p3+p4)*cast(int)( 1.175875602f * 4096 + 0.5);
-    t0 = t0*cast(int)( 0.298631336f * 4096 + 0.5);
-    t1 = t1*cast(int)( 2.053119869f * 4096 + 0.5);
-    t2 = t2*cast(int)( 3.072711026f * 4096 + 0.5);
-    t3 = t3*cast(int)( 1.501321110f * 4096 + 0.5);
-    p1 = p5 + p1*cast(int)(-0.899976223f * 4096 + 0.5);
-    p2 = p5 + p2*cast(int)(-2.562915447f * 4096 + 0.5);
-    p3 = p3*cast(int)(-1.961570560f * 4096 + 0.5);
-    p4 = p4*cast(int)(-0.390180644f * 4096 + 0.5);
-    t3 += p1+p4;
-    t2 += p2+p3;
-    t1 += p2+p4;
-    t0 += p1+p3;
-    x0 += 512;
-    x1 += 512;
-    x2 += 512;
-    x3 += 512;
-    block[0]  = (x0+t3) >> 10;
-    block[56] = (x0-t3) >> 10;
-    block[8]  = (x1+t2) >> 10;
-    block[48] = (x1-t2) >> 10;
-    block[16] = (x2+t1) >> 10;
-    block[40] = (x2-t1) >> 10;
-    block[24] = (x3+t0) >> 10;
-    block[32] = (x3-t0) >> 10;
-  } // IDCT_1D_COL
-
-  // ditto
-  void rowIDCT (int* block, ubyte* outData) {
-    int x0, x1, x2, x3, t0, t1, t2, t3, p1, p2, p3, p4, p5;
-    p2 = block[2];
-    p3 = block[6];
-    p1 = (p2+p3)*cast(int)(0.5411961f * 4096 + 0.5);
-    t2 = p1 + p3*cast(int)(-1.847759065f * 4096 + 0.5);
-    t3 = p1 + p2*cast(int)( 0.765366865f * 4096 + 0.5);
-    p2 = block[0];
-    p3 = block[4];
-    t0 = (p2+p3) << 12;
-    t1 = (p2-p3) << 12;
-    x0 = t0+t3;
-    x3 = t0-t3;
-    x1 = t1+t2;
-    x2 = t1-t2;
-    t0 = block[7];
-    t1 = block[5];
-    t2 = block[3];
-    t3 = block[1];
-    p3 = t0+t2;
-    p4 = t1+t3;
-    p1 = t0+t3;
-    p2 = t1+t2;
-    p5 = (p3+p4)*cast(int)( 1.175875602f * 4096 + 0.5);
-    t0 = t0*cast(int)( 0.298631336f * 4096 + 0.5);
-    t1 = t1*cast(int)( 2.053119869f * 4096 + 0.5);
-    t2 = t2*cast(int)( 3.072711026f * 4096 + 0.5);
-    t3 = t3*cast(int)( 1.501321110f * 4096 + 0.5);
-    p1 = p5 + p1*cast(int)(-0.899976223f * 4096 + 0.5);
-    p2 = p5 + p2*cast(int)(-2.562915447f * 4096 + 0.5);
-    p3 = p3*cast(int)(-1.961570560f * 4096 + 0.5);
-    p4 = p4*cast(int)(-0.390180644f * 4096 + 0.5);
-    t3 += p1+p4;
-    t2 += p2+p3;
-    t1 += p2+p4;
-    t0 += p1+p3;
-    x0 += 65536 + (128<<17);
-    x1 += 65536 + (128<<17);
-    x2 += 65536 + (128<<17);
-    x3 += 65536 + (128<<17);
-    outData[0] = clamp((x0+t3) >> 17);
-    outData[7] = clamp((x0-t3) >> 17);
-    outData[1] = clamp((x1+t2) >> 17);
-    outData[6] = clamp((x1-t2) >> 17);
-    outData[2] = clamp((x2+t1) >> 17);
-    outData[5] = clamp((x2-t1) >> 17);
-    outData[3] = clamp((x3+t0) >> 17);
-    outData[4] = clamp((x3-t0) >> 17);
-  } // IDCT_1D_ROW
-} // JpegDecoder
+// Baseline JPEG decoder
+// adapted from https://github.com/lgvz/imageformats
+// Boost License, i suppose
+module jpeg;
+
+private:
+import arsd.color;
+import iv.vfs;
+import iv.vfs.streams;
+
+
+// ////////////////////////////////////////////////////////////////////////// //
+public class ImageIOException : Exception { // thrown by this module for malformed or unsupported JPEG data
+  this (string msg, string file=__FILE__, size_t line=__LINE__, Throwable next=null) const pure nothrow @safe @nogc { // forwards everything to Exception
+    super(msg, file, line, next);
+  }
+}
+
+
+// ////////////////////////////////////////////////////////////////////////// //
+// public declarations
+public bool detectJpeg (VFile stream) {
+  // probe for a frame header; the stream is rewound however probing ends
+  scope(exit) stream.seek(0, Seek.Set);
+  try {
+    int width, height, channels;
+    readJpegInfo(stream, width, height, channels);
+  } catch (Exception) {
+    return false;
+  }
+  return true;
+}
+
+
+public void readJpegInfo (VFile stream, out int w, out int h, out int chans) { // scans markers up to the first frame header and reports its size and component count
+  import std.bitmanip : bigEndianToNative;
+
+  ubyte[2] marker = void;
+  stream.rawReadExact(marker[]);
+
+  // SOI
+  if (marker[0..2] != [0xff, 0xd8]) throw new ImageIOException("not JPEG");
+
+  for (;;) {
+    stream.rawReadExact(marker[]);
+    if (marker[0] != 0xff) throw new ImageIOException("no frame header");
+    while (marker[1] == 0xff) stream.rawReadExact(marker[1..$]); // skip repeated 0xff bytes in front of the marker code
+    enum SKIP = 0xff; // shared case label for skipped segments; marker[1] is never 0xff after the loop above
+    switch (marker[1]) with (Marker) {
+      case SOF0: .. case SOF3: goto case;
+      case SOF9: .. case SOF11:
+        ubyte[8] tmp;
+        stream.rawReadExact(tmp[0..8]); // bytes 3..4 hold the height, 5..6 the width, 7 the component count
+        //int len = bigEndianToNative!ushort(tmp[0..2]);
+        w = bigEndianToNative!ushort(tmp[5..7]);
+        h = bigEndianToNative!ushort(tmp[3..5]);
+        chans = tmp[7];
+        return;
+      case SOS, EOI: throw new ImageIOException("no frame header");
+      case DRI, DHT, DQT, COM: goto case SKIP;
+      case APP0: .. case APPf: goto case SKIP;
+      case SKIP:
+        ubyte[2] lenbuf = void;
+        stream.rawReadExact(lenbuf[]);
+        int skiplen = bigEndianToNative!ushort(lenbuf)-2; // the two length bytes were already consumed
+        stream.seek(skiplen, Seek.Cur);
+        break;
+      default: throw new ImageIOException("unsupported marker");
+    }
+  }
+  assert(0);
+}
+
+
+public TrueColorImage readJpeg (VFile stream) {
+  // decodes a JPEG from `stream` into a 4-channel image
+  enum req_chans = 4;
+
+  // the data must open with the SOI marker
+  ubyte[2] soi = void;
+  stream.rawReadExact(soi[]);
+  if (soi[0] != 0xff || soi[1] != 0xd8) throw new ImageIOException("not JPEG");
+
+  JPEG_Decoder dc = { stream: stream };
+  dc.read_markers(); // parses segments up to the first scan header (or EOI)
+  if (dc.eoi_reached) throw new ImageIOException("no image data");
+
+  static if (req_chans == 0) dc.tgt_chans = dc.num_comps;
+  else dc.tgt_chans = cast(int)req_chans;
+
+  // decode the scan and wrap the resulting pixel buffer
+  return new TrueColorImage(dc.width, dc.height, decode_jpeg(dc));
+}
+
+
+// ////////////////////////////////////////////////////////////////////////// //
+private:
+struct JPEG_Decoder { // all state of one decoding run; passed by ref between the helpers below
+  @disable this (this); // just in case
+
+  VFile stream; // input the segments and entropy-coded data are read from
+
+  bool has_frame_header = false; // set once SOF0 has been parsed
+  bool eoi_reached = false;      // set when the EOI marker is seen
+
+  ubyte[64][4] qtables;  // quantization tables, stored in the order read from the DQT segment
+  HuffTab[2] ac_tables;
+  HuffTab[2] dc_tables;
+
+  ubyte cb;      // current byte (next bit always at MSB)
+  int bits_left; // num of unused bits in cb
+
+  bool correct_comp_ids; // true when component ids in the frame header start at 1 rather than 0
+  Component[3] comps;
+  ubyte num_comps;       // 1 or 3 (enforced by read_frame_header)
+  int tgt_chans;         // channels per pixel in the output buffer
+
+  int width, height;
+
+  int hmax, vmax; // largest horizontal/vertical sampling factor over all components
+
+  ushort restart_interval; // number of MCUs in restart interval
+
+  // image component
+  static struct Component {
+    ubyte sfx, sfy; // sampling factors, aka. h and v
+    size_t x, y;    // total num of samples, without fill samples
+    ubyte qtable;   // index into qtables
+    ubyte ac_table; // index into ac_tables
+    ubyte dc_table; // index into dc_tables
+    int pred;       // dc prediction
+    ubyte[] data;   // reconstructed samples
+  }
+
+  int num_mcu_x; // MCUs per row
+  int num_mcu_y; // MCU rows
+}
+
+
+struct HuffTab { // one huffman table in the form used by decode_huff
+  ubyte[256] values; // symbols as read from the DHT segment
+  ubyte[257] sizes;  // code length of every symbol, zero-terminated (filled by derive_table)
+  short[16] mincode, maxcode; // smallest/largest code for each code length, -1 when the length is unused
+  short[16] valptr;  // index into values of the first symbol of each code length, -1 when unused
+}
+
+
+enum Marker : ubyte { // second byte of a marker (the first byte is always 0xff)
+  SOI = 0xd8,    // start of image
+  SOF0 = 0xc0,   // start of frame / baseline DCT
+  //SOF1 = 0xc1,   // start of frame / extended seq.
+  //SOF2 = 0xc2,   // start of frame / progressive DCT
+  SOF3 = 0xc3,   // start of frame / lossless
+  SOF9 = 0xc9,   // start of frame / extended seq., arithmetic
+  SOF11 = 0xcb,   // start of frame / lossless, arithmetic
+  DHT = 0xc4,    // define huffman tables
+  DQT = 0xdb,    // define quantization tables
+  DRI = 0xdd,    // define restart interval
+  SOS = 0xda,    // start of scan
+  DNL = 0xdc,    // define number of lines
+  RST0 = 0xd0,   // restart entropy coded data
+  // ... RST1 to RST6 are only matched through the RST0..RST7 range check
+  RST7 = 0xd7,   // restart entropy coded data
+  APP0 = 0xe0,   // application 0 segment
+  // ... APP1 to APPe are only matched through the APP0..APPf case range
+  APPf = 0xef,   // application f segment
+  //DAC = 0xcc,    // define arithmetic conditioning table
+  COM = 0xfe,    // comment
+  EOI = 0xd9,    // end of image
+}
+
+
+void read_markers (ref JPEG_Decoder dc) { // consumes segments until a scan header has been read or EOI is seen
+  import std.bitmanip : bigEndianToNative;
+  bool has_next_scan_header = false;
+  while (!has_next_scan_header && !dc.eoi_reached) {
+    ubyte[2] marker = void;
+    dc.stream.rawReadExact(marker[]);
+    if (marker[0] != 0xff) throw new ImageIOException("no marker");
+    while (marker[1] == 0xff) dc.stream.rawReadExact(marker[1..$]); // skip repeated 0xff bytes in front of the marker code
+    debug(DebugJPEG) writefln("marker: %s (%1$x)\t", cast(Marker)marker[1]);
+    switch (marker[1]) with (Marker) {
+      case DHT: dc.read_huffman_tables(); break;
+      case DQT: dc.read_quantization_tables(); break;
+      case SOF0: // the only frame type handled; other SOFn markers end up in the default branch
+        if (dc.has_frame_header) throw new ImageIOException("extra frame header");
+        debug(DebugJPEG) writeln();
+        dc.read_frame_header();
+        dc.has_frame_header = true;
+        break;
+      case SOS:
+        if (!dc.has_frame_header) throw new ImageIOException("no frame header");
+        dc.read_scan_header();
+        has_next_scan_header = true; // ends the loop; the caller decodes the data that follows
+        break;
+      case DRI: dc.read_restart_interval(); break;
+      case EOI: dc.eoi_reached = true; break;
+      case APP0: .. case APPf: goto case;
+      case COM:
+        debug(DebugJPEG) writefln("-> skipping segment");
+        ubyte[2] lenbuf = void;
+        dc.stream.rawReadExact(lenbuf[]);
+        int len = bigEndianToNative!ushort(lenbuf)-2; // the two length bytes were already consumed
+        dc.stream.seek(len, Seek.Cur);
+        break;
+      default: throw new ImageIOException("invalid / unsupported marker");
+    }
+  }
+}
+
+
+// DHT -- define huffman tables
+void read_huffman_tables (ref JPEG_Decoder dc) {
+  // parses one DHT segment and (re)builds every table it defines
+  import std.bitmanip : bigEndianToNative;
+  ubyte[2] lenbuf = void;
+  ubyte[17] hdr = void; // table info byte followed by the 16 BITS counts
+  dc.stream.rawReadExact(lenbuf[]);
+  // the segment length includes its own two bytes
+  int remaining = bigEndianToNative!ushort(lenbuf)-2;
+  // a single segment may carry several tables back to back
+  while (remaining > 0) {
+    dc.stream.rawReadExact(hdr[]);
+    immutable ubyte slot = hdr[0]&0xf; // destination id, only 0 and 1 are accepted
+    immutable ubyte klass = hdr[0]>>4; // 0 selects the DC tables, 1 the AC tables
+    if (slot > 1 || klass > 1) throw new ImageIOException("invalid / not supported");
+    // the number of symbols is the sum of the per-length code counts
+    int total = 0;
+    foreach (immutable ubyte cnt; hdr[1..17]) total += cnt;
+    if (total > 256) throw new ImageIOException("invalid / not supported");
+    // pick the table being defined, then load its symbols and derive the lookup data
+    HuffTab* tab = (klass == 0 ? &dc.dc_tables[slot] : &dc.ac_tables[slot]);
+    dc.stream.rawReadExact(tab.values[0..total]);
+    derive_table(*tab, hdr[1..17]);
+    remaining -= 17+total;
+  }
+}
+
+
+// fills table.sizes and the decoding tables from the per-length code counts; num_values is the BITS
+void derive_table (ref HuffTab table, in ref ubyte[16] num_values) {
+  short[256] codes;
+  int k = 0;
+  foreach (immutable i; 0..16) { // list the code length of every symbol, shortest first
+    foreach (immutable j; 0..num_values[i]) {
+      table.sizes[k] = cast(ubyte)(i+1);
+      ++k;
+    }
+  }
+  table.sizes[k] = 0; // terminator
+  k = 0;
+  short code = 0;
+  ubyte si = table.sizes[k];
+  while (si != 0) { // si == 0 here means the table has no codes at all; assigning codes would run k past both arrays
+    do { // consecutive codes for all symbols of the current length
+      codes[k] = code;
+      ++code;
+      ++k;
+    } while (si == table.sizes[k]);
+    if (table.sizes[k] == 0) break;
+    debug(DebugJPEG) assert(si < table.sizes[k]);
+    do { // moving to a longer length appends zero bits to the running code
+      code <<= 1;
+      ++si;
+    } while (si != table.sizes[k]);
+  }
+  derive_mincode_maxcode_valptr(table.mincode, table.maxcode, table.valptr, codes, num_values);
+}
+
+
+// F.15
+void derive_mincode_maxcode_valptr (ref short[16] mincode, ref short[16] maxcode, ref short[16] valptr, in ref short[256] codes, in ref ubyte[16] num_values) pure {
+  // per code length: first/last code and index of the first symbol; unused lengths get -1 in all three tables
+  int first = 0; // index of the first code of the current length
+  foreach (immutable len, immutable count; num_values) {
+    if (count == 0) {
+      mincode[len] = maxcode[len] = valptr[len] = -1;
+      continue;
+    }
+    immutable int last = first+count-1;
+    valptr[len] = cast(short)first;
+    mincode[len] = codes[first];
+    maxcode[len] = codes[last];
+    first = last+1;
+  }
+}
+
+
+// DQT -- define quantization tables
+void read_quantization_tables (ref JPEG_Decoder dc) {
+  import std.bitmanip : bigEndianToNative;
+  // a DQT segment is a 2-byte length followed by whole 65-byte records (info byte + 64 entries)
+  ubyte[2] lenbuf = void;
+  dc.stream.rawReadExact(lenbuf[]);
+  immutable int seglen = bigEndianToNative!ushort(lenbuf);
+  if (seglen%65 != 2) throw new ImageIOException("invalid / not supported");
+  foreach (immutable _; 0..(seglen-2)/65) {
+    ubyte[1] info = void;
+    dc.stream.rawReadExact(info[]);
+    immutable uint slot = info[0]&0xf; // destination table, 0..3
+    immutable uint prec = info[0]>>4;  // 0 = 8 bit entries, 1 = 16 bit entries
+    // only 8 bit tables are accepted (baseline)
+    if (slot > 3 || prec != 0) throw new ImageIOException("invalid / not supported");
+    dc.stream.rawReadExact(dc.qtables[slot][0..64]);
+  }
+}
+
+
+// SOF0 -- start of frame: reads image size and per-component sampling info, derives the MCU grid
+void read_frame_header (ref JPEG_Decoder dc) {
+  import std.bitmanip : bigEndianToNative;
+  ubyte[9] tmp = void;
+  dc.stream.rawReadExact(tmp[0..8]);
+  int len = bigEndianToNative!ushort(tmp[0..2]);  // 8+num_comps*3
+  ubyte precision = tmp[2];
+  dc.height = bigEndianToNative!ushort(tmp[3..5]);
+  dc.width = bigEndianToNative!ushort(tmp[5..7]);
+  dc.num_comps = tmp[7];
+  if (precision != 8 || (dc.num_comps != 1 && dc.num_comps != 3) || len != 8+dc.num_comps*3) throw new ImageIOException("invalid / not supported");
+  if (dc.width == 0 || dc.height == 0) throw new ImageIOException("invalid / not supported"); // nothing to decode; the chroma upsamplers also index sample 0 of every row unconditionally
+  dc.hmax = dc.vmax = 0;
+  int mcu_du = 0; // data units in one mcu
+  dc.stream.rawReadExact(tmp[0..dc.num_comps*3]); // 3 bytes per component: id, sampling factors, qtable index
+  foreach (immutable i; 0..dc.num_comps) {
+    ubyte ci = tmp[i*3];
+    // JFIF says ci should be i+1, but there are images where ci is i. Normalize ids
+    // so that ci == i, always. So much for standards...
+    if (i == 0) { dc.correct_comp_ids = ci == i+1; }
+    if ((dc.correct_comp_ids && ci != i+1) || (!dc.correct_comp_ids && ci != i)) throw new ImageIOException("invalid component id");
+    auto comp = &dc.comps[i];
+    ubyte sampling_factors = tmp[i*3+1];
+    comp.sfx = sampling_factors>>4;
+    comp.sfy = sampling_factors&0xf;
+    comp.qtable = tmp[i*3+2];
+    if (comp.sfy < 1 || 4 < comp.sfy || comp.sfx < 1 || 4 < comp.sfx || 3 < comp.qtable) throw new ImageIOException("invalid / not supported");
+    if (dc.hmax < comp.sfx) dc.hmax = comp.sfx;
+    if (dc.vmax < comp.sfy) dc.vmax = comp.sfy;
+    mcu_du += comp.sfx*comp.sfy;
+  }
+  if (10 < mcu_du) throw new ImageIOException("invalid / not supported");
+  foreach (immutable i; 0..dc.num_comps) { // real sample counts per component, rounded up
+    import std.math : ceil;
+    dc.comps[i].x = cast(size_t)ceil(dc.width*(cast(double)dc.comps[i].sfx/dc.hmax));
+    dc.comps[i].y = cast(size_t)ceil(dc.height*(cast(double)dc.comps[i].sfy/dc.vmax));
+    debug(DebugJPEG) writefln("%d comp %d sfx/sfy: %d/%d", i, tmp[i*3], dc.comps[i].sfx, dc.comps[i].sfy);
+  }
+  size_t mcu_w = dc.hmax*8; // an MCU spans hmax x vmax data units of 8x8 samples
+  size_t mcu_h = dc.vmax*8;
+  dc.num_mcu_x = cast(int)((dc.width+mcu_w-1)/mcu_w);
+  dc.num_mcu_y = cast(int)((dc.height+mcu_h-1)/mcu_h);
+  debug(DebugJPEG) {
+    writefln("\tlen: %s", len);
+    writefln("\tprecision: %s", precision);
+    writefln("\tdimensions: %s x %s", dc.width, dc.height);
+    writefln("\tnum_comps: %s", dc.num_comps);
+    writefln("\tnum_mcu_x: %s", dc.num_mcu_x);
+    writefln("\tnum_mcu_y: %s", dc.num_mcu_y);
+  }
+}
+
+
+// SOS -- start of scan
+void read_scan_header (ref JPEG_Decoder dc) { // parses the SOS segment and assigns DC/AC huffman table slots to the components
+  import std.bitmanip : bigEndianToNative;
+  import core.stdc.stdlib : alloca;
+  ubyte[3] tmp = void;
+  dc.stream.rawReadExact(tmp[]);
+  ushort len = bigEndianToNative!ushort(tmp[0..2]);
+  ubyte num_scan_comps = tmp[2];
+  if (num_scan_comps != dc.num_comps || len != 6+num_scan_comps*2) throw new ImageIOException("invalid / not supported"); // the scan must contain every frame component
+  auto buf = (cast(ubyte*)alloca((len-3)*ubyte.sizeof))[0..len-3]; // remainder of the segment: 2 bytes per component plus the 3 bytes ignored below
+  dc.stream.rawReadExact(buf[]);
+  foreach (immutable i; 0..num_scan_comps) {
+    uint ci = buf[i*2]-((dc.correct_comp_ids) ? 1 : 0); // zero-based component index; wraps to a huge value (rejected below) when the id is too small
+    if (ci >= dc.num_comps) throw new ImageIOException("invalid component id");
+    ubyte tables = buf[i*2+1];
+    dc.comps[ci].dc_table = tables>>4;
+    dc.comps[ci].ac_table = tables&0xf;
+    if (1 < dc.comps[ci].dc_table || 1 < dc.comps[ci].ac_table) throw new ImageIOException("invalid / not supported");
+  }
+  // ignore these
+  //ubyte spectral_start = buf[$-3];
+  //ubyte spectral_end = buf[$-2];
+  //ubyte approx = buf[$-1];
+}
+
+
+void read_restart_interval (ref JPEG_Decoder dc) {
+  import std.bitmanip : bigEndianToNative;
+  // DRI segment: 2-byte length (must be 4) followed by the 2-byte number of MCUs per restart interval
+  ubyte[4] seg = void;
+  dc.stream.rawReadExact(seg[]);
+  if (bigEndianToNative!ushort(seg[0..2]) != 4) throw new ImageIOException("invalid / not supported");
+  dc.restart_interval = bigEndianToNative!ushort(seg[2..4]);
+  debug(DebugJPEG) writeln("restart interval set to: ", dc.restart_interval);
+}
+
+
+// reads data after the SOS segment
+ubyte[] decode_jpeg (ref JPEG_Decoder dc) {
+  // every component gets a plane covering whole MCUs, fill samples included
+  foreach (immutable ci; 0..dc.num_comps) {
+    auto comp = &dc.comps[ci];
+    comp.data = new ubyte[dc.num_mcu_x*comp.sfx*8*dc.num_mcu_y*comp.sfy*8];
+  }
+  // only one scan is decoded (E.2.3); multiple scans (E.7) are for progressive images, which are not supported
+  dc.decode_scan();
+  return dc.reconstruct(); // throws away fill samples and converts to the target format
+}
+
+
+// E.2.3 and E.8 and E.9
+void decode_scan (ref JPEG_Decoder dc) { // decodes every MCU of the scan into the component planes, handling restart markers
+  debug(DebugJPEG) writeln("decode scan...");
+  int intervals, mcus; // restart intervals still to decode / MCUs left in the current interval
+  if (0 < dc.restart_interval) {
+    int total_mcus = dc.num_mcu_x*dc.num_mcu_y;
+    intervals = (total_mcus+dc.restart_interval-1)/dc.restart_interval;
+    mcus = dc.restart_interval;
+  } else {
+    intervals = 1; // without DRI the whole scan is a single interval
+    mcus = dc.num_mcu_x*dc.num_mcu_y;
+  }
+  debug(DebugJPEG) writeln("intervals: ", intervals);
+  foreach (immutable mcu_j; 0..dc.num_mcu_y) {
+    foreach (immutable mcu_i; 0..dc.num_mcu_x) {
+      // decode mcu
+      foreach (immutable c; 0..dc.num_comps) {
+        auto comp = &dc.comps[c];
+        foreach (immutable du_j; 0..comp.sfy) {
+          foreach (immutable du_i; 0..comp.sfx) {
+            // decode entropy, dequantize & dezigzag
+            short[64] data = decode_block(dc, *comp, dc.qtables[comp.qtable]);
+            // idct & level-shift
+            int outx = (mcu_i*comp.sfx+du_i)*8; // top-left sample of this data unit inside the component plane
+            int outy = (mcu_j*comp.sfy+du_j)*8;
+            int dst_stride = dc.num_mcu_x*comp.sfx*8;
+            ubyte* dst = comp.data.ptr+outy*dst_stride+outx;
+            stbi__idct_block(dst, dst_stride, data);
+          }
+        }
+      }
+      --mcus;
+      if (!mcus) {
+        --intervals;
+        if (!intervals) return;
+        read_restart(dc.stream); // RSTx marker
+        if (intervals == 1) {
+          // last interval, may have fewer MCUs than defined by DRI
+          mcus = (dc.num_mcu_y-mcu_j-1)*dc.num_mcu_x+dc.num_mcu_x-mcu_i-1;
+        } else {
+          mcus = dc.restart_interval;
+        }
+        // reset decoder
+        dc.cb = 0;
+        dc.bits_left = 0; // drops the unused bits of the byte before the marker
+        foreach (immutable k; 0..dc.num_comps) dc.comps[k].pred = 0;
+      }
+    }
+  }
+}
+
+
+// RST0-RST7
+void read_restart (VFile stream) {
+  // consumes one two-byte marker and insists on RST0..RST7; the 0..7 cycling order is not verified
+  ubyte[2] mk = void;
+  stream.rawReadExact(mk[]);
+  if (!(mk[0] == 0xff && mk[1] >= Marker.RST0 && mk[1] <= Marker.RST7)) throw new ImageIOException("reset marker missing");
+}
+
+
+immutable ubyte[64] dezigzag = [ // dezigzag[k] is the index inside the 8x8 block of the k-th coefficient decoded
+     0,  1,  8, 16,  9,  2,  3, 10,
+    17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63,
+];
+
+
+// decode entropy, dequantize & dezigzag (see section F.2); returns the 64 dequantized coefficients of one data unit
+short[64] decode_block (ref JPEG_Decoder dc, ref JPEG_Decoder.Component comp, in ref ubyte[64] qtable) {
+  short[64] res = 0;
+  ubyte t = decode_huff(dc, dc.dc_tables[comp.dc_table]); // bit size of the DC difference
+  // the value comes straight from the file's huffman table; larger sizes can only be corruption and would lead to shifts past the int width
+  if (15 < t) throw new ImageIOException("corrupt block");
+  int diff = t ? dc.receive_and_extend(t) : 0;
+  comp.pred = comp.pred+diff; // DC is coded as the difference to the previous block of this component
+  res[0] = cast(short)(comp.pred*qtable[0]);
+  for (int k = 1; k < 64; ++k) { // k is the zigzag position of the next AC coefficient
+    ubyte rs = decode_huff(dc, dc.ac_tables[comp.ac_table]);
+    ubyte rrrr = rs>>4;  // number of zero coefficients to skip
+    ubyte ssss = rs&0xf; // bit size of the coefficient that follows
+    if (ssss == 0) {
+      if (rrrr != 0xf) break; // end of block
+      k += 15; // run length is 16; the loop increment supplies the last step
+      continue;
+    }
+    k += rrrr;
+    if (63 < k) throw new ImageIOException("corrupt block");
+    res[dezigzag[k]] = cast(short)(dc.receive_and_extend(ssss)*qtable[k]);
+  }
+  return res;
+}
+
+
+int receive_and_extend (ref JPEG_Decoder dc, ubyte s) {
+  // reads an `s`-bit magnitude from the stream and converts it to a signed value
+  // RECEIVE: pull the bits in, most significant first
+  int bits = 0;
+  for (int n = s; n > 0; --n) bits = (bits<<1)+nextbit(dc);
+  // EXTEND: anything below 2^(s-1) stands for a negative number
+  immutable int half = 1<<(s-1);
+  return (bits < half ? bits+(-1<<s)+1 : bits);
+}
+
+
+// F.16 -- the DECODE
+ubyte decode_huff (ref JPEG_Decoder dc, in ref HuffTab tab) {
+  // extend the code one bit at a time until it no longer exceeds the largest code of its length
+  short code = nextbit(dc);
+  int len = 0; // code length minus one
+  while (code > tab.maxcode[len]) {
+    code = cast(short)((code<<1)+nextbit(dc));
+    if (++len >= tab.maxcode.length) throw new ImageIOException("corrupt huffman coding");
+  }
+  immutable int idx = tab.valptr[len]+code-tab.mincode[len]; // offset of the code within its length group
+  if (cast(uint)idx >= tab.values.length) throw new ImageIOException("corrupt huffman coding");
+  return tab.values[idx];
+}
+
+
+// F.2.2.5 and F.18
+ubyte nextbit (ref JPEG_Decoder dc) {
+  if (dc.bits_left == 0) { // bit buffer is empty: fetch the next byte
+    ubyte[1] one = void;
+    dc.stream.rawReadExact(one[]);
+    dc.cb = one[0];
+    dc.bits_left = 8;
+    if (one[0] == 0xff) { // a data byte of 0xff must be followed by a zero byte, which is dropped
+      dc.stream.rawReadExact(one[]);
+      if (one[0] != 0) throw new ImageIOException("unexpected marker");
+    }
+  }
+  immutable ubyte bit = dc.cb>>7; // the next bit always sits in the MSB
+  dc.cb <<= 1;
+  --dc.bits_left;
+  return bit;
+}
+
+
+ubyte[] reconstruct (in ref JPEG_Decoder dc) { // builds the output pixel buffer (width*height*tgt_chans bytes) from the component planes
+  auto result = new ubyte[dc.width*dc.height*dc.tgt_chans];
+  switch (dc.num_comps*10+dc.tgt_chans) { // two-digit key: source components, then target channels
+    case 34, 33:
+      // Use specialized bilinear filtering functions for the frequent cases where
+      // Cb & Cr channels have half resolution.
+      if ((dc.comps[0].sfx <= 2 && dc.comps[0].sfy <= 2) &&
+         (dc.comps[0].sfx+dc.comps[0].sfy >= 3) &&
+         dc.comps[1].sfx == 1 && dc.comps[1].sfy == 1 &&
+         dc.comps[2].sfx == 1 && dc.comps[2].sfy == 1) {
+          void function (in ubyte[], in ubyte[], ubyte[]) resample;
+          switch (dc.comps[0].sfx*10+dc.comps[0].sfy) {
+            case 22: resample = &upsample_h2_v2; break;
+            case 21: resample = &upsample_h2_v1; break;
+            case 12: resample = &upsample_h1_v2; break;
+            default: throw new ImageIOException("bug");
+          }
+          auto comp1 = new ubyte[](dc.width); // one upsampled row of component 1
+          auto comp2 = new ubyte[](dc.width); // one upsampled row of component 2
+          size_t s = 0; // toggles 0/1 on every output row
+          size_t di = 0;
+          foreach (immutable j; 0..dc.height) {
+            size_t mi = j/dc.comps[0].sfy; // chroma row holding this output row
+            size_t si = (mi == 0 || mi >= (dc.height-1)/dc.comps[0].sfy ? mi : mi-1+s*2); // second chroma row to blend with: previous or next one, or mi itself at the top/bottom
+            s ^= 1;
+            size_t cs = dc.num_mcu_x*dc.comps[1].sfx*8; // chroma plane stride
+            size_t cl0 = mi*cs;
+            size_t cl1 = si*cs;
+            resample(dc.comps[1].data[cl0..cl0+dc.comps[1].x], dc.comps[1].data[cl1..cl1+dc.comps[1].x], comp1[]);
+            resample(dc.comps[2].data[cl0..cl0+dc.comps[2].x], dc.comps[2].data[cl1..cl1+dc.comps[2].x], comp2[]);
+            foreach (immutable i; 0..dc.width) {
+              result[di..di+3] = ycbcr_to_rgb(dc.comps[0].data[j*dc.num_mcu_x*dc.comps[0].sfx*8+i], comp1[i], comp2[i]);
+              if (dc.tgt_chans == 4) result[di+3] = 255;
+              di += dc.tgt_chans;
+            }
+          }
+          return result;
+      }
+      foreach (const ref comp; dc.comps[0..dc.num_comps]) { // any other subsampling layout goes through the generic upsampler
+        if (comp.sfx != dc.hmax || comp.sfy != dc.vmax) return dc.upsample(result);
+      }
+      size_t si, di; // no subsampling at all: convert sample by sample
+      foreach (immutable j; 0..dc.height) {
+        foreach (immutable i; 0..dc.width) {
+          result[di..di+3] = ycbcr_to_rgb(dc.comps[0].data[si+i], dc.comps[1].data[si+i], dc.comps[2].data[si+i]);
+          if (dc.tgt_chans == 4) result[di+3] = 255;
+          di += dc.tgt_chans;
+        }
+        si += dc.num_mcu_x*dc.comps[0].sfx*8;
+      }
+      return result;
+    case 32, 12, 31, 11: // one or two target channels: only component 0 is used
+      const comp = &dc.comps[0];
+      if (comp.sfx == dc.hmax && comp.sfy == dc.vmax) {
+        size_t si, di;
+        if (dc.tgt_chans == 2) {
+          foreach (immutable j; 0..dc.height) {
+            foreach (immutable i; 0..dc.width) {
+              result[di++] = comp.data[si+i];
+              result[di++] = 255;
+            }
+            si += dc.num_mcu_x*comp.sfx*8;
+          }
+        } else {
+          foreach (immutable j; 0..dc.height) {
+            result[di..di+dc.width] = comp.data[si..si+dc.width];
+            si += dc.num_mcu_x*comp.sfx*8;
+            di += dc.width;
+          }
+        }
+        return result;
+      }
+      // need to resample (haven't tested this...)
+      return dc.upsample_luma(result);
+    case 14, 13: // single component expanded to three equal channels
+      const comp = &dc.comps[0];
+      size_t si, di;
+      foreach (immutable j; 0..dc.height) {
+        foreach (immutable i; 0..dc.width) {
+          result[di..di+3] = comp.data[si+i];
+          if (dc.tgt_chans == 4) result[di+3] = 255;
+          di += dc.tgt_chans;
+        }
+        si += dc.num_mcu_x*comp.sfx*8;
+      }
+      return result;
+    default: assert(0);
+  }
+}
+
+
+void upsample_h2_v2(in ubyte[] line0, in ubyte[] line1, ubyte[] result) {
+  ubyte mix() (ubyte mm, ubyte ms, ubyte sm, ubyte ss) { // 9:3:3:1 blend of four neighbouring samples, rounded
+    pragma(inline, true);
+    return cast(ubyte)((cast(uint)mm*3*3+cast(uint)ms*3*1+cast(uint)sm*1*3+cast(uint)ss*1*1+8)/16);
+  }
+  // doubles a chroma row horizontally; line0 is weighted 3:1 over line1, edge outputs blend vertically only
+  result[0] = cast(ubyte)((cast(uint)line0[0]*3+cast(uint)line1[0]*1+2)/4);
+  if (line0.length == 1) { if (1 < result.length) result[1] = result[0]; return; } // lone sample: a two-pixel-wide row repeats it instead of leaving result[1] unset
+  result[1] = mix(line0[0], line0[1], line1[0], line1[1]);
+
+  size_t di = 2;
+  foreach (immutable i; 1..line0.length) {
+    result[di] = mix(line0[i], line0[i-1], line1[i], line1[i-1]); // left output leans towards the previous sample
+    di += 1;
+    if (i == line0.length-1) {
+      if (di < result.length) result[di] = cast(ubyte)((cast(uint)line0[i]*3+cast(uint)line1[i]*1+2)/4); // right edge, absent when the row width is odd
+      return;
+    }
+    result[di] = mix(line0[i], line0[i+1], line1[i], line1[i+1]); // right output leans towards the next sample
+    di += 1;
+  }
+}
+
+
+void upsample_h2_v1 (in ubyte[] line0, in ubyte[] _line1, ubyte[] result) { // doubles a chroma row horizontally with 3:1 blending; _line1 is not used
+  result[0] = line0[0];
+  if (line0.length == 1) { if (1 < result.length) result[1] = line0[0]; return; } // lone sample: a two-pixel-wide row repeats it instead of leaving result[1] unset
+  result[1] = cast(ubyte)((cast(uint)line0[0]*3+cast(uint)line0[1]*1+2)/4);
+  size_t di = 2;
+  foreach (immutable i; 1..line0.length) {
+    result[di] = cast(ubyte)((cast(uint)line0[i-1]*1+cast(uint)line0[i+0]*3+2)/4); // left output leans towards the previous sample
+    di += 1;
+    if (i == line0.length-1) {
+      if (di < result.length) result[di] = line0[i]; // right edge, absent when the row width is odd
+      return;
+    }
+    result[di] = cast(ubyte)((cast(uint)line0[i+0]*3+cast(uint)line0[i+1]*1+2)/4); // right output leans towards the next sample
+    di += 1;
+  }
+}
+
+
+void upsample_h1_v2 (in ubyte[] line0, in ubyte[] line1, ubyte[] result) {
+  foreach (immutable x, ref dst; result) dst = cast(ubyte)((3u*line0[x]+line1[x]+2u)/4u); // vertical-only 3:1 blend with rounding
+}
+
+
+// Nearest neighbor
+ubyte[] upsample_luma (in ref JPEG_Decoder dc, ubyte[] result) { // writes component 0, stretched to full image size, into result and returns it
+  const size_t stride0 = dc.num_mcu_x*dc.comps[0].sfx*8;
+  const y_step0 = cast(float)dc.comps[0].sfy/cast(float)dc.vmax; // source rows advanced per output row
+  const x_step0 = cast(float)dc.comps[0].sfx/cast(float)dc.hmax; // source samples advanced per output pixel
+  float y0 = y_step0*0.5;
+  size_t y0i = 0; // offset of the current source row
+  size_t di;
+  foreach (immutable j; 0..dc.height) {
+    float x0 = x_step0*0.5;
+    size_t x0i = 0; // current source column
+    foreach (immutable i; 0..dc.width) {
+      result[di] = dc.comps[0].data[y0i+x0i];
+      if (dc.tgt_chans == 2) result[di+1] = 255;
+      di += dc.tgt_chans;
+      x0 += x_step0;
+      if (x0 >= 1.0) { x0 -= 1.0; x0i += 1; } // the fractional accumulator overflowed: move to the next source sample
+    }
+    y0 += y_step0;
+    if (y0 >= 1.0) { y0 -= 1.0; y0i += stride0; }
+  }
+  return result;
+}
+
+
+// Nearest neighbor
+ubyte[] upsample (in ref JPEG_Decoder dc, ubyte[] result) { // stretches all three components to full size, converts to RGB into result and returns it
+  const size_t stride0 = dc.num_mcu_x*dc.comps[0].sfx*8;
+  const size_t stride1 = dc.num_mcu_x*dc.comps[1].sfx*8;
+  const size_t stride2 = dc.num_mcu_x*dc.comps[2].sfx*8;
+  const y_step0 = cast(float)dc.comps[0].sfy/cast(float)dc.vmax; // source rows advanced per output row, per component
+  const y_step1 = cast(float)dc.comps[1].sfy/cast(float)dc.vmax;
+  const y_step2 = cast(float)dc.comps[2].sfy/cast(float)dc.vmax;
+  const x_step0 = cast(float)dc.comps[0].sfx/cast(float)dc.hmax; // source samples advanced per output pixel, per component
+  const x_step1 = cast(float)dc.comps[1].sfx/cast(float)dc.hmax;
+  const x_step2 = cast(float)dc.comps[2].sfx/cast(float)dc.hmax;
+  float y0 = y_step0*0.5;
+  float y1 = y_step1*0.5;
+  float y2 = y_step2*0.5;
+  size_t y0i = 0; // offsets of the current source row in each plane
+  size_t y1i = 0;
+  size_t y2i = 0;
+  size_t di;
+  foreach (immutable _j; 0..dc.height) {
+    float x0 = x_step0*0.5;
+    float x1 = x_step1*0.5;
+    float x2 = x_step2*0.5;
+    size_t x0i = 0; // current source column in each plane
+    size_t x1i = 0;
+    size_t x2i = 0;
+    foreach (immutable i; 0..dc.width) {
+      result[di..di+3] = ycbcr_to_rgb(dc.comps[0].data[y0i+x0i], dc.comps[1].data[y1i+x1i], dc.comps[2].data[y2i+x2i]);
+      if (dc.tgt_chans == 4) result[di+3] = 255;
+      di += dc.tgt_chans;
+      x0 += x_step0;
+      x1 += x_step1;
+      x2 += x_step2;
+      if (x0 >= 1.0) { x0 -= 1.0; x0i += 1; } // an accumulator overflowed: move that plane to its next source sample
+      if (x1 >= 1.0) { x1 -= 1.0; x1i += 1; }
+      if (x2 >= 1.0) { x2 -= 1.0; x2i += 1; }
+    }
+    y0 += y_step0;
+    y1 += y_step1;
+    y2 += y_step2;
+    if (y0 >= 1.0) { y0 -= 1.0; y0i += stride0; }
+    if (y1 >= 1.0) { y1 -= 1.0; y1i += stride1; }
+    if (y2 >= 1.0) { y2 -= 1.0; y2i += stride2; }
+  }
+  return result;
+}
+
+
+ubyte[3] ycbcr_to_rgb (ubyte y, ubyte cb, ubyte cr) pure {
+  // chroma samples are centred on 128; remove the bias once and reuse the centred values
+  immutable int u = cb-128;
+  immutable int v = cr-128;
+  ubyte[3] rgb = [clamp(y+1.402*v), clamp(y-0.34414*u-0.71414*v), clamp(y+1.772*u)];
+  return rgb;
+}
+
+
+ubyte clamp() (float x) pure {
+  // saturate to 0..255; values inside the range are truncated by the cast
+  immutable bool below = (x < 0), above = (255 < x);
+  return (below ? ubyte.min : above ? ubyte.max : cast(ubyte)x);
+}
+
+
+// ------------------------------------------------------------
+// The IDCT stuff here (to the next dashed line) is copied and adapted from
+// stb_image which is released under public domain.  Many thanks to stb_image
+// author, Sean Barrett.
+// Link: https://github.com/nothings/stb/blob/master/stb_image.h
+int f2f() (float x) pure { pragma(inline, true); return cast(int)(x*4096+0.5); } // float constant -> integer scaled by 4096 (1<<12), with 0.5 added before the cast
+int fsh() (int x) pure { pragma(inline, true); return x<<12; } // integer -> the same 1<<12 scale used by f2f
+
+// from stb_image, derived from jidctint -- DCT_ISLOW
+void STBI__IDCT_1D() (ref int t0, ref int t1, ref int t2, ref int t3,
+                      ref int x0, ref int x1, ref int x2, ref int x3,
+                      int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) pure
+{ // one 8-point pass over s0..s7; callers combine the results as x[n]+t[3-n] and x[n]-t[3-n]
+  int p1, p2, p3, p4, p5;
+  //int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3;
+  p2 = s2; // even part: inputs 0, 2, 4, 6 produce x0..x3
+  p3 = s6;
+  p1 = (p2+p3)*f2f(0.5411961f);
+  t2 = p1+p3*f2f(-1.847759065f);
+  t3 = p1+p2*f2f(0.765366865f);
+  p2 = s0;
+  p3 = s4;
+  t0 = fsh(p2+p3);
+  t1 = fsh(p2-p3);
+  x0 = t0+t3;
+  x3 = t0-t3;
+  x1 = t1+t2;
+  x2 = t1-t2;
+  t0 = s7; // odd part: inputs 1, 3, 5, 7 produce t0..t3
+  t1 = s5;
+  t2 = s3;
+  t3 = s1;
+  p3 = t0+t2;
+  p4 = t1+t3;
+  p1 = t0+t3;
+  p2 = t1+t2;
+  p5 = (p3+p4)*f2f(1.175875602f);
+  t0 = t0*f2f(0.298631336f);
+  t1 = t1*f2f(2.053119869f);
+  t2 = t2*f2f(3.072711026f);
+  t3 = t3*f2f(1.501321110f);
+  p1 = p5+p1*f2f(-0.899976223f);
+  p2 = p5+p2*f2f(-2.562915447f);
+  p3 = p3*f2f(-1.961570560f);
+  p4 = p4*f2f(-0.390180644f);
+  t3 += p1+p4;
+  t2 += p2+p3;
+  t1 += p2+p4;
+  t0 += p1+p3;
+}
+
+// idct and level-shift
+void stbi__idct_block (ubyte* dst, int dst_stride, in ref short[64] data) pure { // writes the 8x8 block rebuilt from data to dst, one row every dst_stride bytes
+  int i;
+  int[64] val; // intermediate result of the column pass
+  int* v = val.ptr;
+  const(short)* d = data.ptr;
+  // columns
+  for (i = 0; i < 8; ++i, ++d, ++v) {
+    // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+    if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && d[48] == 0 && d[56] == 0) {
+      //    no shortcut                 0     seconds
+      //    (1|2|3|4|5|6|7)==0          0     seconds
+      //    all separate               -0.047 seconds
+      //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+      int dcterm = d[0]<<2; // same 2 extra bits of precision as the full path below
+      v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+    } else {
+      int t0, t1, t2, t3, x0, x1, x2, x3;
+      STBI__IDCT_1D(t0, t1, t2, t3, x0, x1, x2, x3, d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56]);
+      // constants scaled things up by 1<<12; let's bring them back
+      // down, but keep 2 extra bits of precision
+      x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+      v[ 0] = (x0+t3)>>10;
+      v[56] = (x0-t3)>>10;
+      v[ 8] = (x1+t2)>>10;
+      v[48] = (x1-t2)>>10;
+      v[16] = (x2+t1)>>10;
+      v[40] = (x2-t1)>>10;
+      v[24] = (x3+t0)>>10;
+      v[32] = (x3-t0)>>10;
+    }
+  }
+
+  ubyte* o = dst;
+  for (i = 0, v = val.ptr; i < 8; ++i, v += 8, o += dst_stride) { // rows
+    // no fast case since the first 1D IDCT spread components out
+    int t0, t1, t2, t3, x0, x1, x2, x3;
+    STBI__IDCT_1D(t0, t1, t2, t3, x0, x1, x2, x3, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+    // constants scaled things up by 1<<12, plus we had 1<<2 from first
+    // loop, plus horizontal and vertical each scale by sqrt(8) so together
+    // we've got an extra 1<<3, so 1<<17 total we need to remove.
+    // so we want to round that, which means adding 0.5*1<<17,
+    // aka 65536. Also, we'll end up with -128 to 127 that we want
+    // to encode as 0-255 by adding 128, so we'll add that before the shift
+    x0 += 65536+(128<<17);
+    x1 += 65536+(128<<17);
+    x2 += 65536+(128<<17);
+    x3 += 65536+(128<<17);
+    // tried computing the shifts into temps, or'ing the temps to see
+    // if any were out of range, but that was slower
+    o[0] = stbi__clamp((x0+t3)>>17);
+    o[7] = stbi__clamp((x0-t3)>>17);
+    o[1] = stbi__clamp((x1+t2)>>17);
+    o[6] = stbi__clamp((x1-t2)>>17);
+    o[2] = stbi__clamp((x2+t1)>>17);
+    o[5] = stbi__clamp((x2-t1)>>17);
+    o[3] = stbi__clamp((x3+t0)>>17);
+    o[4] = stbi__clamp((x3-t0)>>17);
+  }
+}
+
+// clamp to 0-255
+ubyte stbi__clamp() (int x) pure {
+  // a single unsigned comparison handles the common in-range case:
+  // negative inputs turn into huge unsigned values and fail it too
+  if (cast(uint)x <= 255) return cast(ubyte)x;
+  // out of range: negatives saturate to 0, everything else to 255
+  return (x < 0 ? 0 : 255);
+}
+// the above is adapted from stb_image
+// ------------------------------------------------------------
-- 
2.11.4.GIT